In [2]:
import pandas as pd
from typing import Tuple
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
import seaborn as sns
path = "diabetes.csv"

In [23]:
def load_csv(path : str) -> Tuple[pd.core.frame.DataFrame, pd.core.frame.DataFrame]:
    dataset = pd.read_csv(path)
    
    dataset = dataset.sample(frac=1, random_state=42)
    
    x = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]
    
    return x, y

In [24]:
x, y = load_csv(path)
x, y

(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 668            6       98             58             33      190  34.0   
 324            2      112             75             32        0  35.7   
 624            2      108             64              0        0  30.8   
 690            8      107             80              0        0  24.6   
 473            7      136             90              0        0  29.9   
 ..           ...      ...            ...            ...      ...   ...   
 71             5      139             64             35      140  28.6   
 106            1       96            122              0        0  22.4   
 270           10      101             86             37        0  45.6   
 435            0      141              0              0        0  42.4   
 102            0      125             96              0        0  22.5   
 
      DiabetesPedigreeFunction  Age  
 668                     0.430   43  
 324                  

In [26]:
x.mean(axis = 0, skipna=False), x.var(axis = 0, skipna=False)

(Pregnancies                   3.845052
 Glucose                     120.894531
 BloodPressure                69.105469
 SkinThickness                20.536458
 Insulin                      79.799479
 BMI                          31.992578
 DiabetesPedigreeFunction      0.471876
 Age                          33.240885
 dtype: float64,
 Pregnancies                    11.354056
 Glucose                      1022.248314
 BloodPressure                 374.647271
 SkinThickness                 254.473245
 Insulin                     13281.180078
 BMI                            62.159984
 DiabetesPedigreeFunction        0.109779
 Age                           138.303046
 dtype: float64)

In [27]:
x.mean(axis = 0), x.var(axis = 0)

(Pregnancies                   3.845052
 Glucose                     120.894531
 BloodPressure                69.105469
 SkinThickness                20.536458
 Insulin                      79.799479
 BMI                          31.992578
 DiabetesPedigreeFunction      0.471876
 Age                          33.240885
 dtype: float64,
 Pregnancies                    11.354056
 Glucose                      1022.248314
 BloodPressure                 374.647271
 SkinThickness                 254.473245
 Insulin                     13281.180078
 BMI                            62.159984
 DiabetesPedigreeFunction        0.109779
 Age                           138.303046
 dtype: float64)

In [30]:
def train_test_split(features, labels, test_split_ratio: float):
    #test_size = int(len(features) * test_split_ratio)
    #train_size = len(features) - test_size
    #assert len(features) == test_size + train_size, "Size missmatch!!!"

    #x_train, y_train = features[:train_size, :], labels[:train_size]
    #x_test, y_test = features[train_size:, :], labels[train_size:]
    #return(x_train, y_train, x_test, y_test)

    test_size = int(len(features) * test_split_ratio)
    train_size = len(features) - test_size

    assert len(features) == test_size + train_size, "Size missmatch!!!"
    
    x_train, x_test = features.iloc[:-train_size], features.iloc[-train_size:]
    y_train, y_test = labels.iloc[:-train_size], labels.iloc[-train_size:]
    
    return x_train, y_train, x_test, y_test

In [31]:
x_train, y_train, x_test, y_test = train_test_split(x, y, 0.2)
x_train, y_train, x_test, y_test

(     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 668            6       98             58             33      190  34.0   
 324            2      112             75             32        0  35.7   
 624            2      108             64              0        0  30.8   
 690            8      107             80              0        0  24.6   
 473            7      136             90              0        0  29.9   
 ..           ...      ...            ...            ...      ...   ...   
 725            4      112             78             40        0  39.4   
 355            9      165             88              0        0  30.4   
 534            1       77             56             30       56  33.3   
 344            8       95             72              0        0  36.8   
 296            2      146             70             38      360  28.0   
 
      DiabetesPedigreeFunction  Age  
 668                     0.430   43  
 324                  