Below are two cells from a Kaggle notebook: https://www.kaggle.com/code/scratchpad/notebooka341afdf51/edit

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # for data visualization
import matplotlib.pyplot as plt # to plot charts
from collections import Counter
import os

In [2]:
# Modeling
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split

# Loading the Pima Indians Diabetes dataset


In [3]:
# Load the raw Pima diabetes CSV from the ../data folder into a DataFrame.
pima_diabetes = pd.read_csv('../data/diabetes.csv')

In [4]:
# Number of rows in the loaded dataset.
len(pima_diabetes)

768

# Data exploration and imputation (i.e., replacing zero values with the column median)

In [5]:
# Column names of the loaded frame.
pima_diabetes.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [6]:
# Value counts per column BEFORE imputation, to reveal zero placeholders
# (e.g. BloodPressure shows 35 zero entries in the output below).
for col in pima_diabetes.columns:
    print(pima_diabetes[col].value_counts())


1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64
99     17
100    17
111    14
129    14
125    14
       ..
191     1
177     1
44      1
62      1
190     1
Name: Glucose, Length: 136, dtype: int64
70     57
74     52
78     45
68     45
72     44
64     43
80     40
76     39
60     37
0      35
62     34
66     30
82     30
88     25
84     23
90     22
86     21
58     21
50     13
56     12
52     11
54     11
75      8
92      8
65      7
85      6
94      6
48      5
96      4
44      4
100     3
106     3
98      3
110     3
55      2
108     2
104     2
46      2
30      2
122     1
95      1
102     1
61      1
24      1
38      1
40      1
114     1
Name: BloodPressure, dtype: int64
0     227
32     31
30     27
27     23
23     22
33     20
28     20
18     20
31     19
19     18
39     18
29     17
40     16
25     16
26 

In [7]:
# Columns whose zero entries are replaced during imputation further down.
# NOTE(review): presumably a 0 in these measurements means "not recorded" -
# confirm against the dataset description.
non_zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'Age']

In [8]:
# How many columns will be imputed.
len(non_zero_columns)

6

In [None]:
# Replace zero placeholders with the median of the column's NON-ZERO values.
# Taking the median over the whole column would let the placeholder zeros
# themselves pull the imputed value down (and a column that is more than half
# zeros would have a median of 0, leaving its zeros in place).
for col in non_zero_columns:
    non_zero_median = pima_diabetes.loc[pima_diabetes[col] != 0, col].median()
    pima_diabetes[col] = pima_diabetes[col].replace(0, non_zero_median)

In [None]:
# Value counts per column AFTER imputation - the columns listed in
# non_zero_columns should no longer contain any zeroes.
for col in pima_diabetes.columns:
    print(pima_diabetes[col].value_counts())


In [None]:
# Drop, in place, every column whose name contains 'unnamed' (case-insensitive).
# NOTE(review): presumably these are index columns left over from an earlier
# CSV round-trip - confirm.
pima_diabetes.drop(pima_diabetes.columns[pima_diabetes.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)


In [None]:
# Persist the imputed frame. The index is written as the first CSV column
# (to_csv default); the reload cell reads it back with index_col=0.
pima_diabetes.to_csv('../data/diabetes_val_corrected.csv')

# Run this set of cells to generate the breakdown of splits across hospitals

In [None]:
# Reload the imputed dataset; index_col=0 uses the first CSV column as the
# row index instead of loading it as a data column.
pima_diabetes = pd.read_csv('../data/diabetes_val_corrected.csv', index_col=0)

In [None]:
# Column names of the reloaded frame.
pima_diabetes.columns


In [None]:
def get_across_hospital_split(dataset, no_hospital, random_state=None):
    '''
    Split dataset across hospitals evenly and randomly.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Records to distribute.
    no_hospital : int
        Number of hospitals (partitions) to create; must be > 0.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``. Pass an int to get the same
        split on every run; the default (None) shuffles differently each time.

    Returns
    -------
    dict
        Maps 'hospital1' ... 'hospital<no_hospital>' to a DataFrame holding
        that hospital's rows. Partition sizes differ by at most one row, with
        the larger partitions first.

    Raises
    ------
    ValueError
        Raised by ``np.array_split`` when no_hospital is not positive.
    '''
    # Shuffle every row; the original index labels travel with the rows.
    shuffled = dataset.sample(frac=1, random_state=random_state)

    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which is deprecated in
    # pandas 2.x.
    position_chunks = np.array_split(np.arange(len(shuffled)), no_hospital)

    # dictionary to hold records across each hospital, numbered from 1
    return {
        'hospital' + str(number): shuffled.iloc[positions]
        for number, positions in enumerate(position_chunks, start=1)
    }

In [None]:
def save_hospital_split_to_file(split_data, hospital_no):
    '''
    Write one hospital's records to ../data/<hospital_no>.csv.

    split_data  : object exposing to_csv(path), e.g. a pandas DataFrame
    hospital_no : string used as the file name (without extension)
    '''
    target_path = ''.join(['../data/', hospital_no, '.csv'])
    split_data.to_csv(target_path)

In [None]:
# Shuffle the full dataset and divide it into 5 hospital partitions.
pima_split = get_across_hospital_split(pima_diabetes, 5)

# Write each partition to ../data/<hospital name>.csv
for hospital in pima_split.keys():
    save_hospital_split_to_file(pima_split[hospital], hospital)

All data is written to the ../data folder. Per-hospital splits are named hospital<N>.csv (e.g. hospital1.csv); the full dataset is named diabetes.csv, and its zero-imputed version is named diabetes_val_corrected.csv.

# The code from here onwards is in MachineLearningComp - train models with different classifiers and report accuracy

Data Pre-processing

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
# Map every feature onto its empirical quantiles (uniform output in [0, 1]).
# n_quantiles is capped at the row count: the default of 1000 exceeds the
# size of this dataset and would only trigger a warning before being clipped.
q = QuantileTransformer(n_quantiles=min(1000, len(pima_diabetes)))

# Only the feature columns are scaled. 'Outcome' is the class label and is
# carried over unchanged instead of being pushed through the feature scaler.
feature_columns = pima_diabetes.columns.drop('Outcome')

# NOTE(review): the transformer is fitted on ALL rows before the train/test
# split, so test-set statistics leak into the features. Consider fitting on
# the training rows only.
X = q.fit_transform(pima_diabetes[feature_columns])

# Rebuild a labelled frame. The previous extra q.transform(X) call was dead
# code: its result was overwritten on the very next line.
transformedDF = pd.DataFrame(X, columns=feature_columns, index=pima_diabetes.index)
transformedDF['Outcome'] = pima_diabetes['Outcome']
# Keep the original column order.
transformedDF = transformedDF[pima_diabetes.columns]

In [None]:
# Display the untransformed data for comparison.
pima_diabetes

In [None]:
# Display the quantile-transformed data.
transformedDF

## Creating train and test datasets from the transformed data

In [None]:
from sklearn.metrics import classification_report

In [None]:
## Separate train dataset and test dataset
def generate_train_test_split(transformedDF, test_size):
    """
    Split the transformed data into train and test features/labels.

    Parameters
    ----------
    transformedDF : pandas.DataFrame
        Data containing the feature columns plus an "Outcome" label column.
    test_size : float or int
        Forwarded to sklearn's train_test_split (fraction or absolute number
        of test rows). Previously this argument was ignored and 0.30 was
        always used.

    Returns
    -------
    tuple
        (x_train, x_test, y_train, y_test)
    """
    features = transformedDF.drop(columns=["Outcome"])
    labels = transformedDF["Outcome"]
    # random_state is fixed so the same rows land in train/test on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=7
    )
    return x_train, x_test, y_train, y_test

In [None]:
def evaluate_model(models, x_train, y_train):
    """
    Cross-validate every model and chart the mean accuracy of each.

    models is a list of [name, estimator] entries. Each estimator is scored
    with 10-fold stratified cross validation on (x_train, y_train). A
    horizontal bar chart of the mean accuracies is drawn, and a DataFrame with
    the columns CrossValMeans, CrossValerrors (standard deviation across
    folds) and Models is returned.
    """
    # One stratified 10-fold splitter shared by all models
    splitter = StratifiedKFold(n_splits = 10)

    # Per-fold accuracy arrays, one per model, in input order
    fold_scores = [
        cross_val_score(estimator = entry[1], X = x_train, y = y_train,
                        scoring = "accuracy", cv = splitter, n_jobs=4)
        for entry in models
    ]

    summary = pd.DataFrame({
        "CrossValMeans": [scores.mean() for scores in fold_scores],
        "CrossValerrors": [scores.std() for scores in fold_scores],
        "Models": [entry[0] for entry in models],
    })

    # Generate chart
    chart = sns.barplot(x = "CrossValMeans", y = "Models", data = summary, orient = "h")
    chart.set_xlabel("Mean Accuracy")
    chart.set_title("Cross validation scores")
    return summary

In [None]:
def fit_and_predict_models(model, x_train, y_train, x_test, y_test):
    '''
    Train a model, test it and report F1, precision and recall on the test predictions.

    Parameters
    ----------
    model : sequence
        [name, estimator]; the estimator is fitted in place.
    x_train, y_train : training features and labels
    x_test, y_test : held-out features and labels used for the report

    Returns
    -------
    pandas.DataFrame
        The classification report, one row per entry (each class, 'accuracy',
        'macro avg', 'weighted avg'). The entry name is in the 'dimensions'
        column and the metrics are in the remaining columns.
    '''
    name, estimator = model[0], model[1]
    estimator.fit(x_train, y_train)
    y_pred_model = estimator.predict(x_test)
    print('Model : ' + name)
    report_dict = classification_report(y_test, y_pred_model, output_dict=True)
    # Convert the report to a DataFrame: transpose so each report entry is a
    # row, then move the entry names out of the index into 'dimensions'.
    # The former set_axis(..., inplace=False) call was removed: it re-assigned
    # the columns to themselves (a no-op), and the `inplace` keyword no longer
    # exists in pandas >= 2.0, where it raises TypeError.
    class_report = (
        pd.DataFrame(report_dict).T
        .rename_axis('dimensions', axis=0)
        .reset_index()
    )
    # printing and returning report
    print(class_report)
    return class_report

# Training and testing the models, i.e., calling all the previously defined functions

In [None]:
# Seed shared by all estimators so their results are repeatable.
random_state = 30
# Each entry is [display name, estimator]; the helper functions index into it
# as model[0] / model[1].
models = [
    ['Logistic Regression', LogisticRegression(random_state = random_state, solver='liblinear')],
    ['Decision Tree',DecisionTreeClassifier(random_state = random_state)],
    ['Random Forest', RandomForestClassifier(random_state = random_state)],
]

x_train, x_test, y_train, y_test = generate_train_test_split(transformedDF, 0.30)
# Draws the cross-validation bar chart; the returned summary frame is not kept.
evaluate_model(models, x_train, y_train)
# Maps model name -> classification report DataFrame from the test set.
model_reps = {}

for model in models:
    class_report = fit_and_predict_models(model, x_train, y_train, x_test, y_test)
    model_reps[model[0]] = class_report


In [None]:
# Show only the 'weighted avg' row of the Logistic Regression report.
model_reps['Logistic Regression'][model_reps['Logistic Regression']['dimensions'] == 'weighted avg']