In [1]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

# Import functions

In [2]:
import ipynb
from ipynb.fs.defs.ML_Comparison_API import *

# Read CSV files

In [3]:
# Load the two local CSV datasets. read_csv is not imported explicitly in this
# notebook; presumably it arrives through the star import from ML_Comparison_API.
diabetesData = read_csv('Data/diabetes.csv')
# The wine-quality file is read with ';' as the field separator.
wineData = read_csv('Data/winequality.csv', sep=';')

# scikit-learn's bundled diabetes dataset, requested in DataFrame form.
# `frame` is the combined table that later cells index by feature names and 'target'.
data = load_diabetes(as_frame=True)
frame = data.frame

In [4]:
# Split every dataset into two non-overlapping cohorts around a threshold.
# Rows exactly at the threshold always go to the second ("older"/"more"/"women") cohort.

# Diabetes CSV: below-median age vs. median age and above.
age_median = diabetesData['Age'].median()
youngerDiabetes = diabetesData.loc[diabetesData['Age'] < age_median]
olderDiabetes = diabetesData.loc[diabetesData['Age'] >= age_median]

# Wine CSV: below-median pH vs. median pH and above.
ph_median = wineData['pH'].median()
lessPH = wineData.loc[wineData['pH'] < ph_median]
morePH = wineData.loc[wineData['pH'] >= ph_median]

# scikit-learn diabetes frame: split on the sign of the 'sex' column.
# NOTE(review): which sign corresponds to which sex is not established anywhere
# in this notebook -- confirm against the dataset description.
sex_code = frame['sex']
menDiabetes = frame.loc[sex_code < 0]
womenDiabetes = frame.loc[sex_code >= 0]

In [5]:
# 'Outcome' is the label; every other column of the diabetes table is a predictor.
diabetes_target = ['Outcome']
diabetes_features = diabetesData.columns.difference(diabetes_target)

# Predictors (X) and target (y) for each age cohort.
younger_X, younger_y = youngerDiabetes[diabetes_features], youngerDiabetes[diabetes_target]
older_X, older_y = olderDiabetes[diabetes_features], olderDiabetes[diabetes_target]

# Train/test split per cohort: fixed seed for repeatability, stratified on the
# label so both partitions keep the cohort's outcome proportions.
younger_X_train, younger_X_test, younger_y_train, younger_y_test = train_test_split(
    younger_X, younger_y, random_state=1, stratify=younger_y
)
older_X_train, older_X_test, older_y_train, older_y_test = train_test_split(
    older_X, older_y, random_state=1, stratify=older_y
)

In [6]:
# 'quality' is the label; every other column of the wine table is a predictor.
wine_target = ['quality']
wine_features = wineData.columns.difference(wine_target)

# Predictors (X) and target (y) for each pH cohort.
less_X, less_y = lessPH[wine_features], lessPH[wine_target]
more_X, more_y = morePH[wine_features], morePH[wine_target]

# Train/test split per cohort: fixed seed, stratified on the quality label.
less_X_train, less_X_test, less_y_train, less_y_test = train_test_split(
    less_X, less_y, random_state=1, stratify=less_y
)
more_X_train, more_X_test, more_y_train, more_y_test = train_test_split(
    more_X, more_y, random_state=1, stratify=more_y
)

In [7]:
# Pieces of the scikit-learn dataset object. frame_X and frame_y are kept for
# reference; the cohort selections below go through `frame` instead.
frame_X = data.data
frame_y = data.target
frame_features = data.feature_names

# The label column inside `frame`.
frame_target = ['target']

# Predictors (X) and target (y) for each sex cohort.
men_X, men_y = menDiabetes[frame_features], menDiabetes[frame_target]
women_X, women_y = womenDiabetes[frame_features], womenDiabetes[frame_target]

# Train/test split per cohort with a fixed seed; no stratification is requested
# here (these splits feed the LinearRegression models).
men_X_train, men_X_test, men_y_train, men_y_test = train_test_split(
    men_X, men_y, random_state=1
)
women_X_train, women_X_test, women_y_train, women_y_test = train_test_split(
    women_X, women_y, random_state=1
)

In [8]:
# Save every cohort's train/test split back to CSV files under Data/.
def _save_split(prefix, X_train, y_train, X_test, y_test):
    """Write one cohort's four split tables to CSV.

    The four file names are derived from a single prefix so that the
    features/target and train/test suffixes cannot be crossed by hand:
        Data/<prefix>_features_train.csv
        Data/<prefix>_target_train.csv
        Data/<prefix>_features_test.csv
        Data/<prefix>_target_test.csv

    Parameters
    ----------
    prefix : str
        Cohort name used at the start of each file name.
    X_train, y_train, X_test, y_test
        The four objects returned by train_test_split for this cohort; each is
        handed unchanged to the to_csv helper.
    """
    to_csv(X_train, f"Data/{prefix}_features_train.csv")
    to_csv(y_train, f"Data/{prefix}_target_train.csv")
    to_csv(X_test, f"Data/{prefix}_features_test.csv")
    to_csv(y_test, f"Data/{prefix}_target_test.csv")


_save_split("younger_diabetes", younger_X_train, younger_y_train, younger_X_test, younger_y_test)
_save_split("older_diabetes", older_X_train, older_y_train, older_X_test, older_y_test)

_save_split("less_PH", less_X_train, less_y_train, less_X_test, less_y_test)
_save_split("more_PH", more_X_train, more_y_train, more_X_test, more_y_test)

# The men/women cohorts previously wrote y_train to "*_target_test.csv" and
# X_test to "*_features_train.csv", so the test writes overwrote the train
# files and no features_test / target_train file was produced. They now follow
# the same naming scheme as the other cohorts.
_save_split("men_diabetes", men_X_train, men_y_train, men_X_test, men_y_test)
_save_split("women_diabetes", women_X_train, women_y_train, women_X_test, women_y_test)

# Binary classification

In [9]:
# Preview the first rows of the diabetes table to check its column layout.
diabetesData.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Check whether any cell in the diabetes table is missing.
# A result of False means the dataset contains no missing values.
diabetesData.isnull().values.any()

False

In [11]:
# Create one untrained decision tree model per age cohort via the project helper.
younger_decisionTree_model = create_decision_tree()
older_decisionTree_model = create_decision_tree()

# Train (fit) each classifier on its own cohort's training split.
# Features are passed as a plain array (.values); the target is passed as the
# single-column 'Outcome' selection without conversion.
fit_decision_tree(younger_X_train.values, younger_y_train, younger_decisionTree_model)
fit_decision_tree(older_X_train.values, older_y_train, older_decisionTree_model)

# Save the trained classifiers under ML_Models/ (the helper is described as
# using pickle; its implementation is not visible in this notebook).
save_ML_model('ML_Models/younger_DecisionTreeClassifier', younger_decisionTree_model)
save_ML_model('ML_Models/older_DecisionTreeClassifier', older_decisionTree_model)

# Multi-Class Classification

In [12]:
# Preview the first rows of the wine-quality table to check its column layout.
wineData.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [13]:
# Check whether any cell in the wine table is missing
# (False means there are no missing values).
wineData.isnull().values.any()

False

In [14]:
# Label column and predictor columns of the wine table. These hold the same
# values as wine_target / wine_features defined in the earlier split cell;
# `target` is what the class-counting cells below index with.
target = ['quality']
features = wineData.columns.difference(target)

In [15]:
# Distinct quality labels present in the wine data, i.e. the classes to predict.
np.unique(wineData[target])

array([3, 4, 5, 6, 7, 8, 9])

In [16]:
# Number of distinct quality labels in the full wine table.
number_of_classes = np.unique(wineData[target]).size

# Build and fit one KNN classifier per pH cohort. Features go in as plain
# arrays and the single-column target is flattened to 1-D.
# NOTE(review): n_neighbors (the k in k-NN) is set to the number of classes;
# the two quantities are independent, so confirm this choice is intentional.
less_knn = KNeighborsClassifier(n_neighbors=number_of_classes).fit(
    less_X_train.values, less_y_train.values.ravel()
)
more_knn = KNeighborsClassifier(n_neighbors=number_of_classes).fit(
    more_X_train.values, more_y_train.values.ravel()
)

# Save the fitted classifiers under ML_Models/ through the project helper.
save_ML_model('ML_Models/less_KNeighborsClassifier', less_knn)
save_ML_model('ML_Models/more_KNeighborsClassifier', more_knn)

# Regression

In [17]:
# One ordinary least-squares model per sex cohort, created and fitted on that
# cohort's training split in a single step (fit returns the estimator itself).
men_regression = LinearRegression().fit(men_X_train, men_y_train)
women_regression = LinearRegression().fit(women_X_train, women_y_train)

# Save the fitted regressors under ML_Models/ through the project helper.
save_ML_model('ML_Models/men_LinearRegression', men_regression)
save_ML_model('ML_Models/women_LinearRegression', women_regression)