<a href="https://colab.research.google.com/github/ChetanAIML/Team5/blob/main/ML_FinalProject_Team5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Fundamentals Final Project
Chetan Shah, Nathan Edwards, Kayla Wright

## Importing Libraries/Dataset

In [69]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
import warnings
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

collab = 1
try:
    from google.colab import drive
except:
    collab = 0

if(collab):
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)

warnings.filterwarnings('ignore')

In [68]:
 # Kayla
try:
    df = pd.read_csv('/content/drive/MyDrive/diabetes_prediction_dataset.csv')
except:
    pass

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/diabetes_prediction_dataset.csv'

In [70]:
if(collab):
    
    drive.mount("/content/drive", force_remount=True)
    df=pd.read_csv("/content/drive/Shareddrives/Project/ML-Team5/diabetes_prediction_dataset.csv")

In [71]:
if(not collab):
    df=pd.read_csv("diabetes_prediction_dataset.csv")

## Sanity Checks
Checking info, unique values, fixing features, value counts, checking duplicates

In [72]:
# Checking if dataset loaded properly
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


There are no missing values in this dataset. We have 9 columns with 100,000 rows. It does not look like any of the values have an incorrect datatype.

In [74]:
# Checking unique values for this dataset
df.nunique()

gender                    3
age                     102
hypertension              2
heart_disease             2
smoking_history           6
bmi                    4247
HbA1c_level              18
blood_glucose_level      18
diabetes                  2
dtype: int64

I wanted to see the number of unique values are in the smoking history feature. I want to investigate this further.

In [75]:
# Creating a list for the columns that I want to investigate
columns = ['gender', 'smoking_history']
# Getting value counts
for i in columns:
  print(df[i].value_counts())
  print('*' * 25)

Female    58552
Male      41430
Other        18
Name: gender, dtype: int64
*************************
No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64
*************************


It looks like there is a value called 'ever' which in my opinion is a missclassification of 'never' so I want to fix that before going any further.

In [None]:
# Replacing ever to never
df.smoking_history = df.smoking_history.replace('ever', 'never')

In [None]:
# Checking value counts once more
df.smoking_history.value_counts()

In [None]:
# Checking statistical info for this dataset
df.describe()

Mean age is 41 but the lowest age is less than a year old. This must be investigated graphically. I suspect either the ages could be wrong or very young patients were considered in this study.

In [None]:
duplicate = df[df.duplicated()]
duplicate

In [None]:
df = df.drop_duplicates()

In [None]:
duplicate = df[df.duplicated()]
duplicate

In [None]:
#@title EDA

numerical_columns = [col for col in df.select_dtypes(['float', 'int']) if col not in ['heart_disease', 'hypertension', 'diabetes']]
categorical_columns = [col for col in df.columns if col not in numerical_columns]

In [None]:
#@title Exploring categorical variables

for col in categorical_columns:
    print(f'{col:-<20} {df[col].unique()}')


for col in categorical_columns:
    print(col)
    print(df[col].value_counts())
    print('-'*20)

## Univariate Analysis

In [None]:
def boxplot_histogram(data, feature, title, figsize = (12,10), kde = False, bins = None):

  # This function makes a boxplot and a histogram
  # data: dataframe, feature: column of dataframe, figsize: size of figure
  #kde: show curve density, bins: number of bins for the hist plot

  # Setting up figure size, rows
  f2, (ax_box2, ax_hist2) = plt.subplots(nrows = 2, sharex = True, gridspec_kw = {"height_ratios":(0.25, 0.75)}, figsize = figsize)
  # Creating boxplot
  sns.boxplot(data = data, x = feature, ax = ax_box2, showmeans = True, palette = "winter")
  # Creating histplot
  sns.histplot(
    data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="violet"
    ) if bins else sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
  # Putting mean on histogram
  ax_hist2.axvline(data[feature].mean(), color = "black", linestyle = "--")
  plt.title(title)
  # Putting median to histogram
  ax_hist2.axvline(data[feature].median(), color = "green", linestyle = "-")

In [None]:
def stacked_barplot(data, feature, target, title):

# This function makes a stacked barplot
# Data: dataframe, feature: independent feature variable, target: target variable

    # Getting unique values
    count = data[feature].nunique()
    # Sorting values
    sorter = data[target].value_counts().index[-1]
    # Make a crosstab of the factors
    one = pd.crosstab(data[feature], data[target], margins=True).sort_values(
        by=sorter, ascending=False)
    # Print the crosstab
    print(one)
    print("*" * 120)
    # Make another crosstab of the factors
    two = pd.crosstab(data[feature], data[target], normalize="index").sort_values(
        by=sorter, ascending=False)
    # Plot the graph
    two.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    plt.legend(loc="lower left", frameon=False,)
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.title(title)
    plt.show()

In [None]:
boxplot_histogram(df, "age", "Age Distribution")

There seem to be a lot of people around the age 70 - 80. Otherwise it is a fairly good distribution. No outliers seem to be here.

In [None]:
boxplot_histogram(df, "bmi", "BMI Distribution")

BMI is a fairly good distribution. There are very high BMI values but I reasearched this and there in fact has been at least one patient with a BMI over 80. This is entirely possible but quite deadly. I will leave these outliers as is. They may provide more insight to the model rather than hinder it.

In [None]:
boxplot_histogram(df, "blood_glucose_level", "Blood Glucose Level Distribution")

There are only 18 unique values for this column so I am not suprised to see a graph like this. There do not seem to be crazy outliers.

In [None]:
boxplot_histogram(df, "HbA1c_level", "HbA1c_level Level Distribution")

I do not see any outliers that are present for this feature. There are also only 18 different unique values so I am not surprised to see this.

In [None]:
#@title Categorical Counts
plt.figure(figsize=(20, 8))
i = 1
for col in categorical_columns:
    plt.subplot(2, 3, i)
    sns.countplot(data=df, x=df[col])
    i+=1

There are more females than males in this dataset. Much more people have hypertension than not in this dataset, which is also true for heart disease. The most common smoking types are never or no informantion. For our target variable I can see a massive class imbalance that could possibly be an issue for our models.

## Multivariate Analysis

In [None]:
stacked_barplot(df, "smoking_history", "diabetes", "Smoking History and Diabetes")

Former smokers are the most likely to develop diabetes. This makes sense because smoking for a long period of time can cause many, many health issues.

In [None]:
stacked_barplot(df, "gender", "diabetes", "Gender and Diabetes")

Males in this dataset are slightly more likely to have diabetes than women in this dataset.

In [None]:
stacked_barplot(df, "hypertension", "diabetes", "Hypertension and Diabetes")

People with hypertension are much more likely to have diabetes than without.

In [None]:
stacked_barplot(df, "heart_disease", "diabetes", "Heart Disease and Diabetes")

People with heart disease are very much more likely to have diabetes than not.

In [None]:
stacked_barplot(df, "HbA1c_level", "diabetes", "HBA1c Level and Diabetes")

HBA1c is glycated hemoglobin which is chemically linked to sugar. The test for HBA1c shows how much blood suger on average a person has had for the past couple months. It seems to me that once the HBA1c level gets as high as 6.5, then diabetes can occur. Once a patient reaches 6.8 or higher it becomes a 100% chance the patient will have diabetes in this dataset.

In [None]:
stacked_barplot(df, "blood_glucose_level", "diabetes", "Blood Glucose Level and Diabetes")

For blood glucose level in this dataset, it seems that a blood glucose level of 130 or more can correlate with diabetes. Any level higher than 220 is a 100% chance of having diabetes. This makes sense because diabetes makes it harder for the human body to break down sugars.

In [None]:
#@title Target/Numerical Variable Relationship

diabetes_yes = df[df['diabetes'] == 1]
diabetes_no = df[df['diabetes'] == 0]

plt.figure(figsize=(15, 8))
i=1
for col in numerical_columns:
    plt.subplot(2, 2, i)
    sns.distplot(diabetes_yes[col], label='Diabetes')
    sns.distplot(diabetes_no[col], label='No diabetes')
    plt.legend()
    i+=1

Older people are much more likely to have diabetes. People that weigh more are also more likely to have diabetes it seems. As found previously, people are more likely to have diabetes with higher HbA1c and glucose levels.

In [None]:
#@title Correlation between variables
sns.heatmap(df.corr(), annot=True)

The variables that correlate with having diabetes the most are blood glucose levels (0.42) and HbA1c level (0.41). Age is also a small correlation (0.27) with hypertension right behind it (0.20), along with BMI (0.21). I do not see heavily correlated variables amongst the other features that we need to address.

In [None]:
#@title Target/Categorical Variable Relationship

plt.figure(figsize=(15, 10))
i = 1
for col in categorical_columns:
    if col != 'diabetes':
        plt.subplot(2, 2, i)
        sns.heatmap(pd.crosstab(df[col], df['diabetes']), fmt='2', annot=True)
        i+=1

# Handeling Outliers and Categorical data

In [None]:
one_hot = pd.get_dummies(df['smoking_history'])
df = df.drop('smoking_history',axis = 1)
df = df.join(one_hot)

one_hot = pd.get_dummies(df['gender'])
df = df.drop('gender',axis = 1)
df = df.join(one_hot)

df.head()

In [None]:
y = df.diabetes
count_0 = 0
count_1 = 0
for i in y:
  if i==1:
    count_1+=1
  else:
    count_0+=1
plt.pie([count_0,count_1], labels = ["No Diabetes", "Diabetes"])
plt.show()

In [None]:
#Going to make Train and test set before removing outliers because I want them in test set so I need to do before splitting X and y.
msk = np.random.rand(len(df)) < 0.7
train = df[msk]
test = df[~msk]
print(len(train), len(test))

In [None]:
# Deletes data that is 3.5 std above or below.
train_df_no_outliers = (train[(np.abs(stats.zscore(df)) < 3.5).all(axis=1)])
train_df_with_outliers = df

In [None]:
X_train_outliers = train_df_with_outliers
y_train_outliers = X_train_outliers.diabetes
X_train_outliers = X_train_outliers.drop(columns="diabetes")

X_train_no_outliers = train_df_no_outliers
y_train_no_outliers = train_df_no_outliers.diabetes
X_train_no_outliers = X_train_no_outliers.drop(columns="diabetes")

In [None]:
y_test = test.diabetes
X_test = test
X_test = X_test.drop(columns="diabetes")

In [None]:
print(len(X_train_no_outliers),len(X_train_outliers))

In [None]:
#Takes in a list of predicated values and actual and makes a confused matrix
def Confusion_Matrix(predicted, actual):
    confusion_matrix = metrics.confusion_matrix(actual, predicted)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])
    cm_display.plot()
    plt.show()


In [None]:
def Calculate_Model_Scores(predicted, actual):
    print("The model had a accuracy socre of : ", round(accuracy_score(predicted, actual),4))
    print("The model had a precision socre of : ", round(precision_score(predicted, actual),4))
    print("The model had a recall socre of : ", round(recall_score(predicted, actual),4))
    print("The model had a f1 socre of : ", round(f1_score(predicted, actual),4))

## Model 1 - Decision Tree

### Testing With Outliers (not trimmed tree)

In [None]:
model1 = DecisionTreeClassifier(criterion="gini", random_state=1) # CLassifier object
model1.fit(X_train_outliers, y_train_outliers) # Classifier
test_pred = model1.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

In [65]:
f_names=list(X_test.columns) # Names of features
tree.plot_tree(model1, filled=True, feature_names=f_names) # Plotting
plt.show()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "E:\Ana\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\beand\AppData\Local\Temp/ipykernel_72496/2578560331.py", line 3, in <module>
    plt.show()
  File "E:\Ana\lib\site-packages\matplotlib\pyplot.py", line 378, in show
    return _backend_mod.show(*args, **kwargs)
  File "E:\Ana\lib\site-packages\matplotlib_inline\backend_inline.py", line 41, in show
    display(
  File "E:\Ana\lib\site-packages\IPython\core\display.py", line 320, in display
    format_dict, md_dict = format(obj, include=include, exclude=exclude)
  File "E:\Ana\lib\site-packages\IPython\core\formatters.py", line 180, in format
    data = formatter(obj)
  File "E:\Ana\lib\site-packages\decorator.py", line 232, in fun
    return caller(func, *(extras + args), **kw)
  File "E:\Ana\lib\site-packages\IPython\core\formatters.py", line 224, in catch_format_error
    r = method(self, *

TypeError: object of type 'NoneType' has no len()

### Testing With Outliers (trimmed tree)

In [None]:
model2 = tree.DecisionTreeClassifier(max_depth=10, min_samples_split=100, min_samples_leaf=20, criterion="gini")
model2.fit(X_train_outliers, y_train_outliers) # Classifier
test_pred = model2.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

In [None]:
f_names=list(X_test.columns) # Features
tree.plot_tree(model2, filled=True, feature_names=f_names) # Plot using tree
plt.show()

### Testing Without Outliers (untrimmed tree)

In [None]:
model3 = DecisionTreeClassifier(criterion="gini", random_state=1) # CLassifier object
model3.fit(X_train_no_outliers, y_train_no_outliers) # Classifier
test_pred = model3.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

In [None]:
f_names=list(X_test.columns) # Names of features
tree.plot_tree(model3, filled=True, feature_names=f_names) # Plotting
plt.show()

### Testing Without Outliers (trimmed tree)

In [None]:
model4 = tree.DecisionTreeClassifier(max_depth=10, min_samples_split=100, min_samples_leaf=20, criterion="gini")
model4.fit(X_train_no_outliers, y_train_no_outliers) # Classifier
test_pred = model4.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

In [None]:
f_names=list(X_test.columns) # Features
tree.plot_tree(model4, filled=True, feature_names=f_names) # Plot using tree
plt.show()

## Model 2 - Random Forest

### Testing with Outliers

In [None]:
rf1 = RandomForestClassifier(random_state=0) # Random forest classifier object
rf1.fit(X_train_outliers, y_train_outliers) # Training it
test_pred = rf1.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

In [None]:
feature_names = list(X_train_outliers.columns)
print(feature_names)

In [None]:
importances = rf1.feature_importances_
indices = np.argsort(importances) # Sorting importances

plt.figure(figsize=(20, 100)) # Making a large feature
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], align="center") # Making the graph look presentable with a bar
plt.yticks(range(len(indices)), [feature_names[i] for i in indices]) # Sorting y ticks
plt.xlabel("Relative Importance")
plt.show()

In [None]:
# This code I used from the professor!
f_names=list(X_train_outliers.columns) # Feature names
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10,2), dpi=3000) # Setup the 1x5 grid for ploting the trees
for index in range(0, 5): # Iterate through the first five trees (of 100)
  tree.plot_tree(rf1.estimators_[index], feature_names=f_names, filled = True, ax=axes[index]); # Plot the tree
  axes[index].set_title('Estimator: ' + str(index), fontsize = 11) # Print title

### Testing Without Outliers

In [None]:
rf2 = RandomForestClassifier(random_state=0) # Random forest classifier object
rf2.fit(X_train_no_outliers, y_train_no_outliers) # Training it
test_pred = rf2.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

In [None]:
feature_names = list(X_train_no_outliers.columns)
print(feature_names)

In [None]:
importances = rf2.feature_importances_
indices = np.argsort(importances) # Sorting importances

plt.figure(figsize=(20, 100)) # Making a large feature
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], align="center") # Making the graph look presentable with a bar
plt.yticks(range(len(indices)), [feature_names[i] for i in indices]) # Sorting y ticks
plt.xlabel("Relative Importance")
plt.show()

In [None]:
# This code I used from the professor!
f_names=list(X_train_no_outliers.columns) # Feature names
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10,2), dpi=3000) # Setup the 1x5 grid for ploting the trees
for index in range(0, 5): # Iterate through the first five trees (of 100)
  tree.plot_tree(rf2.estimators_[index], feature_names=f_names, filled = True, ax=axes[index]); # Plot the tree
  axes[index].set_title('Estimator: ' + str(index), fontsize = 11) # Print title

## Model 3 -LogisticRegression

### Testing with Outliers

In [None]:
clf1 = LogisticRegression(random_state=0).fit(X_train_outliers, y_train_outliers)
test_pred = clf1.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

### Testing with no Outliers


In [None]:
clf2 = LogisticRegression(random_state=0).fit(X_train_no_outliers, y_train_no_outliers)
test_pred = clf2.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

## Model 4 - AdaBoost

Model Should perform better since it works decently well with class imbalance.

### Testing with Outliers

In [None]:
clf3 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf3.fit(X_train_outliers, y_train_outliers)
test_pred = clf3.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

### Testing without Outliers

In [None]:
clf4 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf4.fit(X_train_no_outliers, y_train_no_outliers)
test_pred = clf4.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

## Model 5 - XG Boost

### Testing with Outliers

In [None]:
clf5 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train_outliers, y_train_outliers)
test_pred = clf5.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

### Testing without Outliers

In [None]:
clf6 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train_no_outliers, y_train_no_outliers)
test_pred = clf6.predict(X_test)
Calculate_Model_Scores(test_pred,y_test)
Confusion_Matrix(test_pred,y_test)

## Model 6 - RandomForestClassifier with Bayesian

In [None]:
from sklearn.ensemble import RandomForestClassifier
from hyperopt import fmin,tpe,Trials,STATUS_OK,hp
from sklearn.model_selection import cross_val_score

In [None]:
RNF = RandomForestClassifier()

In [None]:
Space = {
    'n_estimators' : hp.quniform('n_estimators',50,500,50),
    'criterion' : hp.choice('criterion',["gini", "entropy", "log_loss"]),
    'max_depth' : hp.quniform('max_depth',1,10,1)

}

In [None]:
def Bayesian(Space):
  RNF = RandomForestClassifier(n_estimators = int(Space['n_estimators']),
                               criterion=Space['criterion'],
                               max_depth=int(Space['max_depth']))
  accuracy = cross_val_score(RNF,X_train_no_outliers,y_train_no_outliers,cv=5).mean()
  return {'loss':-accuracy , 'status' : STATUS_OK}

In [None]:
trials = Trials()

In [None]:
Best = fmin(fn=Bayesian,space=Space,algo=tpe.suggest,trials=trials,max_evals=50)

In [None]:
Best

In [None]:
RNF = RandomForestClassifier(criterion = 'entropy', max_depth = 7, n_estimators=400)

In [None]:
RNF.fit(X_train_no_outliers,y_train_no_outliers)

In [None]:
y_pred_bayes = RNF.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [None]:
Calculate_Model_Scores(y_pred_bayes,y_test)

In [None]:
print(classification_report(y_test,y_pred_bayes))

In [None]:
Confusion_Matrix(y_pred_bayes,y_test)

## Model 7 - Support Vector Machines

In [None]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_no_outliers, y_train_no_outliers)
y_pred_svm = svm.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred_svm))

In [None]:
Confusion_Matrix(y_pred_svm,y_test)