# Assignment Answers

In [None]:
# Importing Libraries
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Filtering Warnings
filterwarnings('ignore')

In [None]:
# Path of data
path = 'Datasets/diabetes.csv'

In [None]:
# Reading File
df = pd.read_csv(path)

In [None]:
df.head()

# 1.

In [None]:
# Examining the dataset

# Checking types of features present
df.dtypes

From the above output, we can say that every feature is of numeric type.

BMI and DiabetesPedigreeFunction are of float type, while the remaining features are of integer type.

In [None]:
# Checking statistics of the features
df.describe()

In [None]:
# Plotting a pair plot to see the relationship between each pair of features.
# sns.pairplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(...) call would only leave an extra, empty figure behind.
sns.pairplot(df.iloc[:, :-1], markers='+', diag_kind="hist", corner=True)
plt.show()

# 2.

In [None]:
# Checking for null values
df.isnull().sum()

From the above output, we can say that there are no null values present

In [None]:
# Checking for duplicate values
df.duplicated().sum()

From the above output, we can say that there are no duplicate records present

In [None]:
# Box plot of every column except the last one, to look for outliers
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df.iloc[:, :-1], ax=ax)

From the above diagram, we can say that almost every feature contains outliers.

Therefore, we need to remove them.


In [None]:
# Function to remove outliers
def remove_outlier(df, col_name):
    """Return the rows of ``df`` whose ``col_name`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from the
    values of ``df[col_name]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Name of the numeric column used to compute the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index labels kept) whose ``col_name``
        value is inside the fences, bounds included.
    """
    q1 = df[col_name].quantile(0.25)
    q3 = df[col_name].quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr

    # Inclusive bounds: a value lying exactly on a fence is not an outlier.
    # With strict comparisons a column whose IQR is 0 (fence_low == fence_high)
    # would lose every row.
    within_fences = df[col_name].between(fence_low, fence_high)

    return df.loc[within_fences]

# Remove Outliers
# Only the 'Insulin' column is filtered here; no other column is passed
# through remove_outlier in this cell.
df_new = remove_outlier(df,'Insulin')

In [None]:
# Scaling the datasets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                   'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
# Storing the scaled data into new dataframe.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - confirm this is acceptable.
df_new2 = pd.DataFrame(scaler.fit_transform(df_new.iloc[:, :-1]), columns=feature_columns)
# df_new2 has a fresh 0..n-1 index, while df/df_new keep the original row
# labels. Assigning a Series would align on index labels and attach the labels
# of the unfiltered rows 0..n-1 to the wrong samples, so the target is taken
# from the filtered frame and assigned positionally.
df_new2["Outcome"] = df_new["Outcome"].to_numpy()

In [None]:
# Box plot of the scaled features (every column except the last) after the outlier filter
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_new2.iloc[:, :-1], ax=ax)

# 3.

In [None]:
# Importing Library
from sklearn.model_selection import train_test_split

In [None]:
# Splitting dataset into training and testing dataset.
# Every column except the last is used as a predictor; the last column is the target.
X, y = df_new2.iloc[:, :-1], df_new2.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 4.

In [None]:
# Importing Library
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.model_selection import GridSearchCV

Training the model

In [None]:

# Baseline tree with default hyperparameters.
# A fixed random_state makes the fit reproducible across kernel restarts
# (same seed as the train/test split above).
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree = decision_tree.fit(X_train, y_train)
y_pred1 = decision_tree.predict(X_test)

Hyperparameter tuning using GridSearchCV

In [None]:
# Importing library
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid explored by the search.
parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
    # 'auto' is not used: for classifiers it was only an alias of 'sqrt' and
    # recent scikit-learn releases reject it. None (consider every feature at
    # each split) is searched instead.
    'max_features': [None, 'sqrt', 'log2'],
}
# initializing classifier
clf = GridSearchCV(decision_tree, param_grid=parameters, cv=5, scoring='accuracy')

In [None]:
# Run the grid search on the training split, then predict the test split
clf.fit(X_train, y_train)
y_pred2 = clf.predict(X_test)

In [None]:
# Checking for best parameters
clf.best_params_

From the above output, we get the following optimal parameters:

criterion: 'gini'<br>
max_depth: 2<br>
max_features: 'auto'<br>
splitter: 'random'<br>

# 5.

In [None]:
# Importing library
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay,roc_curve,RocCurveDisplay

In [None]:
# Metrics for the baseline (untuned) tree.
# scikit-learn metric functions take (y_true, y_pred) in that order; passing
# them the other way round swaps precision with recall and reports the support
# of the predictions instead of the true labels.
accuracy = accuracy_score(y_test, y_pred1)
precision = precision_score(y_test, y_pred1)
recall = recall_score(y_test, y_pred1)
f1 = f1_score(y_test, y_pred1)
print("Metrics before hypertuning of parameter\n\n")
print("{}".format("="*60))
print("Accuracy: {:.3f}  Precision: {:.3f}  Recall: {:.3f}  F1: {:.3f}".format(accuracy, precision, recall, f1))
print(classification_report(y_test, y_pred1))
print("\n\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred1)
print(cm)
print("{}".format("="*60))
# Draw on an explicitly created axes: ConfusionMatrixDisplay.plot() would
# otherwise open its own figure and leave a plt.figure(...) one empty.
fig, ax = plt.subplots(figsize=(12, 8))
cm_display = ConfusionMatrixDisplay(cm).plot(ax=ax)
print("ROC Curve:")
# roc_curve expects scores; the predicted probability of the positive class is
# used because hard 0/1 predictions give only a single operating point.
y_score1 = decision_tree.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score1, pos_label=decision_tree.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
plt.show()

In [None]:
# Metrics for the grid-searched model.
# scikit-learn metric functions take (y_true, y_pred) in that order; passing
# them the other way round swaps precision with recall and reports the support
# of the predictions instead of the true labels.
accuracy = accuracy_score(y_test, y_pred2)
precision = precision_score(y_test, y_pred2)
recall = recall_score(y_test, y_pred2)
f1 = f1_score(y_test, y_pred2)
print("Metrics after hypertuning of parameter\n\n")
print("{}".format("="*60))
print("Accuracy: {:.3f}  Precision: {:.3f}  Recall: {:.3f}  F1: {:.3f}".format(accuracy, precision, recall, f1))
print(classification_report(y_test, y_pred2))
print("\n\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred2)
print(cm)
print("{}".format("="*60))
print("Confusion Matrix:")
# Draw on an explicitly created axes: ConfusionMatrixDisplay.plot() would
# otherwise open its own figure and leave a plt.figure(...) one empty.
fig, ax = plt.subplots(figsize=(12, 8))
cm_display = ConfusionMatrixDisplay(cm).plot(ax=ax)
print("ROC Curve:")
# roc_curve expects scores; the predicted probability of the positive class is
# used because hard 0/1 predictions give only a single operating point.
y_score2 = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score2, pos_label=clf.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
plt.show()

# 6.

In [None]:
# Visualizing decision tree
from sklearn import tree
plt.figure(figsize=(20,20))
# Pass the column names so nodes show the feature name instead of x[i].
tree.plot_tree(decision_tree, feature_names=list(X_train.columns), filled=True)
plt.show()

In [None]:
# Print the tuned tree as text rules.
# The estimator that the grid search refit with its best parameters is reused,
# so the rules shown belong to the model that was actually evaluated rather
# than to a tree rebuilt from hand-typed hyperparameters.
decision_tree_new = clf.best_estimator_
r = export_text(decision_tree_new, feature_names=list(X_train.columns))
print(r)

From the above output, we get the following results:
- The root node of this tree splits on the feature __BMI__ with a threshold of __2.50__.

# 7.

In [None]:
# Importing Library
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay,roc_curve,RocCurveDisplay

In [None]:
# Validating model on new data
decision_tree = DecisionTreeClassifier()
decision_tree = decision_tree.fit(X_train, y_train)
y_pred1 = decision_tree.predict(X_test)

In [None]:
# Checking result
accuracy= accuracy_score(y_pred1,y_test)
print("{}".format("="*60))
print(classification_report(y_pred1,y_test))
print("{}".format("="*60))
plt.figure(figsize=(12,8))
cm = confusion_matrix(y_test, y_pred1)
cm_display = ConfusionMatrixDisplay(cm).plot()