<a href="https://colab.research.google.com/github/Deli8t/data/blob/master/Delight_Model_Quality%26_ImprovementsL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project Notebook -Model Quality & Improvement

# Problem Statement
As a data professional working for a pharmaceutical company, you need to develop a
model that predicts whether a patient will be diagnosed with diabetes. The model needs
to have an accuracy score greater than 0.85.






# 1.Data Importation

In [None]:
#We will now need the required libraries to read our csv file 
# from an external  source. 

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


diabetes2 = pd.read_csv('/content/diabetes2.csv')
                 
diabetes2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#2. Data Exploration

In [None]:
# explore the size of a dataset
diabetes2.shape


(768, 9)

In [None]:
#explore the first and last records of a dataset

diabetes2[0:-5] 


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
758,1,106,76,0,0,37.5,0.197,26,0
759,6,190,92,0,0,35.5,0.278,66,1
760,2,88,58,26,16,28.4,0.766,22,0
761,9,170,74,31,0,44.0,0.403,43,1


In [None]:
#explore the data types 

diabetes2.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

# 3.Data Cleaning & Preparation

In [None]:
# Looking for null values in the  dataset
diabetes2.isnull()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [None]:
# Confirming that dataset does not have any missing values

diabetes2[diabetes2.isnull()]


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
763,,,,,,,,,
764,,,,,,,,,
765,,,,,,,,,
766,,,,,,,,,


In [None]:
# Removing duplicated rows from the  Dataframe

diabetes2.drop_duplicates()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
#checking if duplicates have been droppoed
diabetes2.duplicated().sum()

0

In [None]:
#previewing the dataframe after cleaning

diabetes2.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
285,7,136,74,26,135,26.0,0.647,51,0
617,2,68,62,13,15,20.1,0.257,23,0
599,1,109,38,18,120,23.1,0.407,26,0
593,2,82,52,22,115,28.5,1.699,25,0
271,2,108,62,32,56,25.2,0.128,21,0


In [None]:
#We start by Standardizing the numeric features.
#Then we create an instance of the StandardScaler() class and tune it using the training data. 

#data loading and splitting
diabetes2 = pd.read_csv('/content/diabetes2.csv')

target = diabetes2['Outcome']
features = diabetes2.drop('Outcome', axis=1)
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

numeric = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness','Insulin',	'BMI','DiabetesPedigreeFunction',	'Age']

#stanardizing numeric features
scaler = StandardScaler()
scaler.fit(features_train[numeric])
features_train[numeric] = scaler.transform(features_train[numeric])

#transform validation set 
features_valid[numeric] = scaler.transform(features_valid[numeric])


#print(features_train.head())
features_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
731,1.193924,-0.080385,0.856104,-1.270653,-0.677761,-0.445161,-0.635852,-0.986616
33,0.601845,-0.956951,1.1738,-1.270653,-0.677761,-1.495375,-0.855303,-0.474642
221,-0.582314,1.109242,1.067901,-1.270653,-0.677761,-0.049786,1.051754,2.767857
534,-0.878354,-1.426541,-0.732372,0.600885,-0.194487,0.160256,2.430275,-0.815958
62,0.305805,-2.459637,-0.414677,-1.270653,-0.677761,-0.865247,0.377948,0.207989


# 4.Data Modeling
 (Using Decision Trees, Random Forest and Logistic Regression)

In [None]:
#Decision Tree
#spliting dataset into train & test 

diabetes2 = pd.read_csv('/content/diabetes2.csv')

diabetes2_train  = train_test_split(diabetes2, test_size=0.25, random_state=12345)

test_features = diabetes2.drop(['Outcome'], axis=1)
test_target = diabetes2['Outcome'] 

#create model
model = DecisionTreeRegressor (random_state=12345)
#training model
model.fit(features, target)

#predictions
predictions = model.predict(test_features)

mse = mean_squared_error(test_target, predictions)

#Calculate RMSE

print('RMSE:', mse**0.5)



RMSE: 0.0


In [None]:
#Random Forest
#spliting dataset into train & test 


diabetes2 = pd.read_csv('/content/diabetes2.csv')

diabetes2_train  = train_test_split(diabetes2, test_size=0.25, random_state=12345)

test_features = diabetes2.drop(['Outcome'], axis=1)
test_target = diabetes2['Outcome'] 


#create model
model = RandomForestRegressor (random_state=12345, n_estimators=3)
#training model
model.fit(features, target)

#predictions
predictions = model.predict(test_features)

mse = mean_squared_error(test_target, predictions)

#Calculate RMSE

print('RMSE:', mse**0.5)

RMSE: 0.22916666666666669


In [None]:
#Logistic Regression
#spliting dataset into train & test 

diabetes2 = pd.read_csv('/content/diabetes2.csv')

diabetes2_train  = train_test_split(diabetes2, test_size=0.25, random_state=12345)

test_features = diabetes2.drop(['Outcome'], axis=1)
test_target = diabetes2['Outcome'] 


features_train = diabetes2.drop(['Outcome'], axis=1)
target_train = diabetes2['Outcome'] 

#create model
model = LogisticRegression(random_state=12345, solver='liblinear')
#train model
model.fit(features_train,target_train)

#checking score accuracy
mse =(model.score(features_train, target_train))

print('RMSE:', mse**0.5)

RMSE: 0.8809322713277489


# 5.Model Evaluation

In [None]:
#Training the best model with optimal hyperparameters on the entire dataset. 

#reading dataset and spliting data
diabetes2 = pd.read_csv('/content/diabetes2.csv')
diabetes2_train  = train_test_split(diabetes2, test_size=0.25, random_state=12345)


test_features = diabetes2.drop(['Outcome'], axis=1)
test_target = diabetes2['Outcome'] 

#create model
final_model = DecisionTreeRegressor (random_state=12345)
#training model
final_model.fit(features, target)

#predictions
predictions = final_model.predict(test_features)

mse = mean_squared_error(test_target, predictions)

#Calculate RMSE

mse = mean_squared_error(test_target, predictions)

rmse = mse ** 0.5
print('RMSE:', mse**0.5)


RMSE: 0.0


# 6.Hyparameter Tuning

In [None]:

#reading daaframe and spliting dataset

diabetes2 = pd.read_csv('/content/diabetes2.csv')

diabetes2_train  = train_test_split(diabetes2, test_size=0.25, random_state=12345)

test_features = diabetes2.drop(['Outcome'], axis=1)
test_target = diabetes2['Outcome'] 

features_valid = diabetes2.drop(['Outcome'], axis=1)
target_valid = diabetes2['Outcome'] 

#create model
model = DecisionTreeRegressor (random_state=12345)
#training model
model.fit(features, target)

#predictions
#predictions = model.predict(test_features)

#mse = mean_squared_error(test_target, predictions)

for depth in range(1, 6):
        model =  DecisionTreeClassifier(random_state=12345,max_depth=depth)

        model.fit(features_train, target_train)

       
        predictions_valid = model.predict(features_valid)

        print("max_depth =", depth, ": ", end='')
        print(accuracy_score(target_valid, predictions_valid))

max_depth = 1 : 0.3502604166666667
max_depth = 2 : 0.3502604166666667
max_depth = 3 : 0.5234375
max_depth = 4 : 0.6497395833333334
max_depth = 5 : 0.6510416666666666


# 7.Findings and Recommendations


After analysing the data and testing the various models for prediction and accuracy, i concluded that LogisticRegression is the best model with an Accuracy score of 0.88 (88% )
