# Importing the required libraries

In [16]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sma
import statsmodels.api as sm
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import f1_score, classification_report
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

from IPython.display import display, HTML
display(HTML("<style>.container{ width: 90%; !important}</style>"))

# Exploratory Data Analysis (EDA)
    1. Merge one table at a time
	2. While analyzing the object based columns, if some columns have only one kind of value, better to delete such columns as they would just increase the data size and provide no such informationas its common across all the data points.
	3. If different tables have different ways to store dates, then we need to bring them all together to the highest possible hierarchy, hour-day-date-week-month-quarter-year-decade-century.
		a. Might need to aggregate few columns using median, mean, etc. totally depends on the type of data.
		b. Aggregation is needed. But the method needs to be decided.
	4. Once we merge all the tables, we try to fill columns in which we use some complex method of Imputations.
	5. The combined data should not have any irrelevant records. It will impact the analysis.
	6. Try to explore the dependent variable wrt to final data created.
	7. Treat the missing values.
    8. Perform the outliers treatment.

## EDA for Table:

### Looking at the table characterstics

In [2]:
#Loading the data
data = pd.read_csv(r'C:\Dropbox\GenAI\Analytics_Vidhya\Foundational_ML_Algorithms_I\Project_Anova_Insurance_Decision_Tree\Healthcare_Dataset_Preprocessed.csv')
data.head()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,...,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,2.0,26.0,111.0,198.0,99.0,72.0,4.0,1.0,5.0,5.0,...,1,2,1,0,1,False,True,True,False,False
1,8.0,24.0,121.0,199.0,103.0,75.0,2.0,1.0,2.0,9.0,...,1,2,1,2,2,False,False,True,False,False
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,...,2,0,0,1,0,True,False,False,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,7.0,3.0,3.0,3.0,...,1,2,1,2,0,True,False,False,True,False
4,24.0,26.0,146.0,202.0,99.0,76.0,10.0,2.0,5.0,1.0,...,2,0,2,0,2,False,True,False,True,False


In [5]:
data.dtypes

Age                     float64
BMI                     float64
Blood_Pressure          float64
Cholesterol             float64
Glucose_Level           float64
Heart_Rate              float64
Sleep_Hours             float64
Exercise_Hours          float64
Water_Intake            float64
Stress_Level            float64
Target                    int64
Smoking                   int64
Alcohol                   int64
Diet                      int64
MentalHealth              int64
PhysicalActivity          int64
MedicalHistory            int64
Allergies                 int64
Diet_Type_Vegan            bool
Diet_Type_Vegetarian       bool
Blood_Group_AB             bool
Blood_Group_B              bool
Blood_Group_O              bool
dtype: object

In [3]:
data.describe()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies
count,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0
mean,33.806786,25.660697,130.382658,199.091528,100.225678,73.613782,6.951409,1.892345,3.580899,4.382134,0.521416,0.99047,0.995183,1.005864,0.998429,1.003351,1.004713,0.989318
std,24.566473,1.942369,27.878476,1.969234,2.157999,1.681538,2.352152,1.378714,1.622874,2.078593,0.499567,0.815521,0.816653,0.815877,0.821844,0.8088,0.813506,0.815699
min,0.0,19.0,22.0,192.0,93.0,67.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,24.0,113.0,198.0,99.0,73.0,5.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.0,26.0,134.0,199.0,100.0,74.0,7.0,2.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,50.0,27.0,150.0,200.0,102.0,75.0,9.0,3.0,5.0,6.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,100.0,32.0,225.0,207.0,107.0,80.0,14.0,8.0,10.0,12.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [4]:
data.describe(include = 'O')

ValueError: No objects to concatenate

### Removing/Dropping any non-informational column

### Looking at pairplots, boxplots for insights. Treating Outliers
    1. Detect outliers- 
    	1. Box plots, histograms for univariate.
    	2. Scatter plots for bivariate data.
    	3. Z-score of 3-std dev. Can be helpful.
    	4. IQR can help with limits. 
    2. Treat outliers-
    	1. Remove them by deleting the rows  - NOT RECOMMENDED, INFORMANTION LOSS.
    	2. Transform Values - use mathematical fxns.
    	3. Flooring and capping - most generic
    	4. Mean-Median Imputation
Treating outliers is not a necessity but recommended practice. It sometimes is obvious when we see the data.

### Analysis:
    1.
    2.
    3.

### Merging the table

## EDA for Table:

### Looking at the table characterstics

In [2]:
#Loading the data
data = pd.read_csv(r'C:\Dropbox\GenAI\Analytics_Vidhya\Foundational_ML_Algorithms_I\Project_Anova_Insurance_Decision_Tree\Healthcare_Dataset_Preprocessed.csv')
data.head()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,...,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,2.0,26.0,111.0,198.0,99.0,72.0,4.0,1.0,5.0,5.0,...,1,2,1,0,1,False,True,True,False,False
1,8.0,24.0,121.0,199.0,103.0,75.0,2.0,1.0,2.0,9.0,...,1,2,1,2,2,False,False,True,False,False
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,...,2,0,0,1,0,True,False,False,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,7.0,3.0,3.0,3.0,...,1,2,1,2,0,True,False,False,True,False
4,24.0,26.0,146.0,202.0,99.0,76.0,10.0,2.0,5.0,1.0,...,2,0,2,0,2,False,True,False,True,False


In [5]:
data.dtypes

Age                     float64
BMI                     float64
Blood_Pressure          float64
Cholesterol             float64
Glucose_Level           float64
Heart_Rate              float64
Sleep_Hours             float64
Exercise_Hours          float64
Water_Intake            float64
Stress_Level            float64
Target                    int64
Smoking                   int64
Alcohol                   int64
Diet                      int64
MentalHealth              int64
PhysicalActivity          int64
MedicalHistory            int64
Allergies                 int64
Diet_Type_Vegan            bool
Diet_Type_Vegetarian       bool
Blood_Group_AB             bool
Blood_Group_B              bool
Blood_Group_O              bool
dtype: object

In [3]:
data.describe()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,Target,Smoking,Alcohol,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies
count,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0,9549.0
mean,33.806786,25.660697,130.382658,199.091528,100.225678,73.613782,6.951409,1.892345,3.580899,4.382134,0.521416,0.99047,0.995183,1.005864,0.998429,1.003351,1.004713,0.989318
std,24.566473,1.942369,27.878476,1.969234,2.157999,1.681538,2.352152,1.378714,1.622874,2.078593,0.499567,0.815521,0.816653,0.815877,0.821844,0.8088,0.813506,0.815699
min,0.0,19.0,22.0,192.0,93.0,67.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,24.0,113.0,198.0,99.0,73.0,5.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.0,26.0,134.0,199.0,100.0,74.0,7.0,2.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,50.0,27.0,150.0,200.0,102.0,75.0,9.0,3.0,5.0,6.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,100.0,32.0,225.0,207.0,107.0,80.0,14.0,8.0,10.0,12.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [8]:
data.describe(include = 'O')

### Removing/Dropping any non-informational column

### Looking at pairplots, boxplots for insights. Treating Outliers
1. Detect outliers- 
    1. Box plots, histograms for univariate.
    2. Scatter plots for bivariate data.
    3. Z-score of 3-std dev. Can be helpful.
    4. IQR can help with limits. 
2. Treat outliers-
    1. Remove them by deleting the rows  - NOT RECOMMENDED, INFORMANTION LOSS.
    2. Transform Values - use mathematical fxns.
    3. Flooring and capping - most generic
    4. Mean-Median Imputation
Treating outliers is not a necessity but recommended practice. It sometimes is obvious when we see the data.

### Analysis:
    1.
    2.
    3.

### Merging the table

# Data Preparation
Check the data and prepare it for the type of ML model we want to use supervised, unsupervise, reinforcement learning.

	1. Usually data is mostly ready for unsupervised models.
	2. We need to randomize the data from date PoV for supervised model to make a date base reduction.
	3. Sometimes we might need to add a new column for classification based column or regression based column based on the data set.
	4. Remove irrelevant columns from the final data set once again.

AI Based data pre-processing
    
    1. Use AI to expedite these manual processes.
Give promts to make codes for segments we want to do in our EDA or Data Preprocessing or Data Preperation.

#### Converting Categorical boolean variables to 1,0 

In [17]:
data.dtypes.to_frame()

Unnamed: 0,0
Age,float64
BMI,float64
Blood_Pressure,float64
Cholesterol,float64
Glucose_Level,float64
Heart_Rate,float64
Sleep_Hours,float64
Exercise_Hours,float64
Water_Intake,float64
Stress_Level,float64


In [21]:
'''var = ['Diet_Type_Vegan', 'Diet_Type_Vegetarian', 'Blood_Group_AB', 'Blood_Group_B', 'Blood_Group_O']

for i in var:
    data[var] = data[var].apply(lambda x: 1 if x == True else 0)'''

data['Diet_Type_Vegan'] = data['Diet_Type_Vegan'].apply(lambda x: 1 if x == True else 0)

data['Diet_Type_Vegetarian'] = data['Diet_Type_Vegetarian'].apply(lambda x: 1 if x == True else 0)

data['Blood_Group_AB'] = data['Blood_Group_AB'].apply(lambda x: 1 if x == True else 0)

data['Blood_Group_B'] = data['Blood_Group_B'].apply(lambda x: 1 if x == True else 0)

data['Blood_Group_O'] = data['Blood_Group_O'].apply(lambda x: 1 if x == True else 0)

data.head()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,...,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type_Vegan,Diet_Type_Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,2.0,26.0,111.0,198.0,99.0,72.0,4.0,1.0,5.0,5.0,...,1,2,1,0,1,0,1,1,0,0
1,8.0,24.0,121.0,199.0,103.0,75.0,2.0,1.0,2.0,9.0,...,1,2,1,2,2,0,0,1,0,0
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,...,2,0,0,1,0,1,0,0,0,0
3,25.0,21.0,150.0,199.0,102.0,70.0,7.0,3.0,3.0,3.0,...,1,2,1,2,0,1,0,0,1,0
4,24.0,26.0,146.0,202.0,99.0,76.0,10.0,2.0,5.0,1.0,...,2,0,2,0,2,0,1,0,1,0


### Separating the feature variable and the target variable

In [23]:
X = data.drop(columns = 'Target')
y = data['Target']

X.shape, y.shape

((9549, 22), (9549,))

<h3>Performing train-test split</h3>

In [30]:
# Importing the train-test split from scikit-learn
from sklearn.model_selection import train_test_split

# Performing train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

y_train.value_counts(normalize = 1), y_test.value_counts(normalize = 1)

(Target
 1    0.51915
 0    0.48085
 Name: proportion, dtype: float64,
 Target
 1    0.526702
 0    0.473298
 Name: proportion, dtype: float64)

# Modelling and Evaluation

## We build different models and evaluate them as needed

In [31]:
# Import DecisionTreeRegressor from sklearn.tree
from sklearn.tree import DecisionTreeRegressor

# Creating a decision tree classifier object called DT_model
DT_model = DecisionTreeRegressor(random_state = 42)

# Building the model using the training data
DT_model.fit(X_train, y_train)

In [32]:
# Perfoming prediction on both the train and test data
y_pred_train = DT_model.predict(X_train)
y_pred_test = DT_model.predict(X_test)

<h3>Calculating feature importance</h3>

In [36]:
#Retreiving the score of each feature
feature_imp = DT_model.feature_importances_

#  Sort the features by importance in descending order in a dataframe.
imp = pd.DataFrame({'Col_names': X_train.columns, 'Importance': feature_imp}).sort_values(by='Importance', ascending=False)
# Calculate the cumulative sum of the 'Importance' column and store it in a new column called 'cum_imp'
imp['cum_imp'] = imp.Importance.cumsum()
imp

Unnamed: 0,Col_names,Importance,cum_imp
1,BMI,0.224797,0.224797
3,Cholesterol,0.168142,0.39294
2,Blood_Pressure,0.123932,0.516872
4,Glucose_Level,0.094741,0.611613
9,Stress_Level,0.091766,0.703379
6,Sleep_Hours,0.074753,0.778132
8,Water_Intake,0.058018,0.836151
0,Age,0.04873,0.884881
5,Heart_Rate,0.04244,0.927321
7,Exercise_Hours,0.023483,0.950804


In [37]:
drop_col = imp[imp.cum_imp >  0.99]['Col_names'].to_list()
drop_col

['Diet', 'Diet_Type_Vegan', 'Blood_Group_B', 'Blood_Group_AB', 'Blood_Group_O']

In [38]:
# Dropping the columns from X_train
X_train.drop(columns = drop_col, axis = 1, inplace = True)

In [39]:
# Dropping the columns from X_test
X_test.drop(columns = drop_col, axis = 1, inplace = True)

### Hyperparameter tuning

In [45]:
# Storing the depth used by default in the model
depth = DT_model.get_depth()

In [46]:
depth

22

In [47]:
# List of max_depth values 
max_depth_list = list(range(depth,0,-3))

In [48]:
# Dictionary to store the train and test r square scores

train_scores = {}
test_scores = {}

# Loop through max_depth values and train the models
for depth in max_depth_list:
    # Initialize the Decision Tree model with the current max_depth value
    DT_model = DecisionTreeRegressor(max_depth=depth, random_state=42)
    
    # Train the model
    DT_model.fit(X_train, y_train)
    
    # Make predictions on the train dataset
    y_train_pred = DT_model.predict(X_train)

    # Make predictions on the test dataset
    y_test_pred = DT_model.predict(X_test)
    
    # Store the train r square score in the dictionary with the max_depth as the key
    train_scores[depth] = r2_score(y_train, y_train_pred)
    
    # Store the test r square score in the dictionary with the max_depth as the key
    test_scores[depth] = r2_score(y_test, y_test_pred)
    

# Print the train and test r square scores for each model
for depth in max_depth_list:
    print(f"max_depth = {depth}|\
    Train Score = {train_scores[depth]:.3f} |\
    Test score = {test_scores[depth]:.3f}")
    print('_'*65)

max_depth = 22|    Train Score = 1.000 |    Test score = 0.605
_________________________________________________________________
max_depth = 19|    Train Score = 0.998 |    Test score = 0.616
_________________________________________________________________
max_depth = 16|    Train Score = 0.984 |    Test score = 0.637
_________________________________________________________________
max_depth = 13|    Train Score = 0.936 |    Test score = 0.657
_________________________________________________________________
max_depth = 10|    Train Score = 0.849 |    Test score = 0.679
_________________________________________________________________
max_depth = 7|    Train Score = 0.705 |    Test score = 0.646
_________________________________________________________________
max_depth = 4|    Train Score = 0.489 |    Test score = 0.513
_________________________________________________________________
max_depth = 1|    Train Score = 0.171 |    Test score = 0.175
_____________________________________

In [51]:
# Dictionary to store the train and test r square scores

train_scores = {}
test_scores = {}

# Fill the values in the below list and assign value to the variable!
min_sample_leaf_list = list(range(50, 0, -5))
depth = 7

# Loop through min_sample_leaf values and train the models
for min_sample_leaf in min_sample_leaf_list:
    
    # Initialize the Decision Tree model with the current min_samples_leaf value
    DT_model = DecisionTreeRegressor(min_samples_leaf=min_sample_leaf, max_depth=depth, random_state=42)
    
    # Train the model
    DT_model.fit(X_train, y_train)
    
    # Make predictions on the train dataset
    y_train_pred = DT_model.predict(X_train)

    # Make predictions on the test dataset
    y_test_pred = DT_model.predict(X_test)
    
    # Store the train r square score in the dictionary with the min_sample_leaf as the key
    train_scores[min_sample_leaf] = r2_score (y_train, y_train_pred)
    
    # Store the test r square score in the dictionary with the min_sample_leaf as the key
    test_scores[min_sample_leaf] = r2_score (y_test, y_test_pred)
    

# Print the train and test r square scores for each model
for min_sample_leaf in min_sample_leaf_list:
    print(f"min_sample_leaf = {min_sample_leaf}|\
    Train Score = {train_scores[min_sample_leaf]:.3f} |\
    Test Score = {test_scores[min_sample_leaf]:.3f}")
    print('_'*65)

min_sample_leaf = 50|    Train Score = 0.611 |    Test Score = 0.618
_________________________________________________________________
min_sample_leaf = 45|    Train Score = 0.629 |    Test Score = 0.632
_________________________________________________________________
min_sample_leaf = 40|    Train Score = 0.637 |    Test Score = 0.639
_________________________________________________________________
min_sample_leaf = 35|    Train Score = 0.640 |    Test Score = 0.641
_________________________________________________________________
min_sample_leaf = 30|    Train Score = 0.645 |    Test Score = 0.643
_________________________________________________________________
min_sample_leaf = 25|    Train Score = 0.653 |    Test Score = 0.641
_________________________________________________________________
min_sample_leaf = 20|    Train Score = 0.663 |    Test Score = 0.640
_________________________________________________________________
min_sample_leaf = 15|    Train Score = 0.673 |    Test 

### Creating final model

In [52]:
DT_model = DecisionTreeRegressor(min_samples_leaf=30, max_depth=7, random_state=42)

In [55]:
# Train the model
DT_model.fit(X_train, y_train)

# Make predictions on the train dataset
y_train_pred = DT_model.predict(X_train)

# Make predictions on the test dataset
y_test_pred = DT_model.predict(X_test)

# Store the train r square score in the dictionary with the min_sample_leaf as the key
train_scores = r2_score (y_train, y_train_pred)

# Store the test r square score in the dictionary with the min_sample_leaf as the key
test_scores = r2_score (y_test, y_test_pred)

In [59]:
print(f"Train Score = {round(train_scores, 3)} | Test Score = {round(test_scores, 3)}")

Train Score = 0.645 | Test Score = 0.643
