# Data Modelling - Application of ML Algorithms

# Using Linear Regression, Lasso Regression and Random Forest

Application of ML Algorithms - Linear Regression, Ridge Regression and Random Forests to Predict the effect of Unhealthy Behaviours and Income on Obesity and Cancer

In [14]:
#Importing libraries 
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score,GridSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.decomposition import NMF,PCA
from sklearn.metrics import accuracy_score, r2_score
import statsmodels.api as sm
from scipy import stats
from sklearn.ensemble import RandomForestRegressor

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.9f' % x)

#Custom
zscore = lambda x: (x-x.mean())/x.std()
np.random.seed(21)

# Join the data and locations for ML analysis

In [15]:
data = pd.read_csv('data/cleansed/data.csv')
locations = pd.read_csv('data/cleansed/locations.csv',index_col=0)

In [16]:
# %load join_data_locations.py
#Join Location and and data dataframes
data_join = data.join(locations[['geographiclevel','stateabbr','cityname']]).copy()
# Keeping only census level data
census_data = data_join[data_join['geographiclevel'] == 'Census Tract']
# Drop unnecessary and NOT needed columns
census_data_req = census_data.drop(['datavaluetypeid','geographiclevel','stateabbr','cityname'],axis=1).reset_index().set_index('uniqueid')

# Initial Observations:
Part 1: Obesity
Features such as "MAMMOUSE", "PAPTEST", BPMED','TEETHLOST','DENTAL'does not make contextual sense in predecting Obesity
Also, some of the effects due to Obesity are already very well known.

Part 2: Cancer
With respect to CANCER, we'll also remove some contextual irrelevant features such as "MAMMOUSE", "TEETHLOST" and "DENTAL".

Thus removing features which don't make much contextual sense and are already well known and are highly correlated 

In [17]:
# %load feature_selection.py
# Contextualy irrelevant features for Obesity
noncontext = np.array(['MAMMOUSE','PAPTEST','BPMED','TEETHLOST','DENTAL']) 
# Already known effects of Obesity
known_effects_of_obesity = np.array(['CHD','BPHIGH','ARTHRITIS','DIABETES','HIGHCHOL','PHLTH','KIDNEY','STROKE'])
# Drop above two from our dataframe
census_data_Obesity = census_data_req.drop(np.concatenate([noncontext,known_effects_of_obesity]),axis=1)
census_data_Obesity.drop(['MHLTH','COREM','COPD','COLON_SCREEN'],axis=1,inplace=True)

# Contextualy irrelevant features for Cancer
noncontext_cancer = np.array(['MAMMOUSE','TEETHLOST','DENTAL']) 
# Drop above two from our dataframe
census_data_Cancer = census_data_req.drop(np.concatenate([noncontext_cancer]),axis=1)

# Divide the data into train and test - (75% - 25%)

In [18]:
# %load divide_test_train.py
# 1. For Obesity
y = census_data_Obesity['OBESITY']
X = census_data_Obesity.drop('OBESITY',axis=1)
# Separate data into train and test
Xtrain_obesity, Xtest_obesity, ytrain_obesity, ytest_obesity = train_test_split(X,y,test_size=.25)

#2. For Cancer
y = census_data_Cancer['CANCER']
X = census_data_Cancer.drop('CANCER',axis=1)
# Separate data into train and test
Xtrain_cancer, Xtest_cancer, ytrain_cancer, ytest_cancer = train_test_split(X,y,test_size=.25)

# Lets fit all the Features into our model.

# 1. Obesity

# OLS Regression Summary WITH the Outliers

In [19]:
m0_obesity = sm.OLS(ytrain_obesity,Xtrain_obesity).fit()
print(m0_obesity.summary())

                                 OLS Regression Results                                
Dep. Variable:                OBESITY   R-squared (uncentered):                   0.988
Model:                            OLS   Adj. R-squared (uncentered):              0.988
Method:                 Least Squares   F-statistic:                          8.649e+04
Date:                Thu, 03 Dec 2020   Prob (F-statistic):                        0.00
Time:                        15:18:54   Log-Likelihood:                          27144.
No. Observations:               13628   AIC:                                 -5.426e+04
Df Residuals:                   13615   BIC:                                 -5.416e+04
Df Model:                          13                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

# OLS Regression Summary after REMOVING the Outliers

In [20]:
# %load obesity_without_outlier
abnormal_resids_obesity = m0_obesity.resid[m0_obesity.resid <= -.10]

arr_obesity = np.empty(len(Xtrain_obesity.index.values),dtype=np.bool)
for i,j in enumerate(Xtrain_obesity.index.values):
    arr_obesity[i] = True if j in abnormal_resids_obesity.index.values else False

m1_obesity = sm.OLS(ytrain_obesity[~arr_obesity],Xtrain_obesity[~arr_obesity]).fit()

In [21]:
print('AIC Change: {:.4f}'.format(m1_obesity.aic-m0_obesity.aic))
print(m1_obesity.summary())

AIC Change: -859.1384
                                 OLS Regression Results                                
Dep. Variable:                OBESITY   R-squared (uncentered):                   0.989
Model:                            OLS   Adj. R-squared (uncentered):              0.989
Method:                 Least Squares   F-statistic:                          9.412e+04
Date:                Thu, 03 Dec 2020   Prob (F-statistic):                        0.00
Time:                        15:18:54   Log-Likelihood:                          27573.
No. Observations:               13551   AIC:                                 -5.512e+04
Df Residuals:                   13538   BIC:                                 -5.502e+04
Df Model:                          13                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------------

# 2. Cancer
# OLS Regression Summary WITH the Outliers

In [22]:
m0_cancer = sm.OLS(ytrain_cancer,Xtrain_cancer).fit()
print(m0_cancer.summary())

                                 OLS Regression Results                                
Dep. Variable:                 CANCER   R-squared (uncentered):                   0.996
Model:                            OLS   Adj. R-squared (uncentered):              0.996
Method:                 Least Squares   F-statistic:                          1.263e+05
Date:                Thu, 03 Dec 2020   Prob (F-statistic):                        0.00
Time:                        15:18:55   Log-Likelihood:                          56884.
No. Observations:               13628   AIC:                                 -1.137e+05
Df Residuals:                   13601   BIC:                                 -1.135e+05
Df Model:                          27                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

# OLS Regression Summary after REMOVING the Outliers

In [23]:
# %load cancer_without_outlier.py
abnormal_resids_cancer = m0_cancer.resid[m0_cancer.resid <= -.10]

arr_cancer = np.empty(len(Xtrain_cancer.index.values),dtype=np.bool)
for i,j in enumerate(Xtrain_cancer.index.values):
    arr_cancer[i] = True if j in abnormal_resids_cancer.index.values else False

m1_cancer = sm.OLS(ytrain_cancer[~arr_cancer],Xtrain_cancer[~arr_cancer]).fit()

In [24]:
print('AIC Change: {:.4f}'.format(m1_cancer.aic-m0_cancer.aic))
print(m1_cancer.summary())

AIC Change: 0.0000
                                 OLS Regression Results                                
Dep. Variable:                 CANCER   R-squared (uncentered):                   0.996
Model:                            OLS   Adj. R-squared (uncentered):              0.996
Method:                 Least Squares   F-statistic:                          1.263e+05
Date:                Thu, 03 Dec 2020   Prob (F-statistic):                        0.00
Time:                        15:18:55   Log-Likelihood:                          56884.
No. Observations:               13628   AIC:                                 -1.137e+05
Df Residuals:                   13601   BIC:                                 -1.135e+05
Df Model:                          27                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------

# Conclusions from Outlier Detections and Removal
1. It looks like the model AIC score has decreased (Obesity: -1336.6065, Cancer: 0.0).
2. R2 has increased just the slightest bit for Obesity but No change for Cancer.
3. It may be be concluded that removing this data was a good step for atleast predicting Obesity. For Cancer, it dosen't make any difference

# Feature Fitting Done, Now Run ML
# 1. Obesity
# a. Linear Regression

In [25]:
# %load r2_score.py
## Define r2 scoring for cross_val_score scoring
def scorer(model,X,y):
    model.fit(X,y)
    ypred = model.predict(X)
    return r2_score(y,ypred)

#INIT Linear Regression
lr = LinearRegression()

In [26]:
# %load print_linear_obesity.py
lr.fit(Xtrain_obesity[~arr_obesity],ytrain_obesity[~arr_obesity])
#print results
print('Model Accuracy For Obesity')
print('Linear Regression R2: {:.4f}'.format(lr.score(Xtrain_obesity[~arr_obesity],ytrain_obesity[~arr_obesity])))
print('Cross Fold Validation')
print('Linear Regression Cross Val Score: {}'.format(cross_val_score(lr,Xtrain_obesity[~arr_obesity],ytrain_obesity[~arr_obesity],cv=5,scoring=scorer)))

Model Accuracy For Obesity
Linear Regression R2: 0.8511
Cross Fold Validation
Linear Regression Cross Val Score: [0.85700822 0.84302178 0.84750619 0.85795912 0.85116168]


# b. Ridge Regression

Finding Optimum Alpha using Scikit_learn - GridSearchCV

In [27]:
# %load find_alpha_obesity.py
alphas = np.linspace(0,1,20)
params = {'alpha':alphas}

ridge = Ridge()
grid = GridSearchCV(ridge,params,cv=5)
grid.fit(Xtrain_obesity[~arr_obesity],ytrain_obesity[~arr_obesity])
best_alpha = grid.best_params_['alpha']
best_score = grid.best_score_
print('Best Alpha: {:.4f}'.format(best_alpha))
print('Best Score: {:.4f}'.format(best_score))

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best Alpha: 0.1053
Best Score: 0.8480


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


In [28]:
# %load print_ridge_obesity.py
print('Ridge Model Tested on Unseen Data')
ridge = Ridge(alpha=best_alpha)
ridge.fit(Xtrain_obesity[~arr_obesity],ytrain_obesity[~arr_obesity])
ypred = ridge.predict(Xtest_obesity)
print('Score on holdout data: {:.4f}'.format(r2_score(ytest_obesity,ypred)))

Ridge Model Tested on Unseen Data
Score on holdout data: 0.8400


  overwrite_a=True).T


# c. Random Forest

In [29]:
# %load print_random_forest_obesity.py
randforest= RandomForestRegressor()
randforest.fit(Xtrain_obesity[~arr_obesity],ytrain_obesity[~arr_obesity])
ypred = randforest.predict(Xtrain_obesity[~arr_obesity])
print('Random Forest R^2 Score on TRAIN DATA: {:.4f}'.format(r2_score(ytrain_obesity[~arr_obesity],ypred)))



Random Forest R^2 Score on TRAIN DATA: 0.9888


In [30]:
ypred = randforest.predict(Xtest_obesity)
print('Random Forst R^2 Score on TEST DATA: {:.4f}'.format(r2_score(ytest_obesity,ypred)))

Random Forst R^2 Score on TEST DATA: 0.9343


# Final TOP Features for Obesity

In [31]:
pd.DataFrame(randforest.feature_importances_,index=Xtrain_obesity.columns,columns=['Feature Importances']).sort_values('Feature Importances',ascending=False).head()

Unnamed: 0,Feature Importances
LPA,0.61759814
CSMOKING,0.196271322
SLEEP,0.038578966
index,0.029475434
Unnamed: 0,0.024117528


# 2. Cancer
# a. Linear Regression

In [32]:
# %load print_linear_cancer.py
#For Cancer
#Fit Linear Regression to the training data with outliers withheld
lr.fit(Xtrain_cancer[~arr_cancer],ytrain_cancer[~arr_cancer])

#print results
print('Model Accuracy For Cancer')
print('Linear Regression R2: {:.4f}'.format(lr.score(Xtrain_cancer[~arr_cancer],ytrain_cancer[~arr_cancer])))
print('Cross Fold Validation')
print('Linear Regression Cross Val Score: {}'.format(cross_val_score(lr,Xtrain_cancer[~arr_cancer],ytrain_cancer[~arr_cancer],cv=5,scoring=scorer)))

Model Accuracy For Cancer
Linear Regression R2: 0.9579
Cross Fold Validation
Linear Regression Cross Val Score: [0.959196   0.95929714 0.96112516 0.95659272 0.95647891]


# b. Ridge Regression

Finding Optimum Alpha using Scikit_learn - GridSearchCV

In [33]:
# %load find_alpha_cancer.py
alphas = np.linspace(0,1,20)
params = {'alpha':alphas}

ridge = Ridge()
grid = GridSearchCV(ridge,params,cv=5)
grid.fit(Xtrain_cancer[~arr_cancer],ytrain_cancer[~arr_cancer])
best_alpha = grid.best_params_['alpha']
best_score = grid.best_score_
print('Best Alpha: {:.4f}'.format(best_alpha))
print('Best Score: {:.4f}'.format(best_score))

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best Alpha: 0.0000
Best Score: 0.9575


  overwrite_a=True).T


In [34]:
# %load print_ridge_cancer.py
print('Ridge Model Tested on Unseen Data')
ridge = Ridge(alpha=best_alpha)
ridge.fit(Xtrain_cancer[~arr_cancer],ytrain_cancer[~arr_cancer])
ypred = ridge.predict(Xtest_cancer)
print('Score on holdout data: {:.4f}'.format(r2_score(ytest_cancer,ypred)))

Ridge Model Tested on Unseen Data
Score on holdout data: 0.8852


  overwrite_a=True).T


# c. Random Forest

In [35]:
# %load print_random_forest_cancer.py
randforest= RandomForestRegressor()
randforest.fit(Xtrain_cancer[~arr_cancer],ytrain_cancer[~arr_cancer])
ypred = randforest.predict(Xtrain_cancer[~arr_cancer])
print('Random Forest R^2 Score on TRAIN DATA: {:.4f}'.format(r2_score(ytrain_cancer[~arr_cancer],ypred)))



Random Forest R^2 Score on TRAIN DATA: 0.9936


In [36]:
ypred = randforest.predict(Xtest_cancer)
print('Random Forst R^2 Score on TEST DATA: {:.4f}'.format(r2_score(ytest_cancer,ypred)))

Random Forst R^2 Score on TEST DATA: 0.9669


# Final TOP Features for Cancer

In [37]:
pd.DataFrame(randforest.feature_importances_,index=Xtrain_cancer.columns,columns=['Feature Importances']).sort_values('Feature Importances',ascending=False).head()

Unnamed: 0,Feature Importances
CHOLSCREEN,0.446848009
ARTHRITIS,0.152755191
CHD,0.121554784
HIGHCHOL,0.101248463
SLEEP,0.044033938


# --------------------------ML Analysis Ends Here-----------------------------

# CONCLUSION from the PROJECT

1. We followed every step involved in a Datascience Project Lifecycle - Data Cleaning, EDA, Vizualizations, ML Analysis and Outcomes
2. Top Unhealthy behaviours which resulted in having OBESITY are:
    LOW PHYSICAL ACTIVITY, SMOKING and LACK OF SLEEP
3. Top Features which resulted in having CANCER are:
    LACK of CHOLESTROL SCREENING, ARTHRITIS and Heart Diseases

Thus, our results validates our hypothesis. 
Unheathy behaviours have a inverse relationship with the Chronic Diseases - Obesity and CANCER

Thank You..!