# Project 5 - Air Toxicity and Chronic Respiratory Diseases - Kaggle Competition

## Problem Statement

We will be developing a model that can accurately predict the incidence of chronic respiratory diseases based on air toxicity parameters. We will be submitting our results to the Air Toxicity and Chronic Respiratory Disease Kaggle competition. ([Linked here](https://www.kaggle.com/competitions/air-toxicity-and-chronic-respiratory-diseases-us/data))
We believe that this analysis will be useful to epidemiologists in determining environmental settings that trigger respiratory diseases and estimating resources necessary to treat patients suffering from respiratory diseases.
We are planning on fitting various regression models on our data to predict disease incidence rates.


### Contents:
- [Train Data](#Train-Data)
- [Test Data](#Test-Data)
- [Imputation](#Imputation)
- [Modeling](#Modeling)
- [Conclusions and Recommendations](#Conclusions-and-Recommendations)

### Import libraries

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor


## Train Data

In [2]:
# Load the competition training set from the directory above the notebook.
train = pd.read_csv(filepath_or_buffer='../train_complete.csv')
train.head(n=5)

Unnamed: 0,ID,State_Name,Year,Age,Incidence,Arithmetic_Mean_mean_Lead_(TSP)_STP,Arithmetic_Mean_min_Lead_(TSP)_STP,Arithmetic_Mean_max_Lead_(TSP)_STP,Arithmetic_Mean_mean_Lead_(TSP)_STP_l2,Arithmetic_Mean_min_Lead_(TSP)_STP_l2,...,X1st_Max_Value_max_Benzene_l8,X1st_Max_Value_max_Ethylbenzene_l2,X1st_Max_Value_max_Ethylbenzene_l5,X1st_Max_Value_max_Ethylbenzene_l8,X1st_Max_Value_max_Toluene_l2,X1st_Max_Value_max_Toluene_l5,X1st_Max_Value_max_Toluene_l8,X1st_Max_Value_max_o-Xylene_l2,X1st_Max_Value_max_o-Xylene_l5,X1st_Max_Value_max_o-Xylene_l8
0,f8312a4,Alabama,1990,65-69,4685.284313,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
1,3effa36,Alabama,1990,70-74,4827.052043,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
2,1e8044b,Alabama,1990,75-79,4377.956914,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
3,d875d65,Alabama,1990,80-84,3822.732993,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,
4,46e6695,Alabama,1990,85-89,3470.199503,0.664543,0.0,7.96,1.071302,0.0,...,,,,,,,,,,


In [3]:
# (rows, columns) of the training frame as loaded, before any columns are dropped
train.shape

(4500, 200)

In [4]:
# Drop every column with more than 20% missing values, i.e. keep only the
# columns whose non-null count reaches 80% of the row count.
train = train.dropna(axis='columns', thresh=train.shape[0] * 0.8)

In [5]:
# Every remaining train column except the target 'Incidence'; used later to
# select the same columns from the test set.
column_keep = train.columns.drop('Incidence')

In [6]:
# Dtypes and non-null counts of the columns that survived the missing-value filter
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 50 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         4500 non-null   object 
 1   State_Name                                 4500 non-null   object 
 2   Year                                       4500 non-null   int64  
 3   Age                                        4500 non-null   object 
 4   Incidence                                  4500 non-null   float64
 5   Arithmetic_Mean_mean_Arsenic_PM2.5_LC      3960 non-null   float64
 6   Arithmetic_Mean_mean_Chromium_PM2.5_LC     3960 non-null   float64
 7   Arithmetic_Mean_mean_Lead_PM2.5_LC         3960 non-null   float64
 8   Arithmetic_Mean_mean_Manganese_PM2.5_LC    3960 non-null   float64
 9   Arithmetic_Mean_mean_Nickel_PM2.5_LC       3960 non-null   float64
 10  Arithmetic_Mean_min_Arse

In [7]:
# One-hot encode the 'Age' bracket; drop_first leaves out the first category's
# indicator column.
train_dummy = pd.get_dummies(data=train, columns=['Age'], drop_first=True)
train_dummy.head(n=5)

Unnamed: 0,ID,State_Name,Year,Incidence,Arithmetic_Mean_mean_Arsenic_PM2.5_LC,Arithmetic_Mean_mean_Chromium_PM2.5_LC,Arithmetic_Mean_mean_Lead_PM2.5_LC,Arithmetic_Mean_mean_Manganese_PM2.5_LC,Arithmetic_Mean_mean_Nickel_PM2.5_LC,Arithmetic_Mean_min_Arsenic_PM2.5_LC,...,Arithmetic_Mean_min_Nitric_oxide_(NO)_l2,Arithmetic_Mean_max_Nitric_oxide_(NO)_l2,X1st_Max_Value_mean_Nitric_oxide_(NO)_l2,X1st_Max_Value_min_Nitric_oxide_(NO)_l2,X1st_Max_Value_max_Nitric_oxide_(NO)_l2,Age_70-74,Age_75-79,Age_80-84,Age_85-89,Age_90-94
0,f8312a4,Alabama,1990,4685.284313,,,,,,,...,,,,,,False,False,False,False,False
1,3effa36,Alabama,1990,4827.052043,,,,,,,...,,,,,,True,False,False,False,False
2,1e8044b,Alabama,1990,4377.956914,,,,,,,...,,,,,,False,True,False,False,False
3,d875d65,Alabama,1990,3822.732993,,,,,,,...,,,,,,False,False,True,False,False
4,46e6695,Alabama,1990,3470.199503,,,,,,,...,,,,,,False,False,False,True,False


## Test Data

In [8]:
# Load the competition test set from the directory above the notebook.
test = pd.read_csv(filepath_or_buffer='../test_complete.csv')
test.head(n=5)

Unnamed: 0,ID,State_Name,Year,Age,Arithmetic_Mean_mean_Lead_(TSP)_STP,Arithmetic_Mean_min_Lead_(TSP)_STP,Arithmetic_Mean_max_Lead_(TSP)_STP,Arithmetic_Mean_mean_Lead_(TSP)_STP_l2,Arithmetic_Mean_min_Lead_(TSP)_STP_l2,Arithmetic_Mean_max_Lead_(TSP)_STP_l2,...,X1st_Max_Value_max_Benzene_l8,X1st_Max_Value_max_Ethylbenzene_l2,X1st_Max_Value_max_Ethylbenzene_l5,X1st_Max_Value_max_Ethylbenzene_l8,X1st_Max_Value_max_Toluene_l2,X1st_Max_Value_max_Toluene_l5,X1st_Max_Value_max_Toluene_l8,X1st_Max_Value_max_o-Xylene_l2,X1st_Max_Value_max_o-Xylene_l5,X1st_Max_Value_max_o-Xylene_l8
0,6c06615,Delaware,1990,65-69,,,,,,,...,,,,,,,,,,
1,e0cf76e,Delaware,1990,70-74,,,,,,,...,,,,,,,,,,
2,e9f95f0,Delaware,1990,75-79,,,,,,,...,,,,,,,,,,
3,5dffe44,Delaware,1990,80-84,,,,,,,...,,,,,,,,,,
4,9f9ea9b,Delaware,1990,85-89,,,,,,,...,,,,,,,,,,


In [9]:
# (rows, columns) of the test frame as loaded
test.shape

(4680, 199)

In [10]:
# Restrict the test set to the columns retained for training (no target column).
test = test.loc[:, column_keep]

In [11]:
# Dtypes and non-null counts of the test columns after matching them to train
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4680 entries, 0 to 4679
Data columns (total 49 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         4680 non-null   object 
 1   State_Name                                 4680 non-null   object 
 2   Year                                       4680 non-null   int64  
 3   Age                                        4680 non-null   object 
 4   Arithmetic_Mean_mean_Arsenic_PM2.5_LC      3852 non-null   float64
 5   Arithmetic_Mean_mean_Chromium_PM2.5_LC     3852 non-null   float64
 6   Arithmetic_Mean_mean_Lead_PM2.5_LC         3852 non-null   float64
 7   Arithmetic_Mean_mean_Manganese_PM2.5_LC    3852 non-null   float64
 8   Arithmetic_Mean_mean_Nickel_PM2.5_LC       3852 non-null   float64
 9   Arithmetic_Mean_min_Arsenic_PM2.5_LC       3852 non-null   float64
 10  Arithmetic_Mean_min_Chro

In [12]:
# Same 'Age' one-hot encoding as applied to the training frame.
# NOTE(review): encoding train and test separately only lines up when both
# contain the same set of Age categories — confirm.
test_dummy = pd.get_dummies(data=test, columns=['Age'], drop_first=True)

In [13]:
# Drop the identifier and state-name columns so only model features remain.
test_numeric = test_dummy.drop(['ID', 'State_Name'], axis='columns')

In [14]:
# (rows, feature columns) of the frame that will be passed to predict()
test_numeric.shape

(4680, 51)

## Imputation

### Imputing data using Iterative Imputer

In [15]:
# Create the target vector y and the feature matrix X

y = train_dummy['Incidence']
X = train_dummy.drop(['Incidence', 'ID', 'State_Name'], axis='columns')

In [16]:
# Hold out 25% of the rows (the train_test_split default, spelled out here)
# using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

## Modeling

In [17]:
# Use this function to compute and collect the evaluation results of each model

In [18]:
def evaluation(X_test, y_test, preds, model):
    """Print and return regression metrics for one set of predictions.

    Parameters
    ----------
    X_test : unused
        Accepted so existing call sites keep working; not read here.
    y_test : array-like
        True target values.
    preds : array-like
        Predicted target values, in the same order as ``y_test``.
    model : unused
        Accepted so existing call sites keep working; not read here.

    Returns
    -------
    dict
        The metrics under the keys 'R2', 'MSE', 'RMSE' and 'MAE'.
    """
    MSE = mean_squared_error(y_test, preds)
    metrics = {
        'R2': r2_score(y_test, preds),
        'MSE': MSE,
        # Derived from the MSE above instead of scoring the predictions twice.
        'RMSE': np.sqrt(MSE),
        'MAE': mean_absolute_error(y_test, preds),
    }

    for name, value in metrics.items():
        print(f'{name}: {value}')

    return metrics

In [19]:
# Collects one metrics dict per evaluated model.
scores = list()

### LinearRegression

In [20]:
# Baseline pipeline: an IterativeImputer driven by LinearRegression fills the
# missing values, then a LinearRegression is fit on the imputed features.
lr = Pipeline(steps=[
    ("it_imp", IterativeImputer(estimator=LinearRegression(), max_iter=100)),
    ("lr", LinearRegression()),
])

In [21]:
# Fit the imputer and the regressor on the training split only
lr.fit(X_train,y_train)

In [22]:
# Pipeline score on the training split, then on the held-out split
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.7691887545836131, 0.75440765345233)

In [23]:
# Predictions for the held-out split, reused by the metric cells that follow
lr_preds = lr.predict(X_test)

In [24]:
# Mean squared error of the linear model on the held-out split
MSE_lr = mean_squared_error(y_test,lr_preds)
MSE_lr

580214.0220337312

In [25]:
# Root of the held-out MSE already stored in MSE_lr
RMSE_lr = np.sqrt(MSE_lr)
RMSE_lr

761.7178099754076

In [26]:
# Mean absolute error of the linear model on the held-out split
MAE_lr = mean_absolute_error(y_test,lr_preds)
MAE_lr

611.6642498916053

In [27]:
# R² of the linear model on the held-out split
r2_score(y_test,lr_preds)

0.75440765345233

In [28]:
# Record the linear-regression metrics; evaluation() accepts X_test and the
# model but only reads y_test and the predictions.
scores.append(evaluation(X_test,y_test,lr_preds,lr))

R2: 0.75440765345233
MSE: 580214.0220337312
RMSE: 761.7178099754076
MAE: 611.6642498916053


In [None]:
# Predict Incidence for the competition test set with the fitted linear pipeline
test_preds_lr = lr.predict(test_numeric)

In [None]:
# Build the submission frame (ID + predicted Incidence). The predictions are
# computed inside this cell instead of being read from `test_preds_lr`,
# because the cell rebinds that name to a DataFrame: running it a second
# time would otherwise pass the DataFrame back in as the 'Incidence' column.
test_preds_lr = pd.DataFrame({
    'ID': test['ID'],
    'Incidence': lr.predict(test_numeric),
})

test_preds_lr.head()

In [None]:
#test_preds_lr.to_csv('../lr_model_3.csv', index=False)

### RandomForestRegressor

In [29]:
# Random forest on iteratively imputed, standardized features.
# random_state is fixed so that refitting the pipeline reproduces the same
# forest, and therefore the same scores and submission values.
rf = Pipeline([
    ("it_imp", IterativeImputer(max_iter=100)),
    ("ss", StandardScaler()),
    ("rf", RandomForestRegressor(random_state=42))]
)

In [30]:
# Fit imputer, scaler and forest on the training split only
rf.fit(X_train, y_train)

In [31]:
# Pipeline score on the training split, then on the held-out split
rf.score(X_train, y_train), rf.score(X_test,y_test)

(0.9966791883752673, 0.9760967077312056)

In [32]:
# Predictions for the held-out split, reused by the metric cells that follow
rf_preds = rf.predict(X_test)

In [33]:
# Mean squared error of the random forest on the held-out split
MSE_rf = mean_squared_error(y_test,rf_preds)
MSE_rf

56471.73269885659

In [34]:
# Root of the held-out MSE stored in MSE_rf
RMSE_rf = np.sqrt(MSE_rf)
RMSE_rf

237.63781832624323

In [35]:
# Mean absolute error of the random forest on the held-out split
MAE_rf = mean_absolute_error(y_test,rf_preds)
MAE_rf

167.15143165795064

In [36]:
# Record the random-forest metrics; evaluation() accepts X_test and the
# model but only reads y_test and the predictions.
scores.append(evaluation(X_test,y_test,rf_preds,rf))

R2: 0.9760967077312056
MSE: 56471.73269885659
RMSE: 237.63781832624323
MAE: 167.15143165795064


#### Predict test data

In [24]:
# Shape of the held-out split, for comparison with the competition test set
X_test.shape

(1125, 51)

In [25]:
# The competition test set should have the same number of feature columns as X_test
test_numeric.shape

(4680, 51)

In [26]:
# Predict Incidence for the competition test set with the fitted forest pipeline
test_preds_rf = rf.predict(test_numeric)

In [27]:
# Build the submission frame (ID + predicted Incidence). The predictions are
# computed inside this cell instead of being read from `test_preds_rf`,
# because the cell rebinds that name to a DataFrame: running it a second
# time would otherwise pass the DataFrame back in as the 'Incidence' column.
test_preds_rf = pd.DataFrame({
    'ID': test['ID'],
    'Incidence': rf.predict(test_numeric),
})

test_preds_rf.head()

Unnamed: 0,ID,Incidence
0,6c06615,4692.130641
1,e0cf76e,4823.983741
2,e9f95f0,4462.167361
3,5dffe44,3811.22911
4,9f9ea9b,3463.300962


In [28]:
#test_preds_rf.to_csv('../rf_model_4.csv', index=False)

### AdaBoost Regressor

In [100]:
# AdaBoost over decision-tree base learners on iteratively imputed features;
# this pipeline has no scaling step.
abr = Pipeline(steps=[
    ("it_imp", IterativeImputer(max_iter=100)),
    ("abr", AdaBoostRegressor(estimator=DecisionTreeRegressor(),
                              random_state=42)),
])

In [139]:
# Search grid for the 'abr' pipeline step. Only 100 estimators is searched;
# other values previously listed here: 150, 200, 300, 400.
abr_pipe_params = dict(abr__n_estimators=[100])

In [140]:
# 5-fold grid search over the AdaBoost pipeline (cv=5 is the GridSearchCV
# default, written out to match the ExtraTrees search).
abr_gs = GridSearchCV(estimator=abr, param_grid=abr_pipe_params, cv=5)

In [141]:
# Run the grid search on the training split only
abr_gs.fit(X_train,y_train)

In [142]:
# Score of the grid search's selected model on the training split, then on the held-out split
abr_gs.score(X_train,y_train), abr_gs.score(X_test,y_test)

(0.9996045870239607, 0.9757852078682748)

In [143]:
# Parameter combination selected by the grid search
abr_gs.best_params_

{'abr__n_estimators': 100}

In [144]:
# Predict Incidence for the competition test set with the fitted grid search
test_preds_abr = abr_gs.predict(test_numeric)

In [145]:
# Build the submission frame (ID + predicted Incidence). The predictions are
# computed inside this cell instead of being read from `test_preds_abr`,
# because the cell rebinds that name to a DataFrame: running it a second
# time would otherwise pass the DataFrame back in as the 'Incidence' column.
test_preds_abr = pd.DataFrame({
    'ID': test['ID'],
    'Incidence': abr_gs.predict(test_numeric),
})

test_preds_abr.head()

Unnamed: 0,ID,Incidence
0,6c06615,4694.653998
1,e0cf76e,4810.17461
2,e9f95f0,4487.052247
3,5dffe44,3828.460268
4,9f9ea9b,3483.560462


In [147]:
# Write the AdaBoost submission (ID, Incidence) one directory up, without the index column
test_preds_abr.to_csv('../abr_model.csv', index=False)

### ExtraTreesRegressor

In [120]:
# Extra-trees ensemble on iteratively imputed, standardized features.
# random_state is fixed so that refitting reproduces the same trees and scores.
# The step keeps its original name "etc" because the parameter grid addresses
# it through the 'etc__' prefix.
etr = Pipeline([
    ("it_imp", IterativeImputer(max_iter=100)),
    ("ss", StandardScaler()),
    ("etc", ExtraTreesRegressor(random_state=42))]
)

In [134]:
# Search grid for the 'etc' pipeline step. A single combination is searched;
# other values previously listed here: n_estimators 200, 300, 400 and
# max_depth 2, 3, 4.
etr_pipe_params = dict(
    etc__n_estimators=[100],
    etc__max_depth=[None],
)

In [135]:
# 5-fold grid search over the extra-trees pipeline
etr_gs = GridSearchCV(estimator=etr, param_grid=etr_pipe_params, cv=5)

In [136]:
# Run the grid search on the training split only
etr_gs.fit(X_train,y_train)

In [137]:
# Score of the grid search's selected model on the training split, then on the held-out split
etr_gs.score(X_train,y_train), etr_gs.score(X_test,y_test)

(0.9998924230694293, 0.9765980401108488)

In [138]:
# Parameter combination selected by the grid search
etr_gs.best_params_

{'etc__max_depth': None, 'etc__n_estimators': 100}