### 1. Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline

### 2. Importing the Training dataset
people_ID, Region, Designation and Name will not affect the chances of getting infected, so they are not treated as independent variables.

In [2]:
# Column positions that are dropped from both files before modelling.
# NOTE(review): per the description above these should be people_ID, Region,
# Designation and Name - confirm against the actual column order of the files.
NON_FEATURE_COLUMN_POSITIONS = [0, 1, 3, 4]

trainDataset = pd.read_excel("Train_dataset.xlsx")
# The last column is the regression target; every column before it is a candidate feature.
y_TrainDataset = trainDataset.iloc[:, -1].values
train_features = trainDataset.iloc[:, :-1]
# Non-mutating drop: building a new frame avoids in-place edits on an iloc slice
# (and the chained-assignment warning that goes with them) and keeps the cell re-runnable.
X_TrainDataset = train_features.drop(columns=train_features.columns[NON_FEATURE_COLUMN_POSITIONS])

testDataset = pd.read_excel("Test_dataset.xlsx")

# To predict for 27th March instead, first run Solution_part2.ipynb to create
# Test_Data_27.csv, then use the line below in place of the read_excel call above.
#testDataset = pd.read_csv("Test_Data_27.csv")

# Keep the IDs aside: they label the rows of the final result file.
ids = testDataset['people_ID']
X_TestDataset = testDataset.drop(columns=testDataset.columns[NON_FEATURE_COLUMN_POSITIONS])



### 3. Encoding the Dataset
One-hot encode Gender, Married, Occupation, Mode Transport, Comorbidity, Pulmonary score and cardiological pressure, then drop the first dummy column of each to avoid the dummy variable trap.

In [3]:
# Positions (within the feature frames) of the categorical columns to one-hot encode.
CATEGORICAL_COLUMN_POSITIONS = [0, 1, 3, 4, 7, 10, 11]

# Training data defines the feature layout: one dummy per category, with the
# first category of each column dropped to avoid the dummy variable trap.
X_TrainDatasetencoded = pd.get_dummies(
    X_TrainDataset,
    columns=X_TrainDataset.columns[CATEGORICAL_COLUMN_POSITIONS],
    drop_first=True,
)

# Encode the test data with every category kept, then project it onto the
# training layout. Encoding it independently with drop_first=True can drop a
# different category, or produce a different set of columns, whenever the test
# file does not contain exactly the same categories as the training file; the
# later steps address columns by position and would then mix up features.
# Categories absent from the test file become all-zero columns, and categories
# unseen during training are discarded.
X_TestDatasetencoded = pd.get_dummies(
    X_TestDataset,
    columns=X_TestDataset.columns[CATEGORICAL_COLUMN_POSITIONS],
    drop_first=False,
).reindex(columns=X_TrainDatasetencoded.columns, fill_value=0)

### 4. Fixing the missing values
For all missing values, take the mean value of that column.

In [4]:
# sklearn.preprocessing.Imputer has been removed from scikit-learn;
# sklearn.impute.SimpleImputer is its replacement (it always works column-wise).
from sklearn.impute import SimpleImputer

# Learn the column means from the training data only, and fill the gaps in both
# matrices with those same means, so the test set is preprocessed exactly like
# the data the model is trained on.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_TrainDatasetencoded = imputer.fit_transform(X_TrainDatasetencoded)
X_TestDatasetencoded = imputer.transform(X_TestDatasetencoded)



### 5. Backward Elimination
Remove all the insignificant independent variables to optimize the model.
First, prepend a column of all 1's to the encoded array to act as the intercept.
The significance level is set to 0.05.

In [5]:
# Prepend a constant column of ones to both matrices so the OLS fit has an intercept term.
train_ones = np.ones((X_TrainDatasetencoded.shape[0], 1)).astype(int)
test_ones = np.ones((X_TestDatasetencoded.shape[0], 1)).astype(int)
X_TrainDatasetencoded = np.concatenate((train_ones, X_TrainDatasetencoded), axis=1)
X_TestDatasetencoded = np.concatenate((test_ones, X_TestDatasetencoded), axis=1)

sigLevel = 0.05
X_TrainOpt = X_TrainDatasetencoded[:, :]
X_TestOpt = X_TestDatasetencoded[:, :]

# Backward elimination: fit OLS, locate the column with the largest p-value and,
# while that p-value is above the significance level, remove the column from
# both the train and the test matrix, then refit.
while True:
    regressor_OLS = sm.OLS(endog=y_TrainDataset, exog=X_TrainOpt).fit()
    pVals = regressor_OLS.pvalues
    worst_column = np.argmax(pVals)
    if not pVals[worst_column] > sigLevel:
        break
    X_TrainOpt = np.delete(X_TrainOpt, worst_column, axis=1)
    X_TestOpt = np.delete(X_TestOpt, worst_column, axis=1)
    # The value printed after the label is the index of the removed column.
    print("pval of dim removed: " + str(worst_column))
    print(str(X_TrainOpt.shape[1]) + " dimensions remaining...")

print(regressor_OLS.summary())

pval of dim removed: 30
37 dimensions remaining...
pval of dim removed: 14
36 dimensions remaining...
pval of dim removed: 15
35 dimensions remaining...
pval of dim removed: 23
34 dimensions remaining...
pval of dim removed: 25
33 dimensions remaining...
pval of dim removed: 30
32 dimensions remaining...
pval of dim removed: 31
31 dimensions remaining...
pval of dim removed: 30
30 dimensions remaining...
pval of dim removed: 25
29 dimensions remaining...
pval of dim removed: 18
28 dimensions remaining...
pval of dim removed: 18
27 dimensions remaining...
pval of dim removed: 4
26 dimensions remaining...
pval of dim removed: 22
25 dimensions remaining...
pval of dim removed: 11
24 dimensions remaining...
pval of dim removed: 5
23 dimensions remaining...
pval of dim removed: 17
22 dimensions remaining...
pval of dim removed: 8
21 dimensions remaining...
pval of dim removed: 11
20 dimensions remaining...
pval of dim removed: 16
19 dimensions remaining...
pval of dim removed: 15
18 dimensi

### 6. Splitting the Train dataset into training and validation set

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_TrainOpt,
    y_TrainDataset,
    test_size=0.2,
    random_state=0,
)

### 7. Applying Random Forest Regression to the Train dataset

In [7]:
from sklearn.ensemble import RandomForestRegressor

# 300 trees with a fixed seed, fitted on the training part of the split only.
forest_settings = {"n_estimators": 300, "random_state": 0}
regressor = RandomForestRegressor(**forest_settings)
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=300,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

### 8. Calculating Scores

In [8]:
# Score the fitted model on the rows it was trained on and on the held-out
# validation rows; the two values are printed in that order.
train_score = regressor.score(X_train, y_train)
validation_score = regressor.score(X_validation, y_validation)
print(train_score, validation_score)

0.8982767214752759 0.23302102454083173


### 9. Predicting Infection probability for Test Dataset

In [9]:
# Predict the target for every test row. X_TestOpt is the test matrix after the
# same columns were removed from it as from the training matrix during elimination.
y_TestDataset = regressor.predict(X_TestOpt)

### 10. Saving result file

In [10]:
# Pair every test person's ID with the value predicted for them.
df = pd.DataFrame(list(zip(ids, y_TestDataset)), columns =['people_ID', 'Infect_Prob'])
# Write by path so pandas opens, flushes and closes the file itself, and
# overwrite rather than append: appending left the file handle open and added a
# second header plus duplicate rows to Part1.csv every time the cell was re-run.
df.to_csv("Part1.csv", index=False, encoding='utf-8')