In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# Project 1: Build Decision Tree(DV-"Survived",IDV-"Age,Gender and Fare") and Prediction

In [2]:
df_train = pd.read_csv('./Dataset/train.csv')
df_test = pd.read_csv('./Dataset/test.csv')

In [3]:
train_age = np.where(df_train.Age.isnull(), df_train.Age.mean(), df_train.Age)
test_age = np.where(df_test.Age.isnull(), df_test.Age.mean(), df_test.Age)

In [4]:
label_en = preprocessing.LabelEncoder()

In [5]:
train_gender = label_en.fit_transform(df_train.Sex)
test_gender = label_en.fit_transform(df_test.Sex)

In [6]:
features = pd.DataFrame([train_age, df_train.Fare, train_gender]).T

In [7]:
model1 = tree.DecisionTreeClassifier(max_depth = 6)
model1.fit(X = features, y = df_train.Survived)

DecisionTreeClassifier(max_depth=6)

In [8]:
model1.score(X = features, y = df_train.Survived)

0.829021372328459

#### Inference
> We get a score of 82.90 % when the features are Age, Fare and Gender

### Predication

In [9]:
features_test = pd.DataFrame([test_age, df_test.Fare, test_gender]).T

In [10]:
pred = model1.predict(X = features_test)

In [11]:
pred_output = pd.DataFrame({"P_Id": df_test.PassengerId, "Survived" : pred})

In [12]:
pred_output.head()

Unnamed: 0,P_Id,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [13]:
with open('DecisionTree1.dot','w') as file:
    file = tree.export_graphviz(model1, feature_names=['AGE','FARE','GENDER'], out_file= file)

# Project 2: Build Decision Tree for Attrition Rate Analysis

- DV - "Attrition"
- IDV - Output of RF Algorithm


In [14]:
df = pd.read_csv('./Dataset/general_data.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [15]:
df['NumCompaniesWorked'] = np.where(df.NumCompaniesWorked.isnull(), df.NumCompaniesWorked.mean(), df.NumCompaniesWorked )
df['TotalWorkingYears'] = np.where(df.TotalWorkingYears.isnull(), df.TotalWorkingYears.mean(), df.TotalWorkingYears )

In [16]:
le = preprocessing.LabelEncoder()

In [17]:
df_temp = df.copy()

In [18]:
df_temp['Attrition'] = le.fit_transform(df.Attrition)
df_temp['BusinessTravel'] = le.fit_transform(df.BusinessTravel)
df_temp['Department'] = le.fit_transform(df.Department)
df_temp['EducationField'] = le.fit_transform(df.EducationField)
df_temp['Gender'] = le.fit_transform(df.Gender)
df_temp['JobRole'] = le.fit_transform(df.JobRole)
df_temp['MaritalStatus'] = le.fit_transform(df.MaritalStatus)
df_temp['Over18'] = le.fit_transform(df.Over18)

In [19]:
model2 = RandomForestClassifier(n_estimators = 2000, max_features = 2, oob_score = True)

In [20]:
attr_train_feats = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 
                'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 
                'Over18', 'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 
                'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [21]:
model2.fit(X = df_temp[attr_train_feats], y = df_temp.Attrition)

RandomForestClassifier(max_features=2, n_estimators=2000, oob_score=True)

In [22]:
model2.oob_score_

1.0

In [23]:
for train_feat, imp in zip(attr_train_feats, model2.feature_importances_):
    print(train_feat, '-->', imp)

Age --> 0.09653701804896832
BusinessTravel --> 0.028231158156348784
Department --> 0.02614486037619103
DistanceFromHome --> 0.06941763395022586
Education --> 0.04092772511509695
EducationField --> 0.041691429984991854
Gender --> 0.018092948362488318
JobLevel --> 0.038000399522613526
JobRole --> 0.055357748946717376
MaritalStatus --> 0.039766649861296514
MonthlyIncome --> 0.09365798926800609
NumCompaniesWorked --> 0.05637867257798392
Over18 --> 0.0
PercentSalaryHike --> 0.06584387009439882
StockOptionLevel --> 0.03396665889955561
TotalWorkingYears --> 0.08545333721730525
TrainingTimesLastYear --> 0.04459444683521817
YearsAtCompany --> 0.0683377831503812
YearsSinceLastPromotion --> 0.043388936323524994
YearsWithCurrManager --> 0.054210733308687496


#### Inference

- Looking at the importance score for each feature we can conclude that features like
> Age, MonthlyIncome, TotalWorkingYears, DistanceFromHome, PercentSalaryHike, YearsAtCompany seems to be most importance among other features.

In [24]:
attr_test_feat = ['Age','DistanceFromHome', 'MonthlyIncome', 'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany']

In [25]:
model3 = tree.DecisionTreeClassifier(max_depth = 12)

In [26]:
model3.fit(X = df_temp[attr_test_feat], y = df_temp.Attrition)

DecisionTreeClassifier(max_depth=12)

In [27]:
model3.score(X = df_temp[attr_test_feat], y = df_temp.Attrition)

0.9480725623582766

#### Inference
> With score of 94.80% we can say that Age, MonthlyIncome, TotalWorkingYears, DistanceFromHome, PercentSalaryHike, YearsAtCompany features play important role in attrition level in the company.

# 3. Build Decision Tree for Bank Loan Modelling

- DV - "Personal Loan"
- IDV - Output of RF Algorithm


In [28]:
df3 = pd.read_excel('./Dataset/Bank_Personal_Loan_Modelling.xlsx', sheet_name='Data')
df3.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [29]:
model4 = RandomForestClassifier(n_estimators = 2000, max_features = 2 , oob_score = True)

In [30]:
bank_train_feats =[ 'Age', 'Experience', 'Income', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account',
       'CD Account', 'Online', 'CreditCard']

In [31]:
model4.fit(X = df3[bank_train_feats], y = df3['Personal Loan'])

RandomForestClassifier(max_features=2, n_estimators=2000, oob_score=True)

In [32]:
model4.oob_score_

0.9874

In [33]:
for train_feat, imp in zip(bank_train_feats, model4.feature_importances_):
    print(train_feat, '-->', imp)

Age --> 0.04543890542368973
Experience --> 0.04488019592981263
Income --> 0.3398983275775426
Family --> 0.09856928174605668
CCAvg --> 0.18344992274428149
Education --> 0.16548867548273954
Mortgage --> 0.04449630093711763
Securities Account --> 0.00538022988611044
CD Account --> 0.054062389238512865
Online --> 0.008487388665030519
CreditCard --> 0.009848382369105818


#### Inference
> We can obsevrve that Age, CCAvg, Education, Family, CD Account, Mortage seem to be important features 

In [34]:
bank_test_feats = ['Family', 'CCAvg', 'Education', 'CD Account','Age','Mortgage']

In [35]:
model5 = tree.DecisionTreeClassifier(max_depth = 12)

In [36]:
model5.fit(X = df3[bank_test_feats], y = df3['Personal Loan'])

DecisionTreeClassifier(max_depth=12)

In [37]:
model5.score(X = df3[bank_test_feats], y = df3['Personal Loan'])

0.9866

#### Inference
> With score of 98.66% we can say that Income, CCAvg, Education, Family , CD Account features play important role in deciding whether the person will get a loan or not.