In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Loading the dataset

In [38]:
df = pd.read_csv('train.csv')

# Some preprocessing steps

In [39]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [41]:
df_test = pd.read_csv('test.csv')

In [42]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [43]:
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [44]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [45]:
df['Age'].isna().sum()

0

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.5+ KB


# LabelEncoder for converting text to numerical

In [47]:
from sklearn import preprocessing

In [48]:
label_encoder = preprocessing.LabelEncoder()

In [49]:
new_age_var = label_encoder.fit_transform(df['Sex'])

In [50]:
df['Sex'] = new_age_var


In [51]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [52]:
new_Embarked_var = label_encoder.fit_transform( df['Embarked'] )

In [53]:
df['Embarked'] = new_Embarked_var

In [54]:
predictors = pd.DataFrame([df['Age'],df['Embarked'],df['Fare'],df['Pclass'],df['Sex']]).T

In [55]:
predictors.head()

Unnamed: 0,Age,Embarked,Fare,Pclass,Sex
0,22.0,2.0,7.25,3.0,1.0
1,38.0,0.0,71.2833,1.0,0.0
2,26.0,2.0,7.925,3.0,0.0
3,35.0,2.0,53.1,1.0,0.0
4,35.0,2.0,8.05,3.0,1.0


# Decision Tree model
* 1Dv and many IDV

In [56]:
#importing the tree
from sklearn import  tree

In [57]:
#initializing the DT
tree_model = tree.DecisionTreeClassifier()

In [58]:
tree_model.fit(X=predictors,y=df['Survived'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Creating DT graph for visualizing

In [60]:
#It will write all Dtree details in a file
# which can be used to visualizing
with open('Dtree1.dot','w') as f:
  f = tree.export_graphviz(tree_model,feature_names=['Sex','Pclass','Fare','Age','Embarked'],out_file=f);

# Model Accuracy

In [62]:
tree_model.score(X=predictors,y=df['Survived'])

0.9775028121484814

# RandomForest Classifier

* this is used to get the most efficient columns that required for building DT

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [68]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [64]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [67]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [75]:
features = [ 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare',  'Embarked']

In [76]:
rf_model.fit(X=df[features],y=df['Survived'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [77]:
print("OOB SCORE ")
print(rf_model.oob_score_)

OOB SCORE 
0.8031496062992126


In [80]:
print("Features According to the percentange\n")
for feature,imp in zip (features,rf_model.feature_importances_):
    print(feature,imp)

Features According to the percentange

Pclass 0.08360105759802879
Sex 0.2614791468132952
Age 0.25711546546947456
SibSp 0.05006178607559917
Parch 0.0394126158361081
Fare 0.27376318173012193
Embarked 0.03456674647737242


# IMP - Features

* MAX the score more the imp is variable

* so From Above Table Sex,Age and Fare is the important among all

* Fare is the most imp Variable
