In [None]:
import numpy as np
import pandas as pd

Remember to load the Titanic data CSV file into this session before running the cells below. It can be found at https://drive.google.com/file/d/1T3E1M6g0CIqKdeJg3Ze77hSf9I_FPBKL/view?usp=sharing


In [None]:
# Read the Titanic training CSV and keep only fully populated rows
# (any row that contains a missing value is discarded).
X = pd.read_csv('sample_data/titanic_train_data.csv').dropna()

# Labels: the survival indicator column.
y = X['Survived']

# Features: every remaining column once the label column has been removed.
X = X.drop(columns=['Survived'])

In [None]:
# Preview the first few rows of the feature frame to sanity-check the load
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [None]:
# Get the shape of the data frame as (rows, columns). The recorded output shows 183 rows and 11 columns:
# what remains after the rows with missing data and the 'Survived' column were removed above.
np.shape(X)

(183, 11)

In [None]:
# Get the column labels of the feature frame (the 'Survived' label column has already been dropped)
X.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every downstream accuracy figure, reproducible across kernel restarts.
X_train, X_test , y_train , y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# Shape of the held-out test split as (rows, columns); train_size=0.8 leaves the remaining 20% of rows here
np.shape(X_test)

(37, 11)

In [None]:
# Low-cardinality categorical columns: object dtype with fewer than 10 distinct
# values in the training split (so one-hot encoding stays small).
categorical_cols = [
    col
    for col, kind in X_train.dtypes.items()
    if kind == "object" and X_train[col].nunique() < 10
]

# Numerical columns: anything stored as int64 or float64.
numerical_cols = [
    col
    for col, kind in X_train.dtypes.items()
    if kind in ('int64', 'float64')
]

# Columns carried forward to modelling: categorical first, then numerical.
keep_cols = [*categorical_cols, *numerical_cols]

# Restrict both the train and the test split to the retained columns.
X_train, X_test = X_train[keep_cols], X_test[keep_cols]

In [None]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_absolute_error , accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [None]:
# Preprocessing: impute numerical columns; impute then one-hot encode categorical columns.

# Numerical columns: missing entries are replaced with a constant
# (no fill_value is passed, so the library default applies).
process_numerical = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent value, then
# one-hot encode. Categories unseen during fit are ignored at transform time.
process_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('One Hot Encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle both branches. Each transformer entry is a tuple of
# (name, transformation to apply, columns the transformation is applied to).
preprocessor = ColumnTransformer([
    ('numerical', process_numerical, numerical_cols),
    ('categorical', process_categorical, categorical_cols),
])

In [None]:
# Sweep a list of random states and report the one with the highest accuracy.
# The same pattern works for any other hyperparameter: swap the argument that
# is forwarded to the model and the list of candidate values.
#
# Candidates are scored with cross-validation on the training split only.
# Scoring them on X_test would leak the test data into model selection and make
# the final test accuracy optimistically biased.
def get_rs_acc(rstate, cv=5):
    """Return the mean cross-validated accuracy of a random forest seeded with ``rstate``.

    Parameters
    ----------
    rstate : int
        Value passed to ``RandomForestClassifier(random_state=...)``.
    cv : int, default 5
        Number of cross-validation folds drawn from the training split.

    Returns
    -------
    float
        Accuracy averaged over the ``cv`` folds of ``X_train`` / ``y_train``.
    """
    model = RandomForestClassifier(random_state=rstate)
    final = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('model', model),
    ])
    # cross_val_score clones the pipeline for every fold, so the shared
    # preprocessor object is left unfitted/unchanged by this call.
    fold_scores = cross_val_score(
        final, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1
    )
    return float(fold_scores.mean())

rstates = list(range(100))   # Candidate random states: values 0 to 99

scores = {rstate: get_rs_acc(rstate) for rstate in rstates}
print(scores)
# max() keeps the first key on ties, i.e. the smallest best-scoring seed.
best_rstate = max(scores, key=scores.get)
print("Best random state:" ,best_rstate)

{0: 0.7567567567567568, 1: 0.7027027027027027, 2: 0.7567567567567568, 3: 0.7567567567567568, 4: 0.7297297297297297, 5: 0.7297297297297297, 6: 0.7567567567567568, 7: 0.7297297297297297, 8: 0.7297297297297297, 9: 0.7297297297297297, 10: 0.7567567567567568, 11: 0.7567567567567568, 12: 0.7837837837837838, 13: 0.7297297297297297, 14: 0.7297297297297297, 15: 0.7567567567567568, 16: 0.7567567567567568, 17: 0.7567567567567568, 18: 0.7567567567567568, 19: 0.7027027027027027, 20: 0.7837837837837838, 21: 0.7567567567567568, 22: 0.7297297297297297, 23: 0.7837837837837838, 24: 0.7567567567567568, 25: 0.7837837837837838, 26: 0.7297297297297297, 27: 0.7567567567567568, 28: 0.7297297297297297, 29: 0.7567567567567568, 30: 0.7567567567567568, 31: 0.7837837837837838, 32: 0.7837837837837838, 33: 0.7297297297297297, 34: 0.7567567567567568, 35: 0.7567567567567568, 36: 0.7567567567567568, 37: 0.7567567567567568, 38: 0.7297297297297297, 39: 0.7297297297297297, 40: 0.7567567567567568, 41: 0.7837837837837838, 4

In [None]:
def get_estimator_acc(estimators, cv=5):
    """Return the mean cross-validated accuracy of a random forest with ``estimators`` trees.

    Candidates are scored on folds of the training split only, so the held-out
    test split is not used for model selection and stays an unbiased final check.

    Parameters
    ----------
    estimators : int
        Value passed to ``RandomForestClassifier(n_estimators=...)``.
    cv : int, default 5
        Number of cross-validation folds drawn from the training split.

    Returns
    -------
    float
        Accuracy averaged over the ``cv`` folds of ``X_train`` / ``y_train``.
    """
    # The seed is pinned so that differences between candidates come from the
    # number of trees rather than from random variation.
    model = RandomForestClassifier(n_estimators=estimators, random_state=11)
    final = Pipeline(steps=[
        ('preprocessing', preprocessor),
        ('model', model),
    ])
    # Folds are evaluated in parallel: the larger forests in this sweep are
    # slow, and each candidate is now fitted cv times instead of once.
    fold_scores = cross_val_score(
        final, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1
    )
    return float(fold_scores.mean())

estimators = list(range(50, 5000, 50))   # Candidate numbers of trees: 50, 100, ..., 4950

scores = {est: get_estimator_acc(est) for est in estimators}
print(scores)
# max() keeps the first key on ties, i.e. the smallest best-scoring forest.
best_est = max(scores, key=scores.get)
print("Best no. of estimators:" ,best_est)

{50: 0.7567567567567568, 100: 0.7567567567567568, 150: 0.7567567567567568, 200: 0.7567567567567568, 250: 0.7567567567567568, 300: 0.7567567567567568, 350: 0.7567567567567568, 400: 0.7567567567567568, 450: 0.7567567567567568, 500: 0.7567567567567568, 550: 0.7567567567567568, 600: 0.7567567567567568, 650: 0.7567567567567568, 700: 0.7567567567567568, 750: 0.7567567567567568, 800: 0.7567567567567568, 850: 0.7567567567567568, 900: 0.7567567567567568, 950: 0.7567567567567568, 1000: 0.7567567567567568, 1050: 0.7567567567567568, 1100: 0.7567567567567568, 1150: 0.7567567567567568, 1200: 0.7567567567567568, 1250: 0.7567567567567568, 1300: 0.7567567567567568, 1350: 0.7567567567567568, 1400: 0.7567567567567568, 1450: 0.7567567567567568, 1500: 0.7567567567567568, 1550: 0.7567567567567568, 1600: 0.7567567567567568, 1650: 0.7567567567567568, 1700: 0.7567567567567568, 1750: 0.7567567567567568, 1800: 0.7567567567567568, 1850: 0.7567567567567568, 1900: 0.7567567567567568, 1950: 0.7567567567567568, 2000:

In [None]:
# Final model: a random forest configured with the selected seed and tree count,
# chained behind the shared preprocessing step and fitted on the training split.
model = RandomForestClassifier(random_state=best_rstate, n_estimators=best_est)
final = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model),
]).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split and show its predictions.
pred = final.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'Accuracy is: {acc}')
print(pred)

Accuracy is: 0.7837837837837838
[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 1 1 1 0 0 0 1 1 0]
