In [1]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
train = pd.read_csv('train.csv')


In [3]:
# Preview the first rows of the raw training frame before any cleaning.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def clean1(df1):
    """Return a cleaned copy of a passenger frame; the input is not modified.

    Steps:
      * drop the free-text 'Name' and 'Ticket' columns;
      * replace 'Cabin', 'Sex' and 'Embarked' with 0/1 indicator columns
        ('HasCabin', 'SexNumberic', 'EmbarkedQ', 'EmbarkedC');
      * fill every remaining missing numeric value with the overall mean of
        its column (not a per-group mean).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame containing at least the columns 'Name', 'Ticket', 'Cabin',
        'Sex' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns above removed/encoded and numeric NaNs
        imputed.

    Raises
    ------
    KeyError
        If any of the required columns is missing (for example when the
        function is applied to an already cleaned frame).
    """
    # DataFrame.drop returns a new object, so the caller's frame stays intact.
    df = df1.drop(['Name', 'Ticket'], axis=1)

    # Build the indicators before the source columns are removed.
    has_cabin = df['Cabin'].notna().astype(int)        # 1 if a cabin is recorded
    is_male = (df['Sex'] == 'male').astype(int)        # 1 if male, 0 otherwise
    embarked_q = (df['Embarked'] == 'Q').astype(int)   # 1 if embarked in Q
    embarked_c = (df['Embarked'] == 'C').astype(int)   # 1 if embarked in C
    # No indicator for 'S': with three ports two columns are enough, so 'S'
    # is the baseline (both indicators 0). A missing 'Embarked' also yields 0/0.

    df = df.drop(['Cabin', 'Sex', 'Embarked'], axis=1)
    df['HasCabin'] = has_cabin
    df['SexNumberic'] = is_male
    df['EmbarkedQ'] = embarked_q
    df['EmbarkedC'] = embarked_c

    # numeric_only=True keeps the imputation working when an extra non-numeric
    # column is present; a plain df.mean() raises on such frames in pandas 2.x.
    return df.fillna(df.mean(numeric_only=True))
    

In [5]:
#Unused code to predict the missing age values with linear regression of the other values.

def linregage(df1):
    """Predict the missing 'Age' values with a linear regression on the other columns.

    The regression is fitted on the rows whose 'Age' is present, using every
    column except 'Survived', 'PassengerId' and 'Age' as a feature. Its R^2
    on those same rows is printed. The input frame is not modified.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with an 'Age' column. The remaining feature columns must be
        numeric and free of NaN for the regression to accept them, e.g. the
        encoded columns produced before mean imputation.
        'Survived' and 'PassengerId' are removed when present, so frames
        without a 'Survived' column are accepted as well.

    Returns
    -------
    numpy.ndarray
        Predicted ages for the rows whose 'Age' is missing, in row order.
        Empty when no age is missing.
    """
    non_features = ['Survived', 'PassengerId', 'Age']
    age_known = df1['Age'].notna()

    # errors='ignore' tolerates frames that lack 'Survived' or 'PassengerId'.
    known_features = df1[age_known].drop(columns=non_features, errors='ignore')
    known_ages = df1.loc[age_known, 'Age']
    reg = LinearRegression().fit(known_features, known_ages)
    print(reg.score(known_features, known_ages))

    missing_features = df1[~age_known].drop(columns=non_features, errors='ignore')
    if missing_features.empty:
        # predict() rejects input with zero rows, so answer directly.
        return np.empty(0)
    return reg.predict(missing_features)


In [6]:
# Encode and impute the raw training frame, then drop any row that still holds a NaN.
# NOTE: this rebinds `train`, so the cell cannot be re-run on its own: clean1 drops
# 'Name' and 'Ticket', which no longer exist after the first pass (KeyError).
train = clean1(train).dropna()

In [7]:
# Target vector: the survival label.
y = train['Survived']
# Feature matrix: every cleaned column except the label and the passenger identifier.
x = train.drop(columns=['Survived', 'PassengerId'])

In [8]:
# On first inspection, having a cabin and travelling first class look highly
# correlated. This cell checks whether the number of first-class passengers and
# the number of passengers with a cabin coincide. They do not: the counts differ,
# and there are also passengers in lower classes who have a cabin.
c = x['HasCabin'].sum()  # Number of passengers that have a cabin
c2 = (x['Pclass'] == 1).sum()  # Number of passengers in first class
# Count the overlap with an explicit conjunction. Comparing Pclass to HasCabin
# directly only works because both happen to be encoded as 1 in that case.
c3 = ((x['Pclass'] == 1) & (x['HasCabin'] == 1)).sum()
print('Amount of people that have a cabin: ' + str(c))
print('Amount of people that are in first class: ' + str(c2))
print('Amount of poeple that are in first class and have a cabin: ' + str(c3))

Amount of people that have a cabin: 204
Amount of people that are in first class: 216
Amount of poeple that are in first class and have a cabin: 176


In [9]:
# Logistic regression with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)
# Random forest limited to depth 7, with a fixed seed.
rf = RandomForestClassifier(random_state=0, max_depth=7)

In [10]:
# Fit both estimators on the full training features and labels.
model.fit(x,y)  #Fitting the logistic regression model
rf.fit(x,y)  #Fitting the random forest; as the last expression, its repr is the cell output

RandomForestClassifier(max_depth=7, random_state=0)

In [11]:
# Load the test data and run it through the same cleaning as the training data.
# Missing values are filled with the test frame's own column means inside clean1.
test = clean1(pd.read_csv('test.csv')).dropna()

# The passenger identifier is not a feature; remove it before predicting.
xt = test.drop(columns=['PassengerId'])

rt = model.predict(xt)  # predictions of the logistic regression model
rfm = rf.predict(xt)    # predictions of the random forest model

In [12]:
# Repeated hold-out validation: split the training data niter times with a
# different seed each time and average the held-out accuracy of both models,
# as an estimate of the score to expect on unseen data.
niter = 50
l1 = []  # logistic regression accuracy per split
l2 = []  # random forest accuracy per split
for i in range(niter):
    # Seeding the split with the iteration index gives a different 67/33 partition each time.
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=i, test_size=0.33)

    model2 = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    l1.append(model2.score(X_test, y_test))

    # NOTE(review): this forest uses max_depth=2, whereas the forest fitted on the
    # full training data (`rf`) uses max_depth=7 -- confirm which depth is intended.
    rf2 = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)
    l2.append(rf2.score(X_test, y_test))

# Mean held-out accuracy over all splits of the training data.
print('Logistic regression test score: ' + str(np.mean(l1)))
print('Random forest test score: ' + str(np.mean(l2)))

Logistic regression test score: 0.7980338983050846
Random forest test score: 0.7823728813559321


In [22]:
# Logistic regression scored higher than the random forest in the repeated
# hold-out validation, so its predictions (`rt`) are written to the submission
# file, not the random forest predictions (`rfm`).
rdf = pd.DataFrame(test['PassengerId'])
rdf['Survived'] = rt
rdf.to_csv('test2.csv', index=False)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,HasCabin,SexNumberic,EmbarkedQ,EmbarkedC
0,892,3,34.50000,0,0,7.8292,0,1,1,0
1,893,3,47.00000,1,0,7.0000,0,0,0,0
2,894,2,62.00000,0,0,9.6875,0,1,1,0
3,895,3,27.00000,0,0,8.6625,0,1,0,0
4,896,3,22.00000,1,1,12.2875,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500,0,1,0,0
414,1306,1,39.00000,0,0,108.9000,1,0,0,1
415,1307,3,38.50000,0,0,7.2500,0,1,0,0
416,1308,3,30.27259,0,0,8.0500,0,1,0,0
