In [1]:
# all necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression

In [2]:
# Load the training set from train.csv and preview its first five rows.

train = pd.read_csv('train.csv')

train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# One-hot encode the categorical columns Sex and Embarked, dropping the first
# level of each; only the listed columns are carried into train_dummies.

train_dummies = train[
    ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
].pipe(pd.get_dummies, columns=['Sex', 'Embarked'], drop_first=True)

train_dummies.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,1,0,1
1,2,1,1,38.0,1,0,0,0,0
2,3,1,3,26.0,0,0,0,0,1
3,4,1,1,35.0,1,0,0,0,1
4,5,0,3,35.0,0,0,1,0,1


In [4]:
# Training frame for the age-imputation regression: Age as the target with
# Pclass, Sex_male and SibSp as predictors, keeping only rows with no missing values.

final_age = train_dummies.loc[:, ['Age', 'Pclass', 'Sex_male', 'SibSp']].dropna()

In [5]:
# Fit a linear regression of Age on Pclass, Sex_male and SibSp, then display its
# score on the same rows it was fitted on (an in-sample figure).
age_model = LinearRegression().fit(
    final_age[['Pclass', 'Sex_male', 'SibSp']],
    final_age['Age'],
)
age_model.score(final_age[['Pclass', 'Sex_male', 'SibSp']], final_age['Age'])

0.23150198524000787

In [6]:
# Preview the first five rows of the age-regression frame.
final_age.head(n=5)

Unnamed: 0,Age,Pclass,Sex_male,SibSp
0,22.0,3,1,1
1,38.0,1,0,1
2,26.0,3,0,0
3,35.0,1,0,1
4,35.0,3,1,0


In [7]:
# Predict an age for every training row (not only those with a missing Age)
# and keep it in a helper column used by the fill step.
train_dummies['age_adj'] = age_model.predict(
    train_dummies.loc[:, ['Pclass', 'Sex_male', 'SibSp']]
)

In [8]:
# Fill missing ages with the regression estimate from age_adj.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train_dummies.Age`: that form operates on a
# Series obtained through attribute access (chained assignment), which pandas
# 2.x warns about and which leaves the frame unchanged under Copy-on-Write.
train_dummies['Age'] = train_dummies['Age'].fillna(train_dummies['age_adj'])

In [9]:
# Drop the helper column now that Age has been filled; the result is a new frame.
train_final = train_dummies.drop(columns=['age_adj'])

In [10]:
# Preview the first five rows of the cleaned training frame.
train_final.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,1,0,1
1,2,1,1,38.0,1,0,0,0,0
2,3,1,3,26.0,0,0,0,0,1
3,4,1,1,35.0,1,0,0,0,1
4,5,0,3,35.0,0,0,1,0,1


In [11]:
from itertools import combinations

# Candidate predictor columns considered by the feature-subset search.
ind_var = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Sex_male',
    'Embarked_Q',
    'Embarked_S',
]

In [12]:
from sklearn.model_selection import train_test_split

In [34]:
from collections import Counter

In [50]:
# Exhaustive feature-subset search: for every non-empty combination of the
# columns in ind_var, fit a logistic regression on an 80/20 split
# (random_state=10) and remember the subset with the highest held-out score.
# NOTE: LogisticRegression.score returns mean accuracy, so the number printed
# under the "Correlation" label below is an accuracy, not a correlation.
best_fit = 0
best_comb = None  # remains None if no subset scores above 0
# The upper bound follows the length of ind_var instead of a hard-coded 8, and
# the two loops use distinct variable names (the original reused `i` for both).
for subset_size in range(1, len(ind_var) + 1):
    for feature_subset in combinations(ind_var, subset_size):
        x_train, x_test, y_train, y_test = train_test_split(
            train_final[list(feature_subset)], train_final.Survived,
            test_size=0.2, random_state=10)
        model = LogisticRegression(solver='lbfgs')
        model.fit(x_train, y_train)
        fit = model.score(x_test, y_test)
        # Strict '>' keeps the first subset encountered when scores tie.
        if fit > best_fit:
            best_fit = fit
            best_comb = feature_subset
print('Correlation: {}, Combo: {}'.format(best_fit,best_comb))

Correlation: 0.8324022346368715, Combo: ('Pclass', 'Age', 'Parch', 'Sex_male', 'Embarked_Q')


In [14]:
# Load the test set from test.csv and preview its first five rows.
test = pd.read_csv('test.csv')

test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
# One-hot encode Sex and Embarked for the test set with the same settings used
# for the training set (first level of each dropped).
test_dummies = test[
    ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
].pipe(pd.get_dummies, columns=['Sex', 'Embarked'], drop_first=True)

test_dummies.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,1,1,0
1,893,3,47.0,1,0,0,0,1
2,894,2,62.0,0,0,1,1,0
3,895,3,27.0,0,0,1,0,1
4,896,3,22.0,1,1,0,0,1


In [16]:
# Predict an age for every test row with the regression fitted on the training
# data, and keep it in a helper column used by the fill step.
test_dummies['age_adj'] = age_model.predict(
    test_dummies.loc[:, ['Pclass', 'Sex_male', 'SibSp']]
)

In [17]:
# Fill missing test-set ages with the regression estimate from age_adj.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `test_dummies.Age`: that form operates on a
# Series obtained through attribute access (chained assignment), which pandas
# 2.x warns about and which leaves the frame unchanged under Copy-on-Write.
test_dummies['Age'] = test_dummies['Age'].fillna(test_dummies['age_adj'])

In [18]:
# Drop the helper column now that Age has been filled; the result is a new frame.
test_final = test_dummies.drop(columns=['age_adj'])

In [19]:
# Preview the first five rows of the cleaned test frame.
test_final.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,1,1,0
1,893,3,47.0,1,0,0,0,1
2,894,2,62.0,0,0,1,1,0
3,895,3,27.0,0,0,1,0,1
4,896,3,22.0,1,1,0,0,1


In [20]:
# Refit the classifier on the full training frame using the five-column subset
# reported by the feature-subset search.
model = LogisticRegression(solver='lbfgs')
model.fit(
    train_final[['Pclass', 'Age', 'Parch', 'Sex_male', 'Embarked_Q']],
    train_final['Survived'],
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Predict survival for the test set from the same five columns the model was fitted on.
survived = model.predict(
    test_final.loc[:, ['Pclass', 'Age', 'Parch', 'Sex_male', 'Embarked_Q']]
)

In [22]:
# Attach the predictions to the test frame as a Survived column.
test_final = test_final.assign(Survived=survived)

In [23]:
# Preview the test frame with the new Survived column.
test_final.head(n=5)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S,Survived
0,892,3,34.5,0,0,1,1,0,0
1,893,3,47.0,1,0,0,0,1,0
2,894,2,62.0,0,0,1,1,0,0
3,895,3,27.0,0,0,1,0,1,0
4,896,3,22.0,1,1,0,0,1,1


In [26]:
# Keep only the two columns needed for the output file.
final = test_final.loc[:, ['PassengerId', 'Survived']]

In [30]:
# Index the output frame by PassengerId. It is rebuilt from test_final instead of
# calling set_index(..., inplace=True) on the existing `final`, so this cell can be
# re-run: the in-place form raises KeyError on a second run because PassengerId
# is no longer a column by then.
final = test_final[['PassengerId', 'Survived']].set_index('PassengerId')

In [32]:
# Write the predictions to titanic.csv; the index is written as the first column.
final.to_csv('titanic.csv', index=True)

In [41]:
# Scratch: create an empty Counter. Not referenced by any other visible cell
# apart from the two that follow.
best = Counter()

In [45]:
# Scratch: add one to the count stored under 'a'. Each execution of this cell
# increments it again, so the displayed total depends on how often it was run.
best.update(['a'])

In [46]:
# Display the current contents of the scratch counter.
best

Counter({'a': 2})