In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np # linear algebra
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
training = pd.read_csv('train.csv')
test =  pd.read_csv('test.csv')

In [3]:
training['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.NaN
all_data = pd.concat([training,test])

%matplotlib inline
all_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'train_test'],
      dtype='object')

In [4]:
print(pd.pivot_table(training, index = 'Survived', columns = 'Pclass', values = 'Ticket' ,aggfunc ='count'))
print()
print(pd.pivot_table(training, index = 'Survived', columns = 'Sex', values = 'Ticket' ,aggfunc ='count'))
print()
print(pd.pivot_table(training, index = 'Survived', columns = 'Embarked', values = 'Ticket' ,aggfunc ='count'))

Pclass      1   2    3
Survived              
0          80  97  372
1         136  87  119

Sex       female  male
Survived              
0             81   468
1            233   109

Embarked   C   Q    S
Survived             
0         75  47  427
1         93  30  217


In [5]:
all_data.dropna(subset=['Embarked'],inplace = True)
all_data.Age = all_data.Age.fillna(training.Age.median())
all_data.Fare = all_data.Fare.fillna(training.Fare.median())

In [6]:
all_dummies = pd.get_dummies(all_data[['Pclass','Sex','Age','SibSp','Parch','Embarked', 'train_test']])

#Split the train and test datasets back again

X_train = all_dummies[all_dummies.train_test == 1].drop(['train_test'], axis =1)
X_test = all_dummies[all_dummies.train_test == 0].drop(['train_test'], axis =1)


y_train = all_data[all_data.train_test==1].Survived

In [7]:
scaler = StandardScaler()

dummies_scaled = all_dummies.copy()
dummies_scaled[['Age','SibSp','Parch']]= scaler.fit_transform(dummies_scaled[['Age','SibSp','Parch']])
dummies_scaled

X_train_scaled = dummies_scaled[dummies_scaled.train_test == 1].drop(['train_test'], axis =1)
X_test_scaled = dummies_scaled[dummies_scaled.train_test == 0].drop(['train_test'], axis =1)

y_train = all_data[all_data.train_test==1].Survived

In [8]:
model_rf = RandomForestRegressor(random_state = 1)
cv = cross_val_score(model_rf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.19771085 0.34315923 0.38028862 0.3762476  0.46525119]
0.35253149738082173


In [9]:
model_rf.fit(X_train_scaled,y_train)
y = model_rf.predict(X_test_scaled).astype(int)
submission = {'PassengerId': test.PassengerId, 'Survived': y}
base_submission = pd.DataFrame(data=submission)
base_submission.to_csv('submission.csv', index=False)