In [42]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
import sklearn.preprocessing

In [3]:
train_file = "../data/train.csv"
test_file = "../data/test.csv"


train_data_raw = pd.read_csv(train_file)
test_data_raw = pd.read_csv(test_file)


In [4]:
train_data = clean_func(train_data_raw)


In [5]:

### CLEAN DATA FUNC

def clean_func(train_data):
    
    ## DO IMPUTATION 
    # FARE
    imp_fare = Imputer(missing_values="NaN", strategy="mean")
    imp_fare.fit(train_data[["Fare"]])
    train_data[["Fare"]]=imp_fare.transform(train_data[["Fare"]]).ravel() 

    # Age
    imp=Imputer(missing_values="NaN", strategy="mean")
    imp.fit(train_data[["Age"]])
    train_data[["Age"]]=imp.transform(train_data[["Age"]]).ravel() 
    
    # Filna
    train_data["Cabin"] = train_data["Cabin"].fillna("")

    
    # one hot encoding
    sex_features = pd.get_dummies(train_data["Sex"])
    embarked_features = pd.get_dummies(train_data["Embarked"])
    
    # rename embarked features
    embarked_features = embarked_features.rename(columns={'C': 'embarked_cobh'
                                                        , 'Q': 'embark_queenstown'
                                                        , 'S': 'embark_southampton'})

    # Concat new features
    train_data_extras = pd.concat([train_data,sex_features,embarked_features],axis=1)

    
    
    # HACK - REMOVE T WHICH IS NOT IN TEST LIKELY ERRROR 
    cabin_letters = pd.get_dummies(train_data['Cabin'].map(lambda x: "empty" if len(x)==0 or x[0]=="T" else x[0]))

#    cabin_letters = pd.get_dummies(train_data['Cabin'].map(lambda x: "empty" if len(x)==0 else x[0]))
    cabin_letters.columns = ["Cabin_letter_"+i for i in cabin_letters.columns]
    train_data_extras = pd.concat([train_data_extras,cabin_letters],axis=1)
    

    train_data_extras["Cabin_number"] = train_data['Cabin'].map(lambda x: -99 if len(x)==0 else x.split(" ")[0][1:]) 

    return train_data_extras



In [14]:
X_train,X_test,Y_train,Y_test = train_test_split(train_data[numerics]
                              ,train_data[target].values
                              ,test_size=0.3
                              ,random_state=42)

# Models
- logreg
- random forest

In [15]:
log_reg = LogisticRegression(penalty="l2", dual=False, tol=0.0001, C=1.0
                             , fit_intercept=True, intercept_scaling=1
                             , class_weight=None, random_state=None
                             , solver="liblinear", max_iter=100
                             , multi_class="ovr", verbose=0
                             , warm_start=False, n_jobs=1)

log_reg.fit(X_train,Y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
Y_pred = log_reg.predict(X_test)

In [17]:
metrics.accuracy_score(Y_test,Y_pred) 

0.80223880597014929

### random forest naive

In [34]:
model_rf = RandomForestClassifier(
n_estimators=100
)

model_rf.fit(train_data[numerics], train_data[target])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [35]:
# Cross Validation RF

scores = cross_val_score(model_rf, train_data[numerics], train_data[target], cv=10)
print(scores)

[ 0.8         0.78888889  0.76404494  0.83146067  0.91011236  0.83146067
  0.82022472  0.7752809   0.86516854  0.86363636]


In [30]:
pred_rf = model_rf.predict(X_test)
metrics.accuracy_score(Y_test,pred_rf)

0.81716417910447758

### Random Forest Grid Search

In [77]:
model_rf_gs = RandomForestClassifier()



In [94]:
# parmeter dict
param_grid = dict(
    n_estimators=np.arange(60,101,20)
    , min_samples_leaf=np.arange(2,4,1)
    , criterion = ["gini","entropy"]
    , max_features = np.arange(0.1,0.5,0.1)
)
print(param_grid)

{'max_features': array([ 0.1,  0.2,  0.3,  0.4]), 'n_estimators': array([ 60,  80, 100]), 'criterion': ['gini', 'entropy'], 'min_samples_leaf': array([2, 3])}


In [95]:
grid = GridSearchCV(model_rf_gs,param_grid=param_grid,scoring = "accuracy", cv = 5)
grid.fit(train_data[numerics], train_data[target])
""
# model_rf.fit(train_data[numerics], train_data[target])

''

In [103]:
#print(grid)
# for i in ['params',"mean_train_score","mean_test_score"]:
#     print(i)
#     print(grid.cv_results_[i])
#grid.cv_results_

In [102]:
print(grid.best_params_)
print(grid.best_score_)


{'max_features': 0.20000000000000001, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 3}
0.828282828283


In [111]:
model_rf_gs = RandomForestClassifier(**grid.best_params_)
model_rf_gs.fit(train_data[numerics],train_data[target])
""
#print(**grid.best_params_)



''

### Get params

In [18]:
# get parameters
coef = list(log_reg.coef_.ravel())
intercept = log_reg.intercept_

# print them
print intercept
for id, i in enumerate(coef):
    print(numerics[id],i)


[ 1.26215252]
('PassengerId', 0.00029769529698762259)
('Pclass', -0.61572589203732686)
('Age', -0.029035069488013012)
('SibSp', -0.25731235170696642)
('Parch', -0.098679879797177933)
('Fare', 0.0046425506773909841)
('female', 1.9122432195029921)
('male', -0.65009070419717296)
('embarked_cobn', 0.70876233250311327)
('embark_queenstown', 0.40782776801440723)
('embark_southampton', 0.076421261776224214)
('Cabin_letter_A', -0.11838698151573311)
('Cabin_letter_B', 0.25698816555962745)
('Cabin_letter_C', -0.56847577394182469)
('Cabin_letter_D', 0.56817426992203868)
('Cabin_letter_E', 1.2341569209168242)
('Cabin_letter_F', 0.75928644755244623)
('Cabin_letter_G', -0.54037042914426503)
('Cabin_letter_empty', -0.32922010404327529)


# PREDICT AND STORE OUTPUT

In [19]:
### HACK TO COMPUTE TEST RESULT
test_data = clean_func(test_data_raw)

#test_data[["Age"]]=imp.transform(test_data[["Age"]]).ravel()

In [20]:
## DO IMPUTATION ON FARE
# imp_fare = Imputer(missing_values="NaN", strategy="mean")
# imp_fare.fit(train_data[["Fare"]])

# test_data[["Fare"]]=imp_fare.transform(test_data[["Fare"]]).ravel() # what is ravel???


In [21]:
test_data.isnull().sum()


PassengerId           0
Pclass                0
Name                  0
Sex                   0
Age                   0
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Cabin                 0
Embarked              0
female                0
male                  0
embarked_cobn         0
embark_queenstown     0
embark_southampton    0
Cabin_letter_A        0
Cabin_letter_B        0
Cabin_letter_C        0
Cabin_letter_D        0
Cabin_letter_E        0
Cabin_letter_F        0
Cabin_letter_G        0
Cabin_letter_empty    0
Cabin_number          0
dtype: int64

In [22]:
#test_data_y = log_reg.predict(test_data)
test_data_y = log_reg.predict(test_data[numerics])


#train_data[numerics].head(3)

In [23]:
output = pd.DataFrame(zip(list(test_data["PassengerId"]),list(test_data_y)))
output.columns = ["PassengerId","Survived"]

In [24]:
output.to_csv(index=False, path_or_buf= "../data/output.csv")


In [37]:
def output(data,file_name):
    output = pd.DataFrame(zip(list(test_data["PassengerId"]),list(data)))
    output.columns = ["PassengerId","Survived"]
    output.to_csv(index=False, path_or_buf= "../data/{file_name}.csv".format(file_name=file_name))


In [38]:
model_rf_data_y = model_rf.predict(test_data[numerics])

output(model_rf_data_y,"predict_rf_1")


In [112]:
model_rf_gs_data_y =model_rf_gs.predict(test_data[numerics])
output(model_rf_gs_data_y,"predict_rf_gs_1")