In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
import sklearn.preprocessing

In [12]:
train_file = "../data/train.csv"
test_file = "../data/test.csv"


train_data_raw = pd.read_csv(train_file)
test_data_raw = pd.read_csv(test_file)

target = "Survived"


In [61]:

### CLEAN DATA FUNC

def clean_func(train_data):
    
    ## DO IMPUTATION 
    # FARE
    imp_fare = Imputer(missing_values="NaN", strategy="mean")
    imp_fare.fit(train_data[["Fare"]])
    train_data[["Fare"]]=imp_fare.transform(train_data[["Fare"]]).ravel() 

    # Age
    imp=Imputer(missing_values="NaN", strategy="mean")
    imp.fit(train_data[["Age"]])
    train_data[["Age"]]=imp.transform(train_data[["Age"]]).ravel() 
    
    # Filna
    train_data["Cabin"] = train_data["Cabin"].fillna("")

    
    # one hot encoding
    sex_features = pd.get_dummies(train_data["Sex"])
    embarked_features = pd.get_dummies(train_data["Embarked"])
    
    # rename embarked features
    embarked_features = embarked_features.rename(columns={'C': 'embarked_cobh'
                                                        , 'Q': 'embark_queenstown'
                                                        , 'S': 'embark_southampton'})

    # Concat new features
    train_data_extras = pd.concat([train_data,sex_features,embarked_features],axis=1)

    
    
    # HACK - REMOVE T WHICH IS NOT IN TEST LIKELY ERRROR 
    cabin_letters = pd.get_dummies(train_data['Cabin'].map(lambda x: "empty" if len(x)==0 or x[0]=="T" else x[0]))

#    cabin_letters = pd.get_dummies(train_data['Cabin'].map(lambda x: "empty" if len(x)==0 else x[0]))
    cabin_letters.columns = ["Cabin_letter_"+i for i in cabin_letters.columns]
    train_data_extras = pd.concat([train_data_extras,cabin_letters],axis=1)
    

    train_data_extras["Cabin_number"] = train_data['Cabin'].map(lambda x: -99 if len(x)==0 else x.split(" ")[0][1:]) 

    # ONLY RETURN NUMERIC COLUMNS 
    num_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64','uint8']
    train_data_numerics = train_data_extras.select_dtypes(include=num_types)

    return train_data_numerics


## Select only numeric columns

In [97]:
train_data_raw = clean_func(train_data_raw)
train_data = train_data.ix[:, train_data_raw.columns != target]
train_data_target = train_data_raw[target].values


In [98]:
X_train,X_test,Y_train,Y_test = train_test_split(train_data
                              ,train_data_target
                              ,test_size=0.3
                              ,random_state=42)

# Models
- logreg
- random forest

In [74]:
log_reg = LogisticRegression(penalty="l2", dual=False, tol=0.0001, C=1.0
                             , fit_intercept=True, intercept_scaling=1
                             , class_weight=None, random_state=None
                             , solver="liblinear", max_iter=100
                             , multi_class="ovr", verbose=0
                             , warm_start=False, n_jobs=1)

log_reg.fit(X_train,Y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [75]:
Y_pred = log_reg.predict(X_test)

In [76]:
metrics.accuracy_score(Y_test,Y_pred) 

0.80223880597014929

### random forest naive

In [101]:
model_rf = RandomForestClassifier(
n_estimators=100
)

model_rf.fit(train_data, train_data_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [104]:
# Cross Validation RF

scores = cross_val_score(model_rf, train_data, train_data_target, cv=10)
print(scores)

[ 0.8         0.8         0.76404494  0.83146067  0.91011236  0.83146067
  0.82022472  0.7752809   0.86516854  0.85227273]


In [105]:
pred_rf = model_rf.predict(X_test)
metrics.accuracy_score(Y_test,pred_rf)

1.0

### Random Forest Grid Search

In [106]:
model_rf_gs = RandomForestClassifier()



In [107]:
# parmeter dict
param_grid = dict(
    n_estimators=np.arange(60,101,20)
    , min_samples_leaf=np.arange(2,4,1)
    #, criterion = ["gini","entropy"]
    #, max_features = np.arange(0.1,0.5,0.1)
)
print(param_grid)

{'n_estimators': array([ 60,  80, 100]), 'min_samples_leaf': array([2, 3])}


In [None]:
grid = GridSearchCV(model_rf_gs,param_grid=param_grid,scoring = "accuracy", cv = 5)
grid.fit(train_data, train_data_target)
""

# model_rf.fit(train_data, train_data[target])

In [109]:
#print(grid)
# for i in ['params',"mean_train_score","mean_test_score"]:
#     print(i)
#     print(grid.cv_results_[i])
#grid.cv_results_

In [None]:
print(grid.best_params_)
print(grid.best_score_)


In [80]:
model_rf_gs = RandomForestClassifier(**grid.best_params_)
model_rf_gs.fit(train_data,train_data[target])
""
#print(**grid.best_params_)



''

### Get params

In [81]:
# get parameters
coef = list(log_reg.coef_.ravel())
intercept = log_reg.intercept_

# print them
print intercept

coef
for id, i in enumerate(coef):
    print(train_data.columns[id],i)


[ 1.26214381]
('PassengerId', 0.00029820095499969796)
('Survived', -0.61572194000855129)
('Pclass', -0.029047527452569548)
('Age', -0.25731856900057948)
('SibSp', -0.098679125355903702)
('Parch', 0.0046448498810989286)
('Fare', 1.9122465908765478)
('female', -0.65010278524410203)
('male', 0.70876240073603169)
('embarked_cobh', 0.40782184168309971)
('embark_queenstown', 0.076421717282556312)
('embark_southampton', -0.11838528129807893)
('Cabin_letter_A', 0.25698381555541661)
('Cabin_letter_B', -0.56847249410849043)
('Cabin_letter_C', 0.56816881665557617)
('Cabin_letter_D', 1.2341479281746726)
('Cabin_letter_E', 0.75927127194689126)
('Cabin_letter_F', -0.5403500103496528)
('Cabin_letter_G', -0.32922024094390645)


# PREDICT AND STORE OUTPUT

In [82]:
### HACK TO COMPUTE TEST RESULT
test_data = clean_func(test_data_raw)

#test_data[["Age"]]=imp.transform(test_data[["Age"]]).ravel()

In [83]:
## DO IMPUTATION ON FARE
# imp_fare = Imputer(missing_values="NaN", strategy="mean")
# imp_fare.fit(train_data[["Fare"]])

# test_data[["Fare"]]=imp_fare.transform(test_data[["Fare"]]).ravel() # what is ravel???


In [84]:
test_data.isnull().sum()


PassengerId           0
Pclass                0
Age                   0
SibSp                 0
Parch                 0
Fare                  0
female                0
male                  0
embarked_cobh         0
embark_queenstown     0
embark_southampton    0
Cabin_letter_A        0
Cabin_letter_B        0
Cabin_letter_C        0
Cabin_letter_D        0
Cabin_letter_E        0
Cabin_letter_F        0
Cabin_letter_G        0
Cabin_letter_empty    0
dtype: int64

In [85]:
#test_data_y = log_reg.predict(test_data)
test_data_y = log_reg.predict(test_data)


#train_data.head(3)

In [86]:
output = pd.DataFrame(zip(list(test_data["PassengerId"]),list(test_data_y)))
output.columns = ["PassengerId","Survived"]

In [87]:
output.to_csv(index=False, path_or_buf= "../data/output.csv")


In [88]:
def output(data,file_name):
    output = pd.DataFrame(zip(list(test_data["PassengerId"]),list(data)))
    output.columns = ["PassengerId","Survived"]
    output.to_csv(index=False, path_or_buf= "../data/{file_name}.csv".format(file_name=file_name))

    


In [96]:
a =list(test_data.columns)
b = list(train_data.columns)

[item for item in b if item not in a]



['Survived']

In [91]:
model_rf_data_y = model_rf.predict(test_data)

output(model_rf_data_y,"predict_rf_1")


ValueError: Number of features of the model must match the input. Model n_features is 20 and input n_features is 19 

In [None]:
model_rf_gs_data_y =model_rf_gs.predict(test_data[numerics])
output(model_rf_gs_data_y,"predict_rf_gs_1")