# Modelling


### Importing modules

In [214]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer

In [230]:
df_train = pd.read_csv('./data/train.csv')

df_test = pd.read_csv('./data/test.csv')

In [216]:
# data cleaning steps outline in eda.ipynb
def clean(df):
    #df = df[['Survived','Pclass','Sex','Age','SibSp','Parch','Embarked']].copy()
    df = df.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
    #age field
    df['Age'].fillna(df['Age'].mean(),inplace=True)

    return df
    #df['Sex'] = np.where(df['Sex'] == 'male',1,0)

In [217]:
df_train = clean(df_train)

X = df_train.drop('Survived',axis=1)

y = df_train["Survived"]

pred_data = clean(df_test)

In [218]:
X_train,X_test, y_train,y_test = train_test_split(X,y)

In [219]:
categorical_features = ["Embarked", "Sex"]

ohe = OneHotEncoder()
preprocessor = ColumnTransformer([('cat',ohe,categorical_features)])

In [220]:
log_clt = Pipeline([('preprocessor',preprocessor),
                ('clt',LogisticRegression(penalty=None))])


In [180]:
log_clt.fit(X_train,y_train)

In [190]:
log_clt.score(X_test,y_test)

0.8161434977578476

In [191]:
cross_val_score(log_clt,X,y,scoring='f1_macro')

array([0.79307725, 0.78994572, 0.77123918, 0.72775306, 0.76977536])

In [221]:
log_clt.fit(X,y)

In [226]:
pred = log_clt.predict(pred_data)

In [227]:
output = pd.concat([df_test['PassengerId'],pd.Series(pred,name='Survival')],axis=1)

In [232]:
output.to_csv('./output/prediction.csv',index=False)

In [183]:
penalty_range = [None,'l1','l2','elasticnet']
solver_range = ['lbfgs', 'liblinear', 'newton-cg', 'saga']
fit_intercept = [True,False]

param_grid = {'clt__penalty': penalty_range,'clt__solver': solver_range,'clt__fit_intercept': fit_intercept}

In [184]:
grid = GridSearchCV(log_clt,param_grid,cv=5,verbose=2)

In [185]:
grid.fit(X,y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=liblinear; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=liblinear; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=liblinear; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=liblinear; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=None, clt__solver=liblinear; total time=   0.0s
[CV] END



[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l2, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l2, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__fit_intercept=True, clt__penalty=l2, clt__solver=lbfgs; total time=   0.0s
[CV] END clt__



[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=liblinear; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=liblinear; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=newton-cg; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=saga; total time=   0.0s
[CV] END clt__fit_intercept=False, clt__penalty=l1, clt__solver=saga; total

70 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ds/miniconda3/envs/ds/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ds/miniconda3/envs/ds/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/ds/miniconda3/envs/ds/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1216, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/home/ds/miniconda3/envs/ds/lib/python3.10/si

In [178]:
grid.best_estimator_