In [61]:
from sklearn import  datasets, linear_model, metrics, model_selection
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
import numpy as np
import pandas as pd

In [62]:
titanic = pd.read_csv('titanic_train.csv' )

In [63]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [64]:
idx_col = 'PassengerId'
target_col = 'Survived'
cat_cols = ['Name', 'Cabin', 'Embarked', 'Ticket']
features_cols = list(titanic.columns)
features_cols.remove(idx_col)
features_cols.remove(target_col)
[features_cols.remove(i) for i in cat_cols]

titanic['Sex'] = titanic['Sex'].apply(lambda x: 1 if 'male' else 0)
print(features_cols)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']


In [65]:
titanic[features_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    int64  
 1   Sex     891 non-null    int64  
 2   Age     714 non-null    float64
 3   SibSp   891 non-null    int64  
 4   Parch   891 non-null    int64  
 5   Fare    891 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 41.9 KB


In [103]:
mean_age = titanic['Age'].mean()
mean_fare = titanic['Fare'].mean()

In [66]:
titanic['Age'].fillna(titanic['Age'].mean(), inplace=True)

In [77]:
train_data = titanic[features_cols]
train_labels = titanic[target_col]

In [67]:
#train_data, test_data, train_labels, test_labels = model_selection.train_test_split(titanic[features_cols], titanic[target_col], 
#                                                                                     test_size = 0.3,random_state = 0)

In [78]:
classifier = linear_model.SGDClassifier(random_state = 0)

In [79]:
classifier.get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [80]:
parameters_grid = {
    'loss' : ['hinge', 'log', 'squared_hinge', 'squared_loss'],
    'penalty' : ['l1', 'l2'],
    'max_iter' : range(5,10),
    'alpha' : np.linspace(0.0001, 0.001, num = 5),
}
cv = model_selection.StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0).split(train_data, train_labels)
grid_cv = GridSearchCV(classifier, parameters_grid, scoring = 'accuracy', cv = cv)

In [81]:
%%time
grid_cv.fit(train_data, train_labels)



















































CPU times: user 4.99 s, sys: 176 ms, total: 5.16 s
Wall time: 5.11 s




GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7f8dcb228f20>,
             estimator=SGDClassifier(random_state=0),
             param_grid={'alpha': array([0.0001  , 0.000325, 0.00055 , 0.000775, 0.001   ]),
                         'loss': ['hinge', 'log', 'squared_hinge',
                                  'squared_loss'],
                         'max_iter': range(5, 10), 'penalty': ['l1', 'l2']},
             scoring='accuracy')

In [82]:
grid_cv.best_estimator_

SGDClassifier(loss='log', max_iter=7, penalty='l1', random_state=0)

In [83]:
print( grid_cv.best_score_)
print( grid_cv.best_params_)

0.6765363128491619
{'alpha': 0.0001, 'loss': 'log', 'max_iter': 7, 'penalty': 'l1'}


In [97]:
titanic_test = pd.read_csv('titanic_test.csv')

In [98]:
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [104]:
titanic_test['Sex'] = titanic_test['Sex'].apply(lambda x: 1 if 'male' else 0)
titanic_test['Age'].fillna(mean_age, inplace=True)
titanic_test['Fare'].fillna(mean_fare, inplace=True)

test_data = titanic_test[features_cols]


In [105]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    int64  
 1   Sex     418 non-null    int64  
 2   Age     418 non-null    float64
 3   SibSp   418 non-null    int64  
 4   Parch   418 non-null    int64  
 5   Fare    418 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 19.7 KB


In [106]:
predictions = grid_cv.predict(test_data)

output = pd.DataFrame({'PassengerId': titanic_test.PassengerId, 'Survived': predictions})
output.to_csv('kaggle_grid_search.csv', index=False)

# Randomized Grid Search

In [107]:
cv = model_selection.StratifiedShuffleSplit(n_splits = 10, test_size = 0.2, random_state = 0).split(train_data, train_labels)

In [108]:
randomized_grid_cv = RandomizedSearchCV(classifier, parameters_grid, scoring = 'accuracy', cv = cv, n_iter = 20, 
                                                   random_state = 0)

In [109]:
%%time
randomized_grid_cv.fit(train_data, train_labels)





CPU times: user 499 ms, sys: 19.7 ms, total: 518 ms
Wall time: 514 ms




RandomizedSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7f8deb764040>,
                   estimator=SGDClassifier(random_state=0), n_iter=20,
                   param_distributions={'alpha': array([0.0001  , 0.000325, 0.00055 , 0.000775, 0.001   ]),
                                        'loss': ['hinge', 'log',
                                                 'squared_hinge',
                                                 'squared_loss'],
                                        'max_iter': range(5, 10),
                                        'penalty': ['l1', 'l2']},
                   random_state=0, scoring='accuracy')

In [110]:
print( randomized_grid_cv.best_score_)
print( randomized_grid_cv.best_params_)

0.6558659217877095
{'penalty': 'l2', 'max_iter': 7, 'loss': 'hinge', 'alpha': 0.0007750000000000001}


In [111]:
predictions = randomized_grid_cv.predict(test_data)

output = pd.DataFrame({'PassengerId': titanic_test.PassengerId, 'Survived': predictions})
output.to_csv('kaggle_randomized_grid_cv.csv', index=False)