### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import missingno
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")


### Reading data

In [None]:
# Load the three input CSV files into DataFrames, in the order train / test / submission.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
submission.head()

### 3.  Data Dictionary:*
Survival: 0 = No, 1 = Ye
*  pclass: Ticket class 1 = 1st, 2 = 2nd  3 = r
* rs sibsp: # of siblings / spouses aboard the Titan
* ic parch: # of parents / children aboard the Titan
* ic ticket: Ticket numb
* er fare: Passenger fa
* re cabin: Cabin num
* Age: Age in years 
* sex: Sex
* er embarked: Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton

### 4. Exploratory Data Analysis (EDA)

* .info()
* isnull().sum().sort_values()
* missingno.matrix()
* describe()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
test.info()

In [None]:
missingno.matrix(train)

In [None]:
missingno.matrix(test)

In [None]:
train.isnull().sum().sort_values(ascending=False)

In [None]:
test.isnull().sum().sort_values(ascending=False)

In [None]:
train.query('Embarked == "Q"')

#### 4.1 Feature Analysis

* PassengerId - index
* Survived - target feature

##### Categorical features:
* Pclass - cat
* Name - cat
* Sex - cat
* Cabin - cat
* Embark - cat

##### Numerical features
* Age - num
* SibSp - num
* Parch - num
* Ticket - num
* Fare - num

#### 4.2 Categorical features
* 4.2.1 Pclass - cat
* 4.2.2 Name - cat
* 4.2.3 Sex - cat
* 4.2.4 Cabin - cat
* 4.2.5 Embark - cat
* 4.3.5 Ticket - cat

##### 4.2.1 Pclass

In [None]:
train.Pclass.value_counts()

In [None]:
train[['Parch','SibSp','Age','Pclass','Survived']].groupby('Pclass').mean().sort_values('Survived', ascending=False)

In [None]:
Pclass_sur_mean = train.groupby('Pclass')['Survived'].mean()
Pclass_sur_mean

In [None]:
# Line plot of the mean of Survived per Pclass, labelled through the Axes object.
ax = Pclass_sur_mean.plot(marker='o', linestyle='-', color='c')
ax.set_title('Pclass vs Survived Mean')
ax.set_xlabel('Pclass')
ax.set_ylabel('Survived Mean')

plt.show()

In [None]:
sns.catplot(data=train, x='Pclass', y='Survived', hue='Sex', kind='bar')

In [None]:
train[['Pclass', 'Sex']].groupby(['Pclass', 'Sex']).size().unstack(fill_value=0)

In [None]:
# Survived values of the male passengers; their sum divided by their count
# is the share of men who survived.
men = train.loc[train['Sex'] == 'male', 'Survived']
men_ratio = men.sum() / men.size
men_ratio

In [None]:
sns.catplot(data=train, x='Pclass', col='Survived', hue='Sex', kind='count')

##### 4.2.2 Name

In [None]:
train.Name.isnull().sum()

In [None]:
train.Name.nunique()

Names won't tell us anything at this stage, but we can clearly see that each name contains a title such as "Mr", "Miss" or "Master". I think this will be useful for replacing NaN values in the Age column, and it might also be an interesting feature to build in the feature engineering part of this project.

##### 4.2.3 Sex

In [None]:
train.Sex.isnull().sum()

In [None]:
train.Sex.value_counts()

In [None]:
survival_by_sex = train.groupby('Sex')['Survived'].mean()
survival_by_sex

In [None]:
sns.barplot(data=train, x='Sex', y='Survived')

In [None]:
sns.catplot(data=train, x='Sex', y='Survived', hue='Pclass', kind='bar')

We can clearly see that overall women had the highest survival rate, especially those from first and second class. We can assume that once we fill the Age NaN's we can get an even better picture with an engineered feature "children" that we will separate from both the male and female features.

##### 4.2.4 Cabin

In [None]:
# Cast Cabin to str in both frames.
# NOTE: astype(str) turns a missing cabin (NaN) into the literal string 'nan',
# so after this cell missing cabins must be detected with `Cabin != "nan"`;
# isnull()/notnull() no longer see them.
train['Cabin'] = train['Cabin'].astype(str)
test['Cabin'] = test['Cabin'].astype(str)

In [None]:
# Cabin was cast to str, so missing cabins are the string 'nan' and notnull()
# keeps every row; filter on that placeholder to look at known cabins only.
train.query('Cabin != "nan"').groupby('Cabin')['Survived'].mean()

In the feature engineering part I will try to use the Cabin feature to create a new one.

##### 4.2.5 Embarked

In [None]:
train.Embarked.value_counts()

In [None]:
train.Embarked.isnull().sum()

In [None]:
train.groupby('Embarked')['Survived'].mean()

In [None]:
sns.catplot(data=train, x='Embarked', col='Survived', hue='Pclass' , kind='count' )

##### 4.3.5 Ticket

##### 4.3 Numerical features.
* 4.3.1 Detecting and removing outliers
* 4.3.2 Age - num
* 4.3.3 SibSp - num
* 4.3.4 Parch - num
* 4.3.5 Fare - num

##### 4.3.1 Detecting and removing outliers.

In [None]:
train.head()

In [None]:
def detect_outliers(df, n, features):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    A value is an outlier for a column when it lies more than 1.5 * IQR below
    the first quartile or above the third quartile of that column (Tukey's rule).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    n : int
        A row is returned only if it is an outlier in strictly more than ``n``
        of the given columns.
    features : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_counts = Counter()
    for col in features:
        # nanpercentile ignores missing values. np.percentile returns NaN as soon
        # as the column contains one NaN, which made both comparisons below False
        # for every row and silently excluded that column from the check.
        q1, q3 = np.nanpercentile(df[col], [25, 75])
        outlier_step = 1.5 * (q3 - q1)
        is_outlier = (df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step)
        outlier_counts.update(df.index[is_outlier.to_numpy()])
    return [label for label, hits in outlier_counts.items() if hits > n]

outliers = detect_outliers(train, 2, ['Age', 'SibSp', 'Parch', 'Fare'])

In [None]:
outliers

In [None]:
print(len(train))
train = train.drop(outliers, axis=0).reset_index(drop=True)
len(train)

In [None]:
train.query('Pclass == 1 & Fare > 150').sort_values('Fare', ascending=False)

##### 4.3.2 Age.

In [None]:
train.Age.isnull().sum()

In [None]:
sns.displot(data=train, x='Age', kde=True, label='Skewness: %.2f'%(train['Age'].skew()))
plt.legend(loc='best')

In [None]:
train.groupby('Age')['Survived'].mean()

In [None]:
# Age distribution in one panel per value of Survived.
# sns.distplot is deprecated and has been removed from recent seaborn releases;
# histplot with a KDE overlay and density scaling is the documented replacement.
g = sns.FacetGrid(train, col='Survived')

g.map(sns.histplot, 'Age', kde=True, stat='density')

##### 4.3.3 SibSp.

In [None]:
train.SibSp.value_counts(dropna=False)

In [None]:
train.SibSp.isnull().sum()

In [None]:
train.groupby('SibSp')['Survived'].mean()

In [None]:
sns.barplot(data=train, x='SibSp', y='Survived')
plt.show()

#### 4.3.4 Parch.

In [None]:
train.Parch.value_counts(dropna=False)

In [None]:
train.Parch.isnull().sum()

In [None]:
train.groupby('Parch')['Survived'].mean()

In [None]:
sns.barplot(data=train, x='Parch', y='Survived')
plt.show()

In [None]:
sns.catplot(data=train, x='Parch', col='Survived', hue='Pclass', kind='count')

##### 4.3.5 Fare

In [None]:
# Fare distribution with a KDE overlay; the legend shows the skewness of Fare.
# kde takes a boolean: the string 'True' only worked because any non-empty string is truthy.
sns.displot(data=train, x='Fare', kde=True, label='Skewness: %.2f'%(train['Fare'].skew()))
plt.legend(loc='best')

### 5. Data preprocessing
Data preprocessing is the process of getting our dataset ready for model training.
In this section we will perform the following preprocessing steps:

* 5.1 Drop and fill missing values in train and test datasets.
*  5.1.1 Concat datasets train and test.
*      5.1.1.1 Embarked.
*      5.1.1.2 Age.
* 5.1.2 Feature engineering.
*      5.1.2 "Title"
* Feature encoding
* Data transformation

##### 5.1 Drop and fill missing values in train and test datasets.
*  5.1.1 Train dataset.


##### 5.1.1.1 Embarked.

In [None]:
mode = train.Embarked.dropna().mode()[0]
mode

In [None]:
train['Embarked'] = train.Embarked.fillna(mode)

In [None]:
median = test.Fare.dropna().median()
median

In [None]:
test.Fare = test.Fare.fillna(median)

In [None]:
combine = pd.concat([train,test], axis=0).reset_index(drop=True)

In [None]:
combine['Sex'] = combine['Sex'].map({'male': 0, 'female': 1})


In [None]:
combine.head()

In [None]:
sns.heatmap(combine.drop(['Survived', 'Name', 'PassengerId','Ticket', 'Cabin', 'Fare', 'Embarked'], axis=1).corr(), annot=True, fmt='.2f', cmap='coolwarm')

##### 5.1.1.2 Age.

In [None]:
# The title is the text between the first comma and the following period of Name.
combine['Title'] = (
    combine['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)

In [None]:
combine.Title.value_counts()

In [None]:
nan_idx = combine.query('Age.isnull()').index
nan_idx

In [None]:
sns.catplot(data=combine, x='Age', y='Title',col='Pclass', kind='bar')

In [None]:
# no_people = []
# for idx in nan_idx:
#     age_val = train['Age'][(train['Title'] == train.loc[idx, 'Title'])
#                             & (train['Pclass'] == train.loc[idx,'Pclass'])
#                             & (train['SibSp'] == train.loc[idx,'SibSp'])
#                             & (train['Parch'] == train.loc[idx,'Parch'])]
    
#     plt.bar(range(len(age_val)), age_val)
#     plt.title(f'Ages for Index {idx}')
#     plt.xlabel('Person Index')
#     plt.ylabel('Age')
#     plt.show()
#     no_people.append(age_val)



In [None]:
# Fill each missing Age, in order of preference, with:
#   1. the median age of passengers sharing Title, Pclass, SibSp and Parch,
#   2. the median age of passengers sharing the Title,
#   3. the overall median age (new fallback: without it a title with no known
#      ages leaves NaN behind, and the later astype(int) on Age fails).
# Medians are recomputed on every iteration, so ages filled earlier in the loop
# take part in later medians, exactly as before.
for idx in nan_idx:
    row = combine.loc[idx]
    same_title = combine['Title'] == row['Title']
    median_age = combine.loc[same_title, 'Age'].median()
    similar = (
        same_title
        & (combine['Pclass'] == row['Pclass'])
        & (combine['SibSp'] == row['SibSp'])
        & (combine['Parch'] == row['Parch'])
    )
    predicted_age = combine.loc[similar, 'Age'].median()
    if not np.isnan(predicted_age):
        combine.loc[idx, 'Age'] = predicted_age
    elif not np.isnan(median_age):
        combine.loc[idx, 'Age'] = median_age
    else:
        combine.loc[idx, 'Age'] = combine['Age'].median()

    

In [None]:
combine.Age.isnull().sum()

In [None]:
# Age distribution after imputation. The skewness in the legend is computed from
# the frame that is plotted (combine); it used to come from train.
sns.displot(data=combine, x='Age', kde=True, label='Skewness: %.2f'%(combine['Age'].skew()))
plt.legend(loc='best')

#####  5.1.2 Feature engineering.
*      5.1.2 "Title" - Already created for filling NaN values in the Age feature.

In [None]:
# Fold the alternative spellings of Miss / Mrs into the common titles.
combine['Title'] = combine['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

In [None]:
# Map the infrequent titles onto Mr / Mrs / Rare with a single lookup table.
title_groups = {
    'Don': 'Mr', 'Capt': 'Mr', 'Jonkheer': 'Mr',
    'Dona': 'Mrs', 'Lady': 'Mrs',
    'the Countess': 'Rare', 'Sir': 'Rare', 'Major': 'Rare', 'Col': 'Rare',
}
combine['Title'] = combine['Title'].replace(title_groups)

In [None]:
combine.Title.value_counts()

In [None]:
combine[['Title','Survived']].groupby('Title', as_index=False).mean().sort_values('Survived', ascending=False)

In [None]:
# Deck letter = first letter found in Cabin.
# Cabin holds the string 'nan' for passengers without a cabin (it was cast to str
# earlier); the regex used to pull the letter 'n' out of that placeholder. Mask the
# placeholder first and label those passengers with an explicit 'U' (unknown) instead.
# The former first line of this cell ran a query whose result was discarded; it is removed.
cabin_known = combine['Cabin'].where(combine['Cabin'] != 'nan')
combine['Cabin_letter'] = cabin_known.str.extract(r'([A-Za-z])', expand=False).fillna('U')
combine['Cabin_letter'].value_counts()


In [None]:
combine.groupby('Cabin_letter')['Survived'].mean().sort_values(ascending=False)


In [None]:
combine[['Cabin_letter', 'Sex']].groupby(['Cabin_letter', 'Sex']).size().unstack(fill_value=0)



In [None]:
combine.groupby(['Cabin_letter', 'Pclass'])['Fare'].mean().sort_values(ascending=False)

In [None]:
sns.catplot(data=combine, x='Cabin_letter', y='Survived', hue='Sex', kind='bar')

In [None]:
combine = combine.drop(['PassengerId', 'Name', 'Cabin','Ticket'],axis=1)

In [None]:
sns.catplot(data=train, x='Sex', y='Age', col='Survived', hue='Pclass', kind='box')

In [None]:
# Cast Age and Pclass to integers in one call (fractional ages are truncated).
combine = combine.astype({'Age': int, 'Pclass': int})

In [None]:
combine['Age/Pclass'] = combine['Age'] / combine['Pclass']

In [None]:
combine.head()

In [None]:
combine['Family_size'] = combine.SibSp + combine.Parch + 1


In [None]:
combine['Family_size'].value_counts()

In [None]:
# 1 when the passenger travels alone (Family_size == 1), otherwise 0.
combine['Is_Alone'] = (combine['Family_size'] == 1).astype(int)
# The raw family counts are not used past this point.
combine = combine.drop(columns=['SibSp', 'Parch', 'Family_size'])

##### 6. Modeling.
* 6.1 Encoding categorical values.
* 6.2 Scaling numerical values.
* 6.3 Applying ML models with evaluation (score, f1, precision, recall).
    * Logistic Regression.
    * SVC Support Vector Machines.
    * Linear SVC.
    * SGD - Stochastic Gradient Descent.
    * K-Nearest Neighbours.
    * GNB - Gaussian Naive Bayes.
    * Perceptron.
    * DTC - Decision Tree Classifier.
    * RF - Random Forest (ensemble method).
    * Catboost.
    * Adaboost.
* 6.4 Comparing all models & choosing the best performing one.
* 6.5 One more look at features, engineered features, Ticket, trying new features and reloading models.
* 6.6 Creating a pipeline with selected models, applying PCA, Select-K features, hyperparameter tuning and cross validation.

##### 6.1 Encoding categorical values.

In [None]:
# One-hot encode the three categorical columns in a single call; the prefix map
# reproduces the Title_ / Em_ / Cl_ column names.
combine = pd.get_dummies(
    combine,
    columns=['Title', 'Embarked', 'Cabin_letter'],
    prefix={'Title': 'Title', 'Embarked': 'Em', 'Cabin_letter': 'Cl'},
    dtype=int,
)

In [None]:
combine.head()

##### 6.2 Scaling numerical values.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
features_selected = combine[['Age', 'Fare','Age/Pclass']]

In [None]:
# Fit the scaler on the labelled (training) rows only, then transform every row, so
# that test-set minima/maxima do not leak into the training features.
# NOTE(review): assumes rows that came from test.csv have NaN in Survived after the
# concat; if they do not, the mask selects every row and this matches the old behaviour.
is_train_row = combine['Survived'].notna()
scaler.fit(features_selected[is_train_row])
combine[['Age', 'Fare', 'Age/Pclass']] = scaler.transform(features_selected)

In [None]:
combine.head()

In [None]:
# Split the combined frame back into its train and test parts by position.
n_train = len(train)
train, test = combine.iloc[:n_train], combine.iloc[n_train:]

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# Model inputs and target for the labelled data.
features = train.drop('Survived', axis=1)
labels = train['Survived']
# The submission features must have the same columns as `features`; `test` still
# carries the Survived column inherited from the combined frame, so drop it here
# (errors='ignore' keeps the cell working if it has already been removed).
s_test = test.drop(columns='Survived', errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

##### 6.3 Applying ML models with evaluation (score, f1, precision, recall).

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score
PERF_FORMAT_STRING =" \tAccuracy: {:>0.{round}f}\tF1_score: {:>0.{round}f}\tPrecision: {:>0.{round}f}\tRecall: {:>0.{round}f} "

In [None]:
# Logistic regression baseline, evaluated on the held-out split.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
# Accuracy is taken from the same held-out split as F1/precision/recall;
# scoring on (X_train, y_train) reported training accuracy next to test metrics.
lr_acc = lr_clf.score(X_test, y_test)
lr_f1_score = f1_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred)
lr_recall = recall_score(y_test, lr_pred)
print(PERF_FORMAT_STRING.format(lr_acc, lr_f1_score, lr_precision, lr_recall, round = 4))


In [None]:
from sklearn.svm import SVC
# Support vector classifier, evaluated on the held-out split.
svc_clf = SVC()
svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)
# Accuracy is taken from the same held-out split as F1/precision/recall;
# scoring on (X_train, y_train) reported training accuracy next to test metrics.
svc_acc = svc_clf.score(X_test, y_test)
svc_f1_score = f1_score(y_test, svc_pred)
svc_precision = precision_score(y_test, svc_pred)
svc_recall = recall_score(y_test, svc_pred)
print(PERF_FORMAT_STRING.format(svc_acc, svc_f1_score, svc_precision, svc_recall, round = 4))

In [None]:
from sklearn.svm import LinearSVC
# Linear support vector classifier, evaluated on the held-out split.
lsvc_clf = LinearSVC()
lsvc_clf.fit(X_train, y_train)
lsvc_pred = lsvc_clf.predict(X_test)
# Accuracy is taken from the same held-out split as F1/precision/recall;
# scoring on (X_train, y_train) reported training accuracy next to test metrics.
lsvc_acc = lsvc_clf.score(X_test, y_test)
lsvc_f1_score = f1_score(y_test, lsvc_pred)
lsvc_precision = precision_score(y_test, lsvc_pred)
lsvc_recall = recall_score(y_test, lsvc_pred)
print(PERF_FORMAT_STRING.format(lsvc_acc, lsvc_f1_score, lsvc_precision, lsvc_recall, round = 4))

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear model trained with stochastic gradient descent; seeded so reruns match.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
# Bug fix: predictions and accuracy used to come from lsvc_clf (copy-paste slip),
# so this cell reported the LinearSVC results a second time.
sgd_pred = sgd_clf.predict(X_test)
# Accuracy on the held-out split, consistent with the other metrics.
sgd_acc = sgd_clf.score(X_test, y_test)
sgd_f1_score = f1_score(y_test, sgd_pred)
sgd_precision = precision_score(y_test, sgd_pred)
sgd_recall = recall_score(y_test, sgd_pred)
print(PERF_FORMAT_STRING.format(sgd_acc, sgd_f1_score, sgd_precision, sgd_recall, round = 4))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier (k = 5), evaluated on the held-out split.
knn_clf = KNeighborsClassifier(n_neighbors = 5)
knn_clf.fit(X_train, y_train)
knn_pred = knn_clf.predict(X_test)
# Accuracy is taken from the same held-out split as F1/precision/recall;
# scoring on (X_train, y_train) reported training accuracy next to test metrics.
knn_acc = knn_clf.score(X_test, y_test)
knn_f1_score = f1_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)
print(PERF_FORMAT_STRING.format(knn_acc, knn_f1_score, knn_precision, knn_recall, round = 4))

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes, evaluated on the held-out split.
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)
gnb_pred = gnb_clf.predict(X_test)
# Accuracy is taken from the same held-out split as F1/precision/recall;
# scoring on (X_train, y_train) reported training accuracy next to test metrics.
gnb_acc = gnb_clf.score(X_test, y_test)
gnb_f1_score = f1_score(y_test, gnb_pred)
gnb_precision = precision_score(y_test, gnb_pred)
gnb_recall = recall_score(y_test, gnb_pred)
print(PERF_FORMAT_STRING.format(gnb_acc, gnb_f1_score, gnb_precision, gnb_recall, round = 4))

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron, seeded so reruns match, evaluated on the held-out split.
p_clf = Perceptron(random_state=42)
p_clf.fit(X_train, y_train)
p_pred = p_clf.predict(X_test)
# Bug fix: accuracy used to come from gnb_clf (copy-paste slip), not from this model.
# It is also measured on the held-out split, like the other metrics.
p_acc = p_clf.score(X_test, y_test)
p_f1_score = f1_score(y_test, p_pred)
p_precision = precision_score(y_test, p_pred)
p_recall = recall_score(y_test, p_pred)
print(PERF_FORMAT_STRING.format(p_acc, p_f1_score, p_precision, p_recall, round = 4))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree, seeded so reruns match, evaluated on the held-out split.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
# Bug fix: accuracy used to come from gnb_clf (copy-paste slip), not from this model.
# It is also measured on the held-out split, like the other metrics.
dt_acc = dt_clf.score(X_test, y_test)
dt_f1_score = f1_score(y_test, dt_pred)
dt_precision = precision_score(y_test, dt_pred)
dt_recall = recall_score(y_test, dt_pred)
print(PERF_FORMAT_STRING.format(dt_acc, dt_f1_score, dt_precision, dt_recall, round = 4))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest, seeded so reruns match, evaluated on the held-out split.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)
# Bug fix: predictions used to come from dt_clf and accuracy from gnb_clf
# (copy-paste slips), so the fitted forest was never evaluated.
rf_pred = rf_clf.predict(X_test)
# Accuracy on the held-out split, consistent with the other metrics.
rf_acc = rf_clf.score(X_test, y_test)
rf_f1_score = f1_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
print(PERF_FORMAT_STRING.format(rf_acc, rf_f1_score, rf_precision, rf_recall, round = 4))

In [None]:
from catboost import CatBoostClassifier
# CatBoost gradient boosting; verbose=0 suppresses the per-iteration training log
# and random_seed makes reruns match.
cb_clf = CatBoostClassifier(verbose=0, random_seed=42)
cb_clf.fit(X_train, y_train)
# Bug fix: predictions used to come from dt_clf and accuracy from gnb_clf
# (copy-paste slips), so the fitted CatBoost model was never evaluated.
cb_pred = cb_clf.predict(X_test)
# Accuracy on the held-out split, consistent with the other metrics.
cb_acc = cb_clf.score(X_test, y_test)
cb_f1_score = f1_score(y_test, cb_pred)
cb_precision = precision_score(y_test, cb_pred)
cb_recall = recall_score(y_test, cb_pred)
print(PERF_FORMAT_STRING.format(cb_acc, cb_f1_score, cb_precision, cb_recall, round = 4))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost, seeded so reruns match, evaluated on the held-out split.
ab_clf = AdaBoostClassifier(random_state=42)
ab_clf.fit(X_train, y_train)
# Bug fix: predictions used to come from dt_clf and accuracy from gnb_clf
# (copy-paste slips), so the fitted AdaBoost model was never evaluated.
ab_pred = ab_clf.predict(X_test)
# Accuracy on the held-out split, consistent with the other metrics.
ab_acc = ab_clf.score(X_test, y_test)
ab_f1_score = f1_score(y_test, ab_pred)
ab_precision = precision_score(y_test, ab_pred)
ab_recall = recall_score(y_test, ab_pred)
print(PERF_FORMAT_STRING.format(ab_acc, ab_f1_score, ab_precision, ab_recall, round = 4))

In [None]:
# One row per fitted model: (name, accuracy, F1, precision, recall).
model_rows = [
    ('Logistic Regression', lr_acc, lr_f1_score, lr_precision, lr_recall),
    ('SVC Support Vector Machines', svc_acc, svc_f1_score, svc_precision, svc_recall),
    ('Linear SVC', lsvc_acc, lsvc_f1_score, lsvc_precision, lsvc_recall),
    ('SGD - Stochastic Gradient Decent', sgd_acc, sgd_f1_score, sgd_precision, sgd_recall),
    ('K- Nearest Neigbours', knn_acc, knn_f1_score, knn_precision, knn_recall),
    ('GNB - Gaussian Naive Base', gnb_acc, gnb_f1_score, gnb_precision, gnb_recall),
    ('Perceptron', p_acc, p_f1_score, p_precision, p_recall),
    ('DTC - Decision Tree CLassifier', dt_acc, dt_f1_score, dt_precision, dt_recall),
    ('RF - Random Forest', rf_acc, rf_f1_score, rf_precision, rf_recall),
    ('Catboost', cb_acc, cb_f1_score, cb_precision, cb_recall),
    ('Adaboost', ab_acc, ab_f1_score, ab_precision, ab_recall),
]
models = pd.DataFrame(model_rows, columns=['Model', 'Accuracy', 'F1', 'Precision', 'Recall'])
# Best F1 first.
models.sort_values('F1', ascending=False)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
PERF_FORMAT_STRING ="\tAccuracy: {:>0.{round}f}\tPrecison: {:>0.{round}f}\tRecall: {:>0.{round}f}\tF1: {:>0.{round}f}"
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Evaluate ``clf`` with repeated stratified shuffle splits and print pooled metrics.

    The classifier is refitted on every split (70% train / 30% test, fixed seed).
    Predictions from all splits are pooled into one confusion matrix, from which
    accuracy, precision, recall and F1 are computed and printed using the
    module-level PERF_FORMAT_STRING.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    dataset : pandas.DataFrame
        Must contain a 'Survived' column (the target) and the feature columns.
    feature_list : iterable of str
        Columns of ``dataset`` to train on. 'Survived' is skipped if present.
    folds : int, default 1000
        Number of shuffle splits.

    Returns
    -------
    None
        Results are printed. A warning is printed instead when a metric's
        denominator is zero.
    """
    labels = dataset['Survived']
    # Bug fix: feature_list used to be ignored and every column except Survived was
    # used, so runs with a selected subset and with all features were identical.
    features = dataset[[col for col in feature_list if col != 'Survived']]
    cv = StratifiedShuffleSplit(n_splits=folds, test_size=0.3, random_state=42)
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    for train_idx, test_idx in cv.split(features, labels):
        clf.fit(features.iloc[train_idx], labels.iloc[train_idx])
        predictions = np.asarray(clf.predict(features.iloc[test_idx]))
        truth = labels.iloc[test_idx].to_numpy()

        # Vectorised confusion-matrix update. int(...) keeps plain Python ints so a
        # zero denominator below still raises ZeroDivisionError.
        true_positive += int(np.sum((predictions == 1) & (truth == 1)))
        true_negative += int(np.sum((predictions == 0) & (truth == 0)))
        false_positive += int(np.sum((predictions == 1) & (truth == 0)))
        false_negative += int(np.sum((predictions == 0) & (truth == 1)))

    try:
        all_predictions = true_positive + true_negative + false_positive + false_negative
        accuracy = (true_positive + true_negative) / all_predictions
        precision = true_positive / (true_positive + false_positive)
        recall = true_positive / (true_positive + false_negative)
        f1 = 2.0 * (precision * recall)  / (precision + recall)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, round=4))

    except ZeroDivisionError:
        print("Warning: Got a divide by zero when trying out:", clf)
              
        

### Trying to improve our best model by applying PCA and Select-K-Best feature selection

In [None]:
from sklearn.decomposition import PCA
# Fit a PCA that keeps 8 components on the training split and project X_train onto them.
pca = PCA(n_components=8)
X_train_pca = pca.fit_transform(X_train)
# Share of the total variance explained by each retained component.
print(pca.explained_variance_ratio_)
# Component matrix: one row per retained component.
pca_components = pca.components_
# First row of the component matrix, i.e. the first principal component.
first_pca = pca.components_[0]



In [None]:
features.shape

In [None]:
features_list = features.columns

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
# Keep the 10 features with the highest ANOVA F-score on the training split.
kbest = SelectKBest(f_classif, k=10)
# Bug fix: the transformed matrix used to be discarded, and the print below showed
# the shape of `features_selected`, which at that point was still the 3-column
# scaler input (and is a plain list on any rerun of this cell).
X_train_kbest = kbest.fit_transform(X_train, y_train)
print(X_train_kbest.shape)
selected_features_indices = kbest.get_support(indices=True)
# Translate positional indices into column names of the frame that was fitted.
features_selected = list(X_train.columns[selected_features_indices])
features_selected

In [None]:
test_classifier(lr_clf, train, features_selected)

In [None]:
test_classifier(lr_clf, train, features_list)

In [None]:
test_classifier(lsvc_clf, train, features_selected)

In [None]:
test_classifier(lsvc_clf, train, features_list)

### Setup a pipeline with hyperparameter tuning

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Pipeline: univariate feature selection followed by logistic regression.
select = SelectKBest()
# max_iter is a convergence budget, not a hyperparameter: fix it high enough
# instead of searching over it.
lr = LogisticRegression(max_iter=1000)


steps = [('feature_select', select),
         ('classifier', lr)]

# Settings shared by every solver. k starts at 1: k=0 selects no features at all
# and the classifier cannot be fitted on an empty matrix.
shared_grid = {'feature_select__k': range(1, 9),
               'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
               'classifier__class_weight': [None, 'balanced', {0: 1, 1: 5}]}

# One sub-grid per group of solvers so that only supported penalty/solver pairs
# are tried ('lbfgs' and 'sag' do not support the l1 penalty).
# Dropped from the search on purpose:
#   * multi_class - the target is binary and the option is deprecated in recent
#     scikit-learn; 'multinomial' is also invalid with liblinear;
#   * warm_start  - has no effect when every candidate is a fresh clone fitted once;
#   * tol / max_iter - convergence settings; the full product of the original grid
#     was ~35,000 candidates x 100 splits, which is not runnable.
param_grid = [
    {**shared_grid,
     'classifier__solver': ['liblinear'],
     'classifier__penalty': ['l1', 'l2']},
    {**shared_grid,
     'classifier__solver': ['lbfgs', 'sag'],
     'classifier__penalty': ['l2']},
]
sss = StratifiedShuffleSplit(100, test_size=0.3, random_state=43)
pipe = Pipeline(steps)
# n_jobs=-1 spreads the candidate fits over all available cores.
gs = GridSearchCV(pipe, param_grid, cv=sss, scoring='f1', n_jobs=-1)
gs.fit(X_train, y_train)
clf = gs.best_estimator_