In [130]:
import pandas as pd
import seaborn as sns

import warnings
# NOTE(review): this silences every warning category for the rest of the session;
# consider narrowing the filter to specific categories so real problems stay visible.
warnings.filterwarnings('ignore')

In [131]:
# Load seaborn's 'titanic' example dataset.
df = sns.load_dataset('titanic')

# Keep only the columns used in this notebook and discard any row that has a
# missing value in one of them.
sdf = (
    df.loc[:, ['age', 'sex', 'pclass', 'embark_town', 'survived']]
    .dropna()
)
sdf

Unnamed: 0,age,sex,pclass,embark_town,survived
0,22.0,male,3,Southampton,0
1,38.0,female,1,Cherbourg,1
2,26.0,female,3,Southampton,1
3,35.0,female,1,Southampton,1
4,35.0,male,3,Southampton,0
...,...,...,...,...,...
885,39.0,female,3,Queenstown,0
886,27.0,male,2,Southampton,0
887,19.0,female,1,Southampton,1
889,26.0,male,1,Cherbourg,1


In [132]:
# Count the missing values remaining in each column of the cleaned frame.
sdf.isna().sum()

age            0
sex            0
pclass         0
embark_town    0
survived       0
dtype: int64

In [133]:
# Feature matrix: every column of the cleaned frame except the target.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
X = sdf.drop(columns=['survived'])
X

Unnamed: 0,age,sex,pclass,embark_town
0,22.0,male,3,Southampton
1,38.0,female,1,Cherbourg
2,26.0,female,3,Southampton
3,35.0,female,1,Southampton
4,35.0,male,3,Southampton
...,...,...,...,...
885,39.0,female,3,Queenstown
886,27.0,male,2,Southampton
887,19.0,female,1,Southampton
889,26.0,male,1,Cherbourg


In [134]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(712, 4)

In [135]:
# Target vector: the `survived` column of the cleaned frame.
y = sdf.loc[:, 'survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: survived, Length: 712, dtype: int64

In [136]:
# Length of the target vector; it should match the row count of X.
y.shape

(712,)

In [137]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

In [138]:
# Encoder for the categorical `sex` column.
ohe = OneHotEncoder()

# Token-count vectorizer for the `embark_town` text column.
vect = CountVectorizer()

# Route each column to its transformer. Columns that are not listed here
# (`age`, `pclass`) are not passed through under the default remainder setting.
ct = make_column_transformer(
    (ohe, ['sex']),          # list selector: the encoder receives a 2-D selection
    (vect, 'embark_town'),   # scalar selector: the vectorizer receives a 1-D column
)

In [139]:
from sklearn.linear_model import LogisticRegression

# Classifier used as the final step of the tabular pipeline; the seed is fixed
# so repeated runs give the same result.
clf = LogisticRegression(
    random_state=1,
    solver='liblinear',
)

In [140]:
from sklearn.pipeline import make_pipeline
# Chain the column transformer and the classifier into a single estimator.
tabular_pl = make_pipeline(ct, clf)

---

### ***GridSearchCV taking too long? Try RandomizedSearchCV with a small number of iterations.***
<br>***Make sure to specify a distribution (instead of a list of values) for continuous parameters!***

---

In [141]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [142]:
# Text-classification pipeline: token counts fed into a multinomial naive Bayes model.
# make_pipeline names the steps after the lowercased class names
# ('countvectorizer', 'multinomialnb'), which is what the search-parameter keys use.
# CountVectorizer consumes a 1-D collection of strings, so this pipeline must be
# fitted on a single text column rather than on a multi-column DataFrame.
text_pl = make_pipeline(CountVectorizer(), MultinomialNB())

In [None]:
from sklearn.model_selection import cross_val_score

# Baseline accuracy of the text pipeline, averaged over 5 folds.
# CountVectorizer needs a 1-D collection of strings. Passing the whole DataFrame
# makes it iterate over the column names instead of the rows, so the number of
# "documents" no longer matches the number of labels and the fit fails.
# Feed it the single text column instead.
cross_val_score(
    text_pl,
    X['embark_town'],
    y,
    cv=5,
    scoring='accuracy',
    error_score='raise',  # surface fitting errors immediately instead of scoring NaN
).mean()

In [None]:
# Specify the parameter values to search. Use a list for discrete parameters and a
# distribution for continuous ones, so the search can draw any value in the range.
import scipy as sp
from scipy.stats import uniform  # import explicitly: `import scipy` alone need not load `scipy.stats`

params = {}

# Discrete choices: each value is listed once so every option is sampled with
# equal probability (a repeated entry would silently bias the search).
params['countvectorizer__min_df'] = [1, 2, 3, 4]
params['countvectorizer__lowercase'] = [True, False]

# Continuous parameter: smoothing strength drawn uniformly from [0, 1].
params['multinomialnb__alpha'] = uniform(loc=0, scale=1)

In [None]:
# Try `n_iter` random combinations of the parameter values defined in `params`.
from sklearn.model_selection import RandomizedSearchCV

rand = RandomizedSearchCV(
    text_pl,
    params,
    n_iter=10,        # number of parameter combinations sampled; more is slower but more thorough
    cv=5,
    scoring='accuracy',
    random_state=1,   # fixed seed so the sampled combinations are reproducible
)

# The text pipeline starts with CountVectorizer, which needs a 1-D collection of
# strings. Fitting on the whole DataFrame would vectorize the column names rather
# than the rows and fail on a sample-count mismatch, so pass the text column only.
rand.fit(X['embark_town'], y)

In [None]:
# What was the best score found during the search?
rand.best_score_

In [None]:
# Which combination of parameters produced the best score?
rand.best_params_