In [1]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Titanic Dataset

## Import dataset

In [6]:
!pip install xlrd



In [7]:
url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Drop Columns

In [8]:
df = df.drop(columns=["name","ticket","home.dest","boat", "body", "cabin",])
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


## Split Data

In [9]:
X = df.drop(columns="survived")
y = df.survived

## Preprocess data

In [11]:
num_col = X.select_dtypes(exclude=['object']).columns.tolist()
num_col

['pclass', 'age', 'sibsp', 'parch', 'fare']

In [13]:
cat_col = X.select_dtypes(include=['object']).columns.tolist()
cat_col

['sex', 'embarked']

In [14]:
X_train, X_test, y_train,y_test = train_test_split(X,y,
                                                  test_size=0.30,random_state=42)

# Pipeline

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import numpy as np
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

## Making a numeric preprocessor

In [17]:
numeric_preprocessor = Pipeline(
    steps=[
        ('Knn_imputer', KNNImputer(n_neighbors=3)),
        ('Scaler', StandardScaler())])

In [19]:
categorical_preprocessor = Pipeline(
    steps=[
        (('imputer', SimpleImputer(missing_values=np.nan,strategy='constant'))),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [24]:
preprocessor =ColumnTransformer(
                    [
                    ('categorical', categorical_preprocessor, cat_col),
                    ('numerical', numeric_preprocessor, num_col),
                    ])

In [25]:
pipe = make_pipeline(preprocessor, PCA(n_components=0.90),
                                        RandomForestClassifier())

In [26]:
pipe 

In [27]:
clf = pipe.fit(X,y)

In [28]:
clf.score(X,y)

0.9663865546218487

## Randomized Search CV

In [29]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
parameters = {'n_estimators':[100,200,300],
             'max_depth':[3,6,10],
             'criterion': ['gini','entropy'],
             'max_features': ['sqrt', 'log2'],
             'bootstrap': [False, True]}

In [30]:
parameters = {'randomforestclassifier__n_estimators':[100,200,300],
             'randomforestclassifier__max_depth':[3,6,10],
             'randomforestclassifier__max_features': ['sqrt', 'log2'],
             'randomforestclassifier__criterion': ['gini', 'entropy']}

In [31]:
grid_search= GridSearchCV(pipe, param_grid=parameters,cv=5)

In [32]:
grid_search

In [33]:
grid_search.fit(X,y)

In [35]:
grid_search.bests_score_

AttributeError: 'GridSearchCV' object has no attribute 'bests_score_'

In [36]:
grid_search.bests_params_

AttributeError: 'GridSearchCV' object has no attribute 'bests_params_'

In [37]:
grid_search.best_estimator_.named_steps['randomforestclassifier'].feature_importance_

AttributeError: 'RandomForestClassifier' object has no attribute 'feature_importance_'

In [38]:
grid_search.best_estimator_.named_steps['randomforestclassifier'].get_feature_names_out()

AttributeError: 'RandomForestClassifier' object has no attribute 'get_feature_names_out'

In [39]:
grid_search.best_estimator_.named_steps['pca'].components()

AttributeError: 'PCA' object has no attribute 'components'

In [40]:
grid_search.cv_results_

{'mean_fit_time': array([0.62630429, 1.15942335, 1.71905651, 0.62493677, 1.16645708,
        1.70923676, 0.88209682, 1.64750962, 2.43526759, 0.89154763,
        1.65864344, 2.44243717, 1.08224173, 2.08518376, 3.10975585,
        1.08341794, 2.28495493, 3.32537098, 0.67977371, 1.41578994,
        1.87807078, 0.70130072, 1.32052665, 1.87197051, 0.96383862,
        1.83655591, 2.75959911, 1.04157076, 1.92727137, 2.78023772,
        1.19598484, 2.44002881, 3.5567451 , 1.3356658 , 2.43063898,
        3.55629926]),
 'std_fit_time': array([0.01553942, 0.02799166, 0.01239427, 0.01360557, 0.02054675,
        0.01114466, 0.02438973, 0.01958423, 0.04281432, 0.02171493,
        0.02519567, 0.01917629, 0.0338382 , 0.02025939, 0.01909983,
        0.02242998, 0.11604142, 0.22810915, 0.02812125, 0.0940068 ,
        0.02615069, 0.02206813, 0.06680847, 0.01968327, 0.02669784,
        0.03761767, 0.13281992, 0.11874891, 0.07904821, 0.11989318,
        0.01411196, 0.17720002, 0.09970326, 0.1443339 , 0.098

# Reduce Dimensions

In [41]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [43]:
pipe2 = make_pipeline(preprocessor, KMeans(n_clusters=3),
                                        RandomForestClassifier())

In [46]:
grid_search2 =  RandomizedSearchCV(pipe2,parameters,
                                   cv=5).fit(X,y)

In [48]:
grid_search2.best_score_

0.5935918809043315

In [49]:
import pickle

In [50]:
filename = 'bestModel.sav'
pickle.dump(grid_search.best_estimator_, open(filename,'wb'))#'wb' means to write bites

## Randomized Search PCA