# Cancel Culture - Classification Modeling Notebook

---

**Post-Cleaning Modeling Notebook**

---

# 📦 Imports

In [None]:
## Data Handling
import pandas as pd
import numpy as np

## Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

## SKLearn
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, \
                                AdaBoostClassifier,GradientBoostingClassifier 
from sklearn import set_config
set_config(display='diagram')

## Settings
%matplotlib inline
plt.style.use('seaborn-talk')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
pd.set_option('max_rows', 50)

## Personal functions
from bmc_functions import classification as clf

In [None]:
## Reload imported modules automatically before each cell runs, so edits to
## local helper modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# 👓 Reading the DataFrames

In [None]:
## Loading the unfiltered city reservation data (gzip-compressed pickle)
city_full = pd.read_pickle(
    './data/reservation_city_unfiltered.pickle', compression='gzip'
)
city_full.head(n=5)

In [None]:
## Loading the filtered city reservation data (gzip-compressed pickle)
city_fltrd = pd.read_pickle(
    './data/reservation_city_filtered.pickle', compression='gzip'
)
city_fltrd.head(n=5)

In [None]:
## Loading the unfiltered resort reservation data (gzip-compressed pickle)
resort_full = pd.read_pickle(
    './data/reservation_resort_unfiltered.pickle', compression='gzip'
)
resort_full.head(n=5)

In [None]:
## Loading the filtered resort reservation data (gzip-compressed pickle)
resort_fltrd = pd.read_pickle(
    './data/reservation_resort_filtered.pickle', compression='gzip'
)
resort_fltrd.head(n=5)

# Train/Test Split

In [None]:
## Name of the column the models will predict
target = 'is_canceled'

In [None]:
## Separating the target column from the remaining feature columns
y = city_full[target].copy()
X = city_full.drop(columns=[target]).copy()

In [None]:
## Sanity check: features and target should have the same number of rows
len(X) == len(y)

In [None]:
## 75/25 split, stratified on y so both splits keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.25,
    stratify=y,
    random_state=42,
)

In [None]:
## Names of the numeric feature columns, for preprocessing
num_cols = X_train.select_dtypes(include='number').columns.tolist()
num_cols

In [None]:
## Names of the categorical (object-dtype) feature columns, for preprocessing
cat_cols = X_train.select_dtypes('object').columns.tolist()
cat_cols

In [None]:
## Creating ColumnTransformer and sub-transformers for scaling and encoding

### --- Creating column pipelines --- ###

## OneHotEncoder's dense-output flag was renamed from `sparse` to
## `sparse_output` in scikit-learn 1.2 (the old name was removed in 1.4), so
## pick whichever keyword the installed version accepts.
ohe_dense_kwarg = ('sparse_output'
                   if 'sparse_output' in OneHotEncoder().get_params()
                   else 'sparse')

## Categorical columns: one-hot encode to a dense array; categories not seen
## during fitting are ignored at transform time instead of raising
cat_pipe = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore',
                                                 **{ohe_dense_kwarg: False}))])

## Numeric columns: standardize
num_pipe = Pipeline(steps=[('scaler', StandardScaler())])

## Instantiating the ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[('num', num_pipe, num_cols),
                  ('cat', cat_pipe, cat_cols)])

preprocessor

In [None]:
## Fitting feature preprocessor (on the training data only)
preprocessor.fit(X_train)

## Getting feature names from OHE
## `get_feature_names` was replaced by `get_feature_names_out` in
## scikit-learn 1.0 and removed in 1.2, so prefer the new method and fall
## back to the old one on older installs.
ohe_step = preprocessor.named_transformers_['cat'].named_steps['ohe']
if hasattr(ohe_step, 'get_feature_names_out'):
    ohe_cat_names = ohe_step.get_feature_names_out(cat_cols)
else:
    ohe_cat_names = ohe_step.get_feature_names(cat_cols)

## Generating list for column index
## (numeric columns first, then encoded columns - the same order as the
## transformers listed in the ColumnTransformer)
final_cols = [*num_cols, *ohe_cat_names]

final_cols

In [None]:
## Applying the fitted preprocessor to both splits, then wrapping the results
## as DataFrames that keep each split's original index

X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)

X_train_tf_df = pd.DataFrame(data=X_train_tf, index=X_train.index,
                             columns=final_cols)
X_test_tf_df = pd.DataFrame(data=X_test_tf, index=X_test.index,
                            columns=final_cols)

display(X_train_tf_df.head(n=5), X_test_tf_df.head(n=5))

# 📊 **Baseline Model**

In [None]:
## Baseline: a seeded dummy classifier using the 'stratified' strategy

base = DummyClassifier(random_state=42, strategy='stratified')
base.fit(X_train_tf_df, y_train)

## Evaluating the baseline on both splits with the shared helper
clf.evaluate_classification(
    base,
    X_train=X_train_tf_df, y_train=y_train,
    X_test=X_test_tf_df, y_test=y_test,
    metric='accuracy',
)

# `sklearn.metrics.get_scorer()`

In [None]:
## Baseline predictions on both splits, used to try out the metrics functions
y_hat_train, y_hat_test = (base.predict(features)
                           for features in (X_train_tf_df, X_test_tf_df))

In [None]:
## Looking up the scorer registered under the name 'recall_macro'
score_function = metrics.get_scorer(
    'recall_macro'
)
score_function

In [None]:
## Testing the score function on the training data
## A scorer is called as scorer(estimator, X, y_true) and makes its own
## predictions from X, so pass the features and the true labels
## (not the true labels and the predictions).
score_function(base, X_train_tf_df, y_train)

In [None]:
## Testing the score function on the test data
## A scorer is called as scorer(estimator, X, y_true) and makes its own
## predictions from X, so pass the features and the true labels
## (not the true labels and the predictions).
score_function(base, X_test_tf_df, y_test)

---

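Calling the score function as `score_function(estimator, y_true, y_pred)` returned 1.0, which must be an error: scorers returned by `get_scorer` expect `scorer(estimator, X, y_true)` and compute the predictions themselves.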

Next steps: create a DataFrame from the classification report; add a blank column (then use `.T` to turn it into a row); then remove the duplicated accuracy values under precision/recall.

---

In [None]:
## Classification report for the test predictions, as a dict -> DataFrame
cr_df = pd.DataFrame(
    metrics.classification_report(y_true=y_test, y_pred=y_hat_test,
                                  output_dict=True)
)

## Inserting a blank spacer column at position 2
cr_df.insert(loc=2, column=" ", value=" ")

cr_df

In [None]:
# ## Replacing dupe values in accuracy to match report function
# ## NOTE(review): kept disabled - as written, the chained assignment binds
# ## cr_df to the string ' ' first, so the `.T.loc[...]` target would then be
# ## evaluated on a string and fail. Assign to the transposed frame in a
# ## separate statement if this is revived.
# cr_df = cr_df.T.loc['accuracy',['precision', 'recall']] = ' '

In [None]:
# cr_df