# Working Notebook

__Phase 3 Project - Chicago Traffic Crash Classification__

### Business Understanding

### Data Understanding and Preparation

Load dependencies

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV


In [5]:
import warnings
warnings.filterwarnings(action='once')

Import data files

In [6]:
crashes = pd.read_csv('Chicago-Crashes/data/Traffic_Crashes_-_Crashes.csv', low_memory=False)
people = pd.read_csv('Chicago-Crashes/data/Traffic_Crashes_-_People.csv', low_memory=False)
#vehicles = pd.read_csv('Chicago-Crashes/data/Traffic_Crashes_-_Vehicles.csv', low_memory=False)

  and should_run_async(code)


FileNotFoundError: [Errno 2] No such file or directory: 'Chicago-Crashes/data/Traffic_Crashes_-_Crashes.csv'

In [7]:
crashes.info()

NameError: name 'crashes' is not defined

In [5]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1195747 entries, 0 to 1195746
Data columns (total 30 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   PERSON_ID              1195747 non-null  object 
 1   PERSON_TYPE            1195747 non-null  object 
 2   CRASH_RECORD_ID        1195747 non-null  object 
 3   RD_NO                  1185613 non-null  object 
 4   VEHICLE_ID             1172146 non-null  float64
 5   CRASH_DATE             1195747 non-null  object 
 6   SEAT_NO                244512 non-null   float64
 7   CITY                   879728 non-null   object 
 8   STATE                  890189 non-null   object 
 9   ZIPCODE                803192 non-null   object 
 10  SEX                    1177665 non-null  object 
 11  AGE                    852450 non-null   float64
 12  DRIVERS_LICENSE_STATE  705542 non-null   object 
 13  DRIVERS_LICENSE_CLASS  608027 non-null   object 
 14  SAFETY_EQUIPMENT  

Remove columns with 80% or more of null values.

In [6]:
df_list=[crashes, people]
for df in df_list:
    min_count =  int((20/100)*df.shape[0] + 1)
    df = df.dropna(axis=1, thresh=min_count, inplace=True)

Drop columns that will not be used

In [7]:
crashes_mod = crashes.drop(['CRASH_DATE','RD_NO','REPORT_TYPE', 'DATE_POLICE_NOTIFIED', 'STREET_NO', 
              'STREET_DIRECTION', 'STREET_NAME', 'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
             'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 
             'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN'], axis =1)

  and should_run_async(code)


In [8]:
people_mod = people.drop(['CITY', 'ZIPCODE','RD_NO'], axis =1)

In [9]:
people_mod.shape

(1195747, 19)

In [10]:
crashes_mod.shape

(541142, 26)

## First Simple Model

In [11]:
crashes.MOST_SEVERE_INJURY.value_counts()

NO INDICATION OF INJURY     468994
NONINCAPACITATING INJURY     39675
REPORTED, NOT EVIDENT        21763
INCAPACITATING INJURY         9040
FATAL                          547
Name: MOST_SEVERE_INJURY, dtype: int64

Set up target variable:
 - 0: NO INDICATION OF INJURY, NONINCAPACITATING INJURY, REPORTED, NOT EVIDENT
 - 1: INCAPACITATING INJURY, FATAL
 

In [12]:
crashes['TARGET']= crashes['MOST_SEVERE_INJURY'].map({'NO INDICATION OF INJURY': 0,
                                                      'NONINCAPACITATING INJURY': 0,
                                                      'REPORTED, NOT EVIDENT': 0,
                                                      'INCAPACITATING INJURY': 1,
                                                      'FATAL': 1})

In [13]:
crashes.TARGET.fillna(0, inplace=True)

In [14]:
crashes.TARGET.value_counts()

0.0    531555
1.0      9587
Name: TARGET, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X = crashes.drop(['MOST_SEVERE_INJURY','INJURIES_TOTAL', 'INJURIES_FATAL','INJURIES_INCAPACITATING',
                      'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
                      'INJURIES_NO_INDICATION', 'TARGET', 'CRASH_RECORD_ID', 'CRASH_DATE', 'LATITUDE', 
                      'LONGITUDE','LOCATION'], axis=1)
y = crashes['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
X_train_nums = X_train.select_dtypes(include=['float64', 'int64'])
X_train_cat = X_train.select_dtypes('object')

In [17]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('ss', StandardScaler())
])
                
categorical_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first',
                         sparse=False))
])

trans = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, X_train_nums.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns)
])

In [18]:
model_pipe = Pipeline(steps=[
    ('trans', trans),
    ('dc', DummyClassifier(strategy='most_frequent'))
])

In [19]:
#model_pipe.fit(X_train, y_train)

In [20]:
#model_pipe.score(X_train, y_train)

Add cross val score

### Merging Dataframes

From the people data, the target will be the injury classification so that the model will predict the severity of injuries.

In [21]:
people_mod.INJURY_CLASSIFICATION.value_counts()

NO INDICATION OF INJURY     1096283
NONINCAPACITATING INJURY      55296
REPORTED, NOT EVIDENT         32092
INCAPACITATING INJURY         10827
FATAL                           656
Name: INJURY_CLASSIFICATION, dtype: int64

To avoid too many repeated rows from crashes_dropped_df, we'll use the `CRASH_RECORD_ID` to only merge in rows from people_dropped_df where `DRIVER_TYPE == DRIVER`, then use the `VEHICLE_ID` to only merge in that driver's vehicle from vehicles_dropped_df.

This will allow us to still retain over 75% of the rows from people.

In [22]:
people_mod.PERSON_TYPE.value_counts(normalize=True)

DRIVER                 0.776163
PASSENGER              0.204485
PEDESTRIAN             0.011393
BICYCLE                0.006911
NON-MOTOR VEHICLE      0.000866
NON-CONTACT VEHICLE    0.000181
Name: PERSON_TYPE, dtype: float64

In [23]:
people_crashes_df = people_mod.merge(crashes_mod, on='CRASH_RECORD_ID', how='left')

In [24]:
people_crashes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1195747 entries, 0 to 1195746
Data columns (total 44 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   PERSON_ID                1195747 non-null  object 
 1   PERSON_TYPE              1195747 non-null  object 
 2   CRASH_RECORD_ID          1195747 non-null  object 
 3   VEHICLE_ID               1172146 non-null  float64
 4   CRASH_DATE               1195747 non-null  object 
 5   SEAT_NO                  244512 non-null   float64
 6   STATE                    890189 non-null   object 
 7   SEX                      1177665 non-null  object 
 8   AGE                      852450 non-null   float64
 9   DRIVERS_LICENSE_STATE    705542 non-null   object 
 10  DRIVERS_LICENSE_CLASS    608027 non-null   object 
 11  SAFETY_EQUIPMENT         1192258 non-null  object 
 12  AIRBAG_DEPLOYED          1173141 non-null  object 
 13  EJECTION                 1181206 non-null 

In [25]:
people_crashes_df.INJURY_CLASSIFICATION.value_counts()

NO INDICATION OF INJURY     1096283
NONINCAPACITATING INJURY      55296
REPORTED, NOT EVIDENT         32092
INCAPACITATING INJURY         10827
FATAL                           656
Name: INJURY_CLASSIFICATION, dtype: int64

In [26]:
people_crashes_df['TARGET'] = people_crashes_df['INJURY_CLASSIFICATION'].map({np.nan: 0,
                                                      'NO INDICATION OF INJURY': 0,
                                                      'NONINCAPACITATING INJURY': 1,
                                                      'REPORTED, NOT EVIDENT': 1,
                                                      'INCAPACITATING INJURY': 2,
                                                      'FATAL': 3})

In [27]:
people_crashes_df['CRASH_YEAR']=pd.to_datetime(people_crashes_df['CRASH_DATE']).dt.year

In [28]:
people_crashes_df['CRASH_YEAR'].value_counts()

2018    265694
2019    263972
2020    202084
2017    185328
2021    161704
2016     96020
2015     20931
2014        11
2013         3
Name: CRASH_YEAR, dtype: int64

In [29]:
# Drop nulls from BEAT_OF_OCCURRENCE
people_crashes_df.dropna(subset = ['BEAT_OF_OCCURRENCE'], inplace=True)

# Drop ages < 0 and ages = 0 for DRIVER
people_crashes_df.loc[people_crashes_df.AGE < 0, 'AGE'] = np.nan
people_crashes_df.loc[(people_crashes_df.AGE == 0) & (people_crashes_df.PERSON_TYPE == 'DRIVER'), 'AGE'] = np.nan
people_crashes_df.dropna(subset = ['AGE'], inplace=True)

# Drop POSTED_SPEED_LIMIT = 0 or not divisible by 5
people_crashes_df.loc[people_crashes_df.POSTED_SPEED_LIMIT == 0, 'POSTED_SPEED_LIMIT'] = np.nan
people_crashes_df.loc[(people_crashes_df.POSTED_SPEED_LIMIT % 5) != 0, 'POSTED_SPEED_LIMIT'] = np.nan
people_crashes_df.dropna(subset = ['POSTED_SPEED_LIMIT'], inplace=True)

__Train Test Split__

In [30]:
from sklearn.model_selection import train_test_split
X = people_crashes_df.drop(['PERSON_ID', 'CRASH_RECORD_ID','LANE_CNT', 'VEHICLE_ID','CRASH_DATE', 'CRASH_TYPE', 'LATITUDE', 
                            'INJURY_CLASSIFICATION','DRIVERS_LICENSE_STATE','LONGITUDE','LOCATION', 'TARGET'], axis=1)
y = people_crashes_df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

  and should_run_async(code)


__Reduce the features based on iterations of models__

In [31]:
X_train_smaller = X_train[['PERSON_TYPE', 'SEX', 'SAFETY_EQUIPMENT', 'AGE', 'AIRBAG_DEPLOYED', 
                           'EJECTION', 'DRIVER_ACTION', 'SEAT_NO', 'LIGHTING_CONDITION', 'WEATHER_CONDITION', 
                           'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 
                           'BEAT_OF_OCCURRENCE', 'NUM_UNITS']]

  and should_run_async(code)


In [32]:
# X_test_smaller = X_test[['PERSON_TYPE', 'SEX', 'SAFETY_EQUIPMENT', 'AGE', 'AIRBAG_DEPLOYED', 
#                            'EJECTION', 'DRIVER_ACTION', 'SEAT_NO', 'LIGHTING_CONDITION', 'WEATHER_CONDITION', 
#                            'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 
#                            'BEAT_OF_OCCURRENCE', 'NUM_UNITS']]

In [33]:
# for i in range(len(X_train.columns)):
#     print(X_train[X_train.columns[i]].value_counts())
#     print(X_train[X_train.columns[i]].isna().sum())
#     print('--------------------')

__Create functions to clean the data__

In [34]:
from sklearn.preprocessing import FunctionTransformer

def seat_no_transformer(df):
    '''
    Set all SEAT_NO = 1 if PERSON_TYPE = 'DRIVER'
    and fill the rest with 0.
    '''
    df.loc[(df.PERSON_TYPE == 'DRIVER'), 'SEAT_NO'] = 1
    df['SEAT_NO'].fillna(value=0, inplace=True)
    return df

In [35]:
# def age_transformer(df):
#     df['AGE'][df.AGE < 0] = np.nan
#     df.loc[(df.AGE==0)&(df.PERSON_TYPE == 'DRIVER')]['AGE'] = np.nan
#     df.AGE.dropna(inplace=True)
#     return df

In [36]:
# def speed_transformer(df):
#     df['POSTED_SPEED_LIMIT'][df.POSTED_SPEED_LIMIT == 0] = np.nan
#     df['POSTED_SPEED_LIMIT'][(df.POSTED_SPEED_LIMIT %5) != 0] = np.nan
#     df.POSTED_SPEED_LIMIT.dropna(inplace=True)
#     return df

In [37]:
def beat_transformer(df):
    '''
    Extract and encode as a string the district
    from BEAT_OF_OCCURRENCE.
    '''
    df['BEAT_OF_OCCURRENCE'] = df['BEAT_OF_OCCURRENCE'].apply(str)
    df['BEAT_OF_OCCURRENCE'] = df['BEAT_OF_OCCURRENCE'].apply(lambda x: x[:-4])
    # df.drop('BEAT_OF_OCCURRENCE', axis=1, inplace=True)
    return df

In [38]:
def license_class_transformer(df):
    '''
    Bin all license classes into A, B, C, D, and OTHER
    '''
    dl_classes = ['A', 'B', 'C', 'D', np.nan]
    df.loc[df['DRIVERS_LICENSE_CLASS'].isin(dl_classes) == False, 'DRIVERS_LICENSE_CLASS'] = 'OTHER'
    return df

In [39]:
def to_float_transformer(df):
    '''
    Ensure all columns are float, not int.
    '''
    for col in [df.select_dtypes('int64').columns]:
        df[col] = df[col].astype('float64')
    return df

In [40]:
def fill_categories(df):
    '''
    Fill null values with given value for 
    unknown values in that column.
    Drop all remaining nulls.
    '''
    df.fillna({#'STATE':'XX', 
               # 'DRIVERS_LICENSE_STATE':'XX', # This col now dropped before train-test-split
               #'DRIVERS_LICENSE_CLASS': 'D', # Most common; D = 'normal' drivers license for cars
               'EJECTION': 'UNKNOWN',
               'DRIVER_ACTION': 'UNKNOWN',
               #'DRIVER_VISION': 'UNKNOWN',
               #'PHYSICAL_CONDITION': 'UNKNOWN',
               'SAFETY_EQUIPMENT': 'USAGE_UNKNOWN',
               'AIRBAG_DEPLOYED': 'DEPLOYMENT_UNKNOWN',
               'SEX': 'UNKNOWN',
               #'INTERSECTION_RELATED_I': 'N',
               #'HIT_AND_RUN_I': 'N',
               #'BAC_RESULT': 'TEST NOT OFFERED'
              }, inplace=True)
    return df

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

ohe_cols = list(X_train_smaller.select_dtypes('object').columns)
ohe_cols.append('BEAT_OF_OCCURRENCE')


# OneHotEncode the columns as part of cleaning
# to avoid needing to separate numerical and categorical
# columns later
ohe_col_trans = ColumnTransformer(transformers=
                                     [('ohe', OneHotEncoder(drop='first', sparse=False), 
                                       ohe_cols)],
                                 remainder='passthrough')

# Apply all our cleaning functions, then finish with ohe
cleaning_pipeline = Pipeline(steps=[
    ('seat_no', FunctionTransformer(seat_no_transformer)),
#     ('age', FunctionTransformer(age_transformer)),
    ('beat', FunctionTransformer(beat_transformer)),
#     ('speed', FunctionTransformer(speed_limit_transformer)),
#     ('license', FunctionTransformer(license_class_transformer)),
    ('fill_cat', FunctionTransformer(fill_categories)),
    ('float', FunctionTransformer(to_float_transformer)),
    ('col_trains', ohe_col_trans)
])

__Fit the cleaning pipeline to prepare data for a model__

In [42]:
#X_train_clean = cleaning_pipeline.fit_transform(X_train)
#X_test_clean = cleaning_pipeline.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BEAT_OF_OCCURRENCE'] = df['BEAT_OF_OCCURRENCE'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documenta

In [43]:
X_train_smaller_clean = cleaning_pipeline.fit_transform(X_train_smaller)
#X_test_smaller_clean = cleaning_pipeline.transform(X_test)

  and should_run_async(code)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BEAT_OF_OCCURRENCE'] = df['BEAT_OF_OCCURRENCE'].apply(str)


__Use SMOTE to resample to deal with the class imbalance__

In [44]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_smaller_clean, y_train)

  and should_run_async(code)


__LOGISTIC REGRESSION - ITERATION 1__

In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

lr_pipe1 = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(class_weight='balanced'))
])

In [46]:
from sklearn.model_selection import cross_validate

def print_cv_scores(pipe, X, y):
    
    # we pass in pipe to cross validate along with a feature list.
    results = cross_validate(pipe, X, 
                                   y, 
                                   return_train_score=True)
    
    print(results['train_score'])
    print(results['train_score'].mean())
    print('##############')
    print(results['test_score'])
    print(results['test_score'].mean())

In [47]:
print_cv_scores(lr_pipe1, X_train_smaller_clean, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[0.74748576 0.74876755 0.74943934 0.74928585 0.74927638]
0.7488509759823703
##############
[0.74679255 0.75107447 0.74639386 0.74685634 0.74993222]
0.7482098857870877


In [53]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import plot_confusion_matrix, classification_report, precision_recall_fscore_support

y_pred = cross_val_predict(lr_pipe1, X_train_smaller_clean, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [55]:
confusion_matrix(y_train, y_pred, normalize='true')

  and should_run_async(code)


array([[0.79395828, 0.14937041, 0.03379528, 0.02287603],
       [0.29256945, 0.3720424 , 0.22806726, 0.10732088],
       [0.15588433, 0.24761264, 0.35722932, 0.23927371],
       [0.03791469, 0.05687204, 0.22274882, 0.68246445]])

In [56]:
precision_recall_fscore_support(y_train, y_pred, average='macro')

  and should_run_async(code)


(0.31395005064088854, 0.5514236147732384, 0.3215327917780622, None)

__LOGISTIC REGRESSION - ITERATION 2__

In [58]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

lr_pipe2 = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(class_weight='balanced',
                              max_iter=1000,
                              random_state=42,
                              C = 0.001))
])

In [59]:
print_cv_scores(lr_pipe2, X_train_smaller_clean, y_train)

[0.74921209 0.75071515 0.75098227 0.75160822 0.75120603]
0.7507447522616543
##############
[0.74898534 0.75282073 0.74794077 0.74941592 0.75196555]
0.7502256611709548


In [60]:
y_pred = cross_val_predict(lr_pipe2.steps[1][1], X_train_smaller_clean, y_train)

  and should_run_async(code)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentati

In [61]:
confusion_matrix(y_train, y_pred, normalize='true')

  and should_run_async(code)


array([[0.80079891, 0.14168365, 0.03083113, 0.02668632],
       [0.30413399, 0.34354646, 0.22964575, 0.1226738 ],
       [0.16785474, 0.22434432, 0.34418292, 0.26361802],
       [0.04976303, 0.05924171, 0.18246445, 0.70853081]])

In [62]:
precision_recall_fscore_support(y_train, y_pred, average='macro')

  and should_run_async(code)


(0.31244886950380907, 0.5492647724802594, 0.3191111100095438, None)

__LOGISTIC REGRESSION - ITERATION 3 - Using SMOTE__

In [1]:
lr_pipe3 = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('lr', LogisticRegression(class_weight='balanced',
                              max_iter=1000,
                              random_state=42,
                              C = 0.01))
])

NameError: name 'Pipeline' is not defined

In [None]:
print_cv_scores(lr_pipe3, X_res, y_res)

In [None]:
y_pred = cross_val_predict(lr_pipe3.steps[1][1], X_res, y_train)

In [None]:
confusion_matrix(y_res, y_pred, normalize='true')

In [None]:
precision_recall_fscore_support(y_res, y_pred, average='macro')

In [None]:
# def print_logreg_coefs(cleaning_pipeline, model_pipeline, thresh=0.05, return_list=False):
#     ohe_feature_names = cleaning_pipeline.steps[-1][1].get_feature_names()
#     lr_coefs = model_pipeline.steps[1][1].coef_
#     result = []
    
#     for i in range(len(lr_coefs)):
#         print('Coefs for features in class ', i)
#         for j in range(len(ohe_feature_names)):
#             if abs(lr_coefs[i][j] >= thresh):
#                 print(ohe_feature_names[j], ': ', lr_coefs[i][j])
#                 result.append(ohe_feature_names[j], lr_coefs[i][j])
#         print('='*30, '\n')
        
#     if return_list:
#         return result
#     else:
#         return

In [None]:
# print_logreg_coefs(cleaning_pipeline, model_pipe, thresh=0.05, return_list=False)

In [None]:
# model_pipe = Pipeline(steps=[
#     ('ss', StandardScaler()),
#     ('lr', LogisticRegression(random_state=42))
# ])

In [None]:
# pipe_grid = {
#     'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'lr__penalty': ['l1', 'l2'],
#     'lr__max_iter': list(range(100,800,100)),
#     'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'lr__class_weight': ['balanced']
# }
# gs_pipe = GridSearchCV(estimator=model_pipe, param_grid=pipe_grid)

In [None]:
# gs_pipe.fit(X_train_clean, y_train)

In [None]:
# gs_pipe.best_params_
# print('Mean Accuracy: %.3f' % gs_pipe.best_score_)
# print('Config: %s' % gs_pipe.best_params_)

__RANDOM FOREST - ITERATION 1__

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
    ('rf', RandomForestClassifier(max_depth=4))
])

In [None]:
print_cv_scores(rf_pipe, X_res, y_res)

In [None]:
y_pred = cross_val_predict(rf_pipe.steps[1][1], X_res, y_res)

In [None]:
confusion_matrix(y_res, y_pred, normalize='true')

In [None]:
precision_recall_fscore_support(y_res, y_pred, average='macro')

In [None]:
# rf_grid = {
#     'rf__n_estimators': [50, 100, 200],
#     'rf__max_depth': [4, 5, 6],
# }
# gs_pipe_rf = GridSearchCV(estimator=rf_pipe, param_grid=rf_grid)