In [178]:
#Basic Imports
from sklearn.model_selection import cross_validate, cross_val_score, RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df2012 = pd.read_stata('data/SETUPS2012/SETUPS2012.dta')
df2016 = pd.read_stata('data/SETUPS2016/SETUPS2016.dta')
df2020 = pd.read_stata('data/SETUPS2020/SETUPS2020.dta')

In [3]:
df2012.shape, df2016.shape, df2020.shape

((5914, 195), (3649, 204), (7453, 257))

In [4]:
df2012.drop(['CASEID','WEIGHT_FULL'], axis=1, inplace=True)
df2016.drop(['CASEID','WEIGHT'], axis=1, inplace=True)
df2020.drop(['CASEID','WEIGHT'], axis=1, inplace=True)

In [5]:
df2020 = df2020.loc[(df2020['A01'] == '1. Voted') & ((df2020['A02'] == '1. Joe Biden') | (df2020['A02'] == '2. Donald Trump'))]

In [6]:
df2012.shape, df2016.shape, df2020.shape

((5914, 193), (3649, 202), (6075, 255))

In [7]:
y = df2020['A02']
X = df2020.drop(['A02'], axis=1, errors = "ignore")

In [8]:
def get_columns(df):
    dictionary = {}
    import string
    alphabet = list(string.ascii_uppercase[0:26])
    for char in alphabet:
        dictionary[char] = []
        for num in list(range(df.shape[1])):
            if df.columns[num].startswith(char):
                dictionary[char].append(df.columns[num])            
        temp = dictionary.pop(char)
        if temp != []:
            dictionary[char] = temp
    return dictionary

In [9]:
_2012_dictionary = get_columns(df2012)
_2016_dictionary = get_columns(df2016)
_2020_dictionary = get_columns(df2020)

In [10]:
df2020.drop(_2020_dictionary['A'], axis=1, inplace=True, errors = "ignore")
df2020.drop(_2020_dictionary['D'], axis=1, inplace=True)
df2020.drop(_2020_dictionary['E'], axis=1, inplace=True)

In [11]:
_2020_dictionary.pop('A')
_2020_dictionary.pop('D')
_2020_dictionary.pop('E');

In [12]:
# import csv
# with open('test.csv', 'w') as f:
#     for key in _2020_dictionary.keys():
#         f.write("%s,%s\n"%(key,_2020_dictionary[key]))

In [13]:
# Survey_Subset = input("What section of the survey would you like to analyze? ")
Survey_Subset = 'C'

In [14]:
# categorical_columns = _2020_dictionary[Survey_Subset]
# numerical_columns = []

In [131]:
categorical_columns = list(df2020.columns)
numerical_columns = []

# Train Test Split

In [132]:
X = X[categorical_columns + numerical_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [17]:
y_train.value_counts(normalize=True)

1. Joe Biden                    0.5777
2. Donald Trump                 0.4223
9. NA                           0.0000
5. Other candidate {SPECIFY}    0.0000
4. Howie Hawkins                0.0000
3. Jo Jorgensen                 0.0000
Name: A02, dtype: float64

# Dummy Model

Using the uniform strategy for the dumy model should result in a roughly 50/50 chance of getting each of our two choices which is what we see. This will serve as our baseline to compare the following models against

In [18]:
dummy_clf = DummyClassifier(strategy = "uniform")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test, y_test)

0.4950625411454905

# Decision Tree

In [133]:
categorical_processing = OneHotEncoder(handle_unknown='ignore')
numerical_pipe = SimpleImputer(strategy="mean")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_processing, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

tree_pipe = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
tree_pipe.fit(X_train, y_train)

In [134]:
y_pred = tree_pipe.predict(X_train)
#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(tree_pipe, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  1.0
CV train accuracy: 0.9242730178905003


# Tree Second Iteration 

In [179]:
param_grid = {'classifier__criterion': ['gini', 'entropy', 'log_loss'],               
              'classifier__max_depth': [2, 4, 6, 8, 10, 12]
             }

gridsearch = GridSearchCV(estimator=tree_pipe,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5,
                          n_jobs = 3
                         )
# Fit the training data
gridsearch.fit(X_train, y_train)
# Print the accuracy on test set


In [181]:
print(f"Gridsearch score: ", gridsearch.score(X_train, y_train))
print(f"Gridsearch best params: ")
pprint(gridsearch.best_params_)

Gridsearch score:  0.979367866549605
Gridsearch best params: 
{'classifier__criterion': 'gini', 'classifier__max_depth': 8}


### Results
Gridsearch score:  0.979367866549605

Gridsearch best params: 
- 'classifier__criterion': 'gini'
- 'classifier__max_depth': 8

In [182]:
tree_pipe.set_params(classifier__criterion = gridsearch.best_params_['classifier__criterion'],
                     classifier__max_depth = gridsearch.best_params_['classifier__max_depth'],
                    )

tree_pipe.fit(X_train, y_train)


In [183]:
#Getting predictions from pipeline using training data
y_pred = tree_pipe.predict(X_train)

#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(tree_pipe, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  0.979367866549605
CV train accuracy: 0.9381018063820363


# Feature Importance

In [213]:
def get_feature_importances(pipe):
    feature_names = pipe[:-1].get_feature_names_out()
    feature_importances = pd.Series(pipe[-1].feature_importances_, index=feature_names).sort_values(ascending=True)
    importances = feature_importances.to_frame(name = 'importance').reset_index().rename(columns={"index": "feature"})
    importances['feature'] = importances['feature'].str.slice(0, 3)
    importances = importances.groupby('feature').sum()
    return importances

In [214]:
tree_importances = get_feature_importances(tree_pipe)
tree_importances.nlargest(10, columns= 'importance')

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
P28,0.661556
H05,0.109602
K11,0.039734
H04,0.030105
P27,0.018483
H02,0.017921
F26,0.014479
C02,0.013043
K09,0.011524
F27,0.008256


In [184]:
tree_importances.nsmallest(10, columns= 'importance')

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
B01,0.0
B02,0.0
B03,0.0
B04,0.0
B05,0.0
B06,0.0
B07,0.0
B08,0.0
B09,0.0
B10,0.0


As we can see the Tree doesnt take into account a number of feature. Lets see if we can get better results with a Random Forest

# Random Forest

In [33]:
categorical_processing = OneHotEncoder(handle_unknown='ignore')
numerical_pipe = SimpleImputer(strategy="mean")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_processing, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

forest_pipe = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
forest_pipe.fit(X_train, y_train)

In [20]:
#Getting predictions from pipeline using training data
y_pred = forest_pipe.predict(X_train)
#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(forest_pipe, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  1.0
CV train accuracy: 0.9613702890596414


Thats a pretty good score for an untuned model but it looks like we are slightly overfit. Lets see what we can do about that

# Tuning with RandomizedSearchCV and GridSearchCV

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'classifier__n_estimators': n_estimators,
               'classifier__max_depth': max_depth,
               'classifier__min_samples_split': min_samples_split,
               'classifier__min_samples_leaf': min_samples_leaf,
               'classifier__bootstrap': bootstrap}

In [185]:
forest = forest_pipe
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
forest_random = RandomizedSearchCV(estimator = forest,
                                   param_distributions = random_grid,
                                   n_iter = 100,
                                   cv = 3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs = -1
                                  )
# Fit the random search model
forest_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [23]:
forest_random.best_params_

{'classifier__n_estimators': 400,
 'classifier__min_samples_split': 5,
 'classifier__min_samples_leaf': 2,
 'classifier__max_depth': 90,
 'classifier__bootstrap': False}

### Results
- 'classifier__n_estimators': 400
- 'classifier__min_samples_split': 5
- 'classifier__min_samples_leaf': 2
- 'classifier__max_depth': 90
- 'classifier__bootstrap': False
 

In [47]:
param_grid = {'classifier__n_estimators': [200, 300, 400],
              'classifier__criterion': ['gini', 'entropy', 'log_loss'],               
              'classifier__max_depth': [70, 80, 90],
              'classifier__min_samples_split': [4, 5, 6],
              'classifier__min_samples_leaf': [2, 3, 4],
              'classifier__bootstrap': [False]
             }

gridsearch = GridSearchCV(estimator=forest_pipe,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5,
                          n_jobs = 3
                         )
# Fit the training data
gridsearch.fit(X_train, y_train)

In [48]:
# Print the accuracy on train set
print(f"Gridsearch score: ", gridsearch.score(X_train, y_train))
print(f"Gridsearch best params: ")
pprint(gridsearch.best_params_)

Gridsearch score:  0.9995610184372257
Gridsearch best params: 
{'classifier__bootstrap': False,
 'classifier__criterion': 'gini',
 'classifier__max_depth': 70,
 'classifier__min_samples_leaf': 2,
 'classifier__min_samples_split': 6,
 'classifier__n_estimators': 400}


### Results

Gridsearch score:  0.9995610184372257

Gridsearch best params: 
- 'classifier__bootstrap': False 
- 'classifier__criterion': 'gini' 
- 'classifier__max_depth': 70 
- 'classifier__min_samples_leaf': 2 
- 'classifier__min_samples_split': 6 
- 'classifier__n_estimators': 400

In [49]:
forest_pipe.set_params(classifier__n_estimators = gridsearch.best_params_['classifier__n_estimators'],
                       classifier__criterion = gridsearch.best_params_['classifier__criterion'],
                       classifier__max_depth = gridsearch.best_params_['classifier__max_depth'],
                       classifier__min_samples_leaf = gridsearch.best_params_['classifier__min_samples_leaf'],
                       classifier__min_samples_split = gridsearch.best_params_['classifier__min_samples_split'],
                       classifier__bootstrap = gridsearch.best_params_['classifier__bootstrap'],
                      )
forest_pipe.fit(X_train, y_train)

In [50]:
#Getting predictions from pipeline using training data
y_pred = forest_pipe.predict(X_train)
#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(forest_pipe, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  0.9995610184372257
CV train accuracy: 0.9655403258420474


Slight improvement in both directions but the model is still clearly overfit. The RandomizedSearchCV suggested that the model preformed best with a max depth of 90 which is high. Because we are worried about overfitting we can try to prune our tree by decreasing the max depth

# Hyperparameter Tuning Second Iteration

In [35]:
param_grid = {'classifier__n_estimators': [50, 100, 150, 200],
              'classifier__criterion': ['gini', 'entropy', 'log_loss'],               
              'classifier__max_depth': [4, 6, 8, 10, 12, 14],
              'classifier__bootstrap': [True, False]
             }

gridsearch = GridSearchCV(estimator=forest_pipe,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5,
                          n_jobs = 3
                         )
# Fit the training data
gridsearch.fit(X_train, y_train)
# Print the accuracy on test set


In [46]:
print(f"Gridsearch score: ", gridsearch.score(X_train, y_train))
print(f"Gridsearch best params: ")
pprint(gridsearch.best_params_)

Gridsearch score:  0.9877085162423178
Gridsearch best params: 
{'classifier__bootstrap': False,
 'classifier__criterion': 'entropy',
 'classifier__max_depth': 8,
 'classifier__n_estimators': 125}


### Results

Gridsearch score:  0.9877085162423178

Gridsearch best params: 
- 'classifier__bootstrap': False
- 'classifier__criterion': 'entropy'
- 'classifier__max_depth': 8
- 'classifier__n_estimators': 125

In [38]:
forest_pipe.set_params(classifier__n_estimators = gridsearch.best_params_['classifier__n_estimators'],
                       classifier__criterion = gridsearch.best_params_['classifier__criterion'],
                       classifier__max_depth = gridsearch.best_params_['classifier__max_depth'],
                       classifier__bootstrap = gridsearch.best_params_['classifier__bootstrap'],
                      )
forest_pipe.fit(X_train, y_train)

In [39]:
#Getting predictions from pipeline using training data
y_pred = forest_pipe.predict(X_train)
#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(forest_pipe, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  0.9958296751536435
CV train accuracy: 0.966637779960329


# Hyperparameter Tuning Third Iteration

In [40]:
param_grid = {'classifier__n_estimators': [25, 50, 75, 100, 125, 150],
              'classifier__criterion': ['gini', 'entropy', 'log_loss'],               
              'classifier__max_depth': [4, 6, 8, 10],
              'classifier__bootstrap': [True, False]
             }

gridsearch = GridSearchCV(estimator=forest_pipe,
                          param_grid=param_grid,
                          scoring='accuracy',
                          cv=5,
                          n_jobs = 3
                         )
# Fit the training data
gridsearch.fit(X_train, y_train)
# Print the accuracy on test set


In [41]:
print(f"Gridsearch score: ", gridsearch.score(X_train, y_train))
print(f"Gridsearch best params: ")
pprint(gridsearch.best_params_)

Gridsearch score:  0.9877085162423178
Gridsearch best params:  {'classifier__bootstrap': False, 'classifier__criterion': 'entropy', 'classifier__max_depth': 8, 'classifier__n_estimators': 125}


### Results

Gridsearch score:  0.9877085162423178

Gridsearch best params:
- 'classifier__bootstrap': False
- 'classifier__criterion': 'entropy' 
- 'classifier__max_depth': 8
- 'classifier__n_estimators': 125

In [186]:
# forest_pipe.set_params(classifier__n_estimators = gridsearch.best_params_['classifier__n_estimators'],
#                        classifier__criterion = gridsearch.best_params_['classifier__criterion'],
#                        classifier__max_depth = gridsearch.best_params_['classifier__max_depth'],
#                        classifier__bootstrap = gridsearch.best_params_['classifier__bootstrap'],
#                       )

forest_pipe.set_params(classifier__n_estimators = 125,
                       classifier__criterion = 'entropy',
                       classifier__max_depth = 8,
                       classifier__bootstrap = False,
                      )

forest_pipe.fit(X_train, y_train)

In [187]:
#Getting predictions from pipeline using training data
y_pred = forest_pipe.predict(X_train)
#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(forest_pipe, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  0.9839771729587358
CV train accuracy: 0.9637842548192654


# TruncatedSVD

In [127]:
forest_pipe_SVD = Pipeline(
    [
        ("preprocess", preprocessing),
        ("SVD", TruncatedSVD(n_components = 200)),
        ("classifier", RandomForestClassifier(max_depth = 12,
                                              n_estimators = 200,
                                              min_samples_split = 5,
                                              min_samples_leaf = 4,
                                              bootstrap = True,
                                              
                                             )
        )
    ]
)
forest_pipe_SVD.fit(X_train, y_train)

In [129]:
param_grid = {'classifier__n_estimators': [175, 200, 225],
              'classifier__max_depth': [8, 10, 12],
              'SVD__n_components' : [10, 100, 200, 300, 1000]
             }

gridsearch = GridSearchCV(estimator=forest_pipe_SVD, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs = 3)
# Fit the training data
gridsearch.fit(X_train, y_train)

In [130]:
gridsearch.best_params_

{'SVD__n_components': 10,
 'classifier__max_depth': 10,
 'classifier__n_estimators': 200}

### Results 

- 'SVD__n_components': 10
- 'classifier__max_depth': 10
- 'classifier__n_estimators': 200

In [None]:
forest_pipe.set_params(SVD__n_components = gridsearch.best_params_['classifier__bootstrap'],
                       classifier__max_depth = gridsearch.best_params_['classifier__max_depth'],
                       classifier__min_samples_leaf = gridsearch.best_params_['classifier__min_samples_leaf'],
                       classifier__min_samples_split = gridsearch.best_params_['classifier__min_samples_split'],
                       classifier__n_estimators = gridsearch.best_params_['classifier__n_estimators'])
forest_pipe.fit(X_train, y_train)

In [131]:
#Getting predictions from pipeline using training data
y_pred = forest_pipe_SVD.predict(X_train)
#Checking accuracy of predictions
print(f"Training data prediction accuracy: ", accuracy_score(y_train, y_pred))

#getting cross validation score for training data 
print(f"CV train accuracy: {cross_val_score(forest_pipe_SVD, X_train, y_train, cv=5, scoring = 'accuracy').mean()}")

Training data prediction accuracy:  0.9835381913959613
CV train accuracy: 0.9534685712635046


# Catagorical Analysis

In [175]:
categorical_processing = OneHotEncoder(handle_unknown='ignore')
numerical_pipe = SimpleImputer(strategy="mean")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_processing, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

forest_pipe_2 = Pipeline(
    [
        ("preprocess", preprocessing),
        ("classifier", RandomForestClassifier(bootstrap=False,
                                              criterion='entropy',
                                              max_depth=8,
                                              n_estimators=125,
                                              random_state=42
                                             )
        ),
    ]
)

In [176]:
for key in X_train_dict:
    categorical_columns = X_train_dict[key]
    X_train_subset = X_train[categorical_columns]
    forest_pipe_2.set_params(preprocess__transformers = [("cat", categorical_processing, categorical_columns)])
    forest_pipe_2.fit(X_train_subset, y_train)
    y_pred = forest_pipe_2.predict(X_train_subset)
    forest_scores_dict[key] = {'train score' : accuracy_score(y_train, y_pred), 
                               'cross validation score' : cross_val_score(forest_pipe_2, 
                                                                          X_train_subset, 
                                                                          y_train, 
                                                                          cv=5, 
                                                                          scoring = 'accuracy').mean()}

In [192]:
forest_importances = get_feature_importances(forest_pipe)
forest_importances.nlargest(10, columns= 'importance')

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
P28,0.066346
H05,0.060184
K09,0.060059
P29,0.051599
J10,0.049123
N01,0.045507
H03,0.044711
G01,0.041729
H02,0.039745
K11,0.039314


In [193]:
forest_importances.nsmallest(10, columns= 'importance')

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
B10,7e-06
B05,1.8e-05
B09,2.2e-05
B07,2.5e-05
C07,3.4e-05
C03,4.2e-05
B13,5.2e-05
B04,6.1e-05
B17,6.3e-05
R16,6.5e-05


As we can see the Random Forest takes into account all the features. Lets see how the model preforms with the top 10 features in terms of importance

In [231]:
most_important_features = list(forest_importances.nlargest(10, columns= 'importance').index)
most_important_features = ['H03', 'G01', 'H02', 'K11']

In [232]:
categorical_columns = most_important_features
X_train_subset = X_train[categorical_columns]
forest_pipe_2.set_params(preprocess__transformers = [("cat", categorical_processing, categorical_columns)])
forest_pipe_2.fit(X_train_subset, y_train)
y_pred = forest_pipe_2.predict(X_train_subset)
print(f'train score' , accuracy_score(y_train, y_pred))
print(f'cross validation score' , cross_val_score(forest_pipe_2,
                                           X_train_subset, 
                                           y_train, 
                                           cv=5, 
                                           scoring = 'accuracy').mean())

train score 0.9506145741878841
cross validation score 0.9413963352398559


In [221]:
most_important_features = list(get_feature_importances(forest_pipe).nlargest(5, columns= 'importance').index)

In [223]:
categorical_columns = most_important_features
X_train_subset = X_train[categorical_columns]
forest_pipe_2.set_params(preprocess__transformers = [("cat", categorical_processing, categorical_columns)])
forest_pipe_2.fit(X_train_subset, y_train)
y_pred = forest_pipe_2.predict(X_train_subset)
print(f'train score' , accuracy_score(y_train, y_pred))
print(f'cross validation score' , cross_val_score(forest_pipe_2,
                                           X_train_subset, 
                                           y_train, 
                                           cv=5, 
                                           scoring = 'accuracy').mean())

train score 0.9598331870061457
cross validation score 0.9479810599495446


In [233]:
y_pred = forest_pipe_2.predict(X_test[categorical_columns])
print(f'test score' , accuracy_score(y_test, y_pred))

test score 0.9447004608294931


In [227]:
most_important_features = list(get_feature_importances(forest_pipe).nlargest(2, columns= 'importance').index)

In [228]:
categorical_columns = most_important_features
X_train_subset = X_train[categorical_columns]
forest_pipe_2.set_params(preprocess__transformers = [("cat", categorical_processing, categorical_columns)])
forest_pipe_2.fit(X_train_subset, y_train)
y_pred = forest_pipe_2.predict(X_train_subset)
print(f'train score' , accuracy_score(y_train, y_pred))
print(f'cross validation score' , cross_val_score(forest_pipe_2,
                                           X_train_subset, 
                                           y_train, 
                                           cv=5, 
                                           scoring = 'accuracy').mean())

train score 0.9297629499561019
cross validation score 0.9249350049107401


In [234]:
y_pred = forest_pipe_2.predict(X_test[categorical_columns])
print(f'test score' , accuracy_score(y_test, y_pred))

test score 0.9447004608294931


# Decision Tree

In [45]:
OHE = OneHotEncoder(handle_unknown='ignore')

def analyze_each_catagory(data, target, dictionary):
    scores_dict = {}
    for key in dictionary.keys():
        categorical_columns = dictionary[key]
        numerical_columns = []
        data_subset = data[categorical_columns + numerical_columns]
        OHE = OneHotEncoder(handle_unknown='ignore')
        data_subset_encoded = OHE.fit_transform(data_subset)
        data_train, data_test, target_train, target_test = train_test_split(data_subset_encoded, target, random_state=42)
        tree_clf = DecisionTreeClassifier(max_depth=5) 
        tree_clf.fit(data_train, target_train)
        scores = cross_val_score(estimator=tree_clf, X=data_subset_encoded, y=target, cv=5, n_jobs=4)
        scores_dict[key] = scores
    return scores_dict

In [23]:
analyze_each_catagory(X_train, y_train, _2020_dictionary)

{'A': 0.9416168370982341,
 'B': 0.5807718046488339,
 'C': 0.8090444277543474,
 'D': 0.9683926473703469,
 'E': 0.9541291139484276,
 'F': 0.9086929728272383,
 'G': 0.8193579448071331,
 'H': 0.9326142950680764,
 'J': 0.8911322625994185,
 'K': 0.9102292641593005,
 'L': 0.7666796656845187,
 'M': 0.8779604059545131,
 'N': 0.8481134573535926,
 'P': 0.9262541645001636,
 'Q': 0.7890654187609528,
 'R': 0.7396814277736052}