In [806]:
#libraries
import pandas as pd
import numpy as np
from plotnine import ggplot, aes, geom_point, geom_line
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [807]:
#data
train = pd.read_csv('data/CAH-201803-train.csv')

In [808]:
#check if data is balanced
#get counts of each class type
type_counts = train['political_affiliation'].value_counts()
print(type_counts)

#very balances

political_affiliation
Democrat       59
Independent    56
Republican     54
Name: count, dtype: int64
political_affiliation
Democrat       59
Independent    56
Republican     54
Name: count, dtype: int64


In [809]:
#target variable: political_affiliation
# options: Independent, Democrat Republican

**Column Transformers**

In [810]:
#column transformer
ct = ColumnTransformer(
  [
    ("dummify", 
    #handle_unknown: ignore observations for unknown values in category variable
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

#get col names
X_1 = ct.fit_transform(X)
#X_1

In [811]:
#interaction
# ColumnTransformer for interactions only
ct_inter = ColumnTransformer(
    [
    ("interaction_1", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["standardize__Q2", "dummify__Q9_No"]),
    #("interaction_2", PolynomialFeatures(degree=2, interaction_only = True, include_bias=False), ["dummify__Q10_No", "standardize__Q17"]),

    ],
    remainder="passthrough"
)


**PipeLines**

In [812]:
#LDA
lr_pipeline_linear_disc = Pipeline(
  [("preprocessing", ct),
  #("interaction", ct_inter),
  ("linear_disc_analysis", LinearDiscriminantAnalysis(solver = "lsqr", shrinkage = "auto"))]
).set_output(transform="pandas")

In [813]:
#LQA
lr_pipeline_quad_disc = Pipeline(
  [("preprocessing", ct),
  #("interaction", ct_inter),
  ("quad_disc_analysis", QuadraticDiscriminantAnalysis(reg_param = 1))]
).set_output(transform="pandas")

In [814]:
#SVC
lr_pipeline_support_vector_class = Pipeline(
  [("preprocessing", ct),
  #("interaction", ct_inter),
  #use kernel = 'linear' like from practice activity, still want to tune C
  ("support_vector_class", SVC(C = 0.1, kernel = 'linear'))]
).set_output(transform="pandas")

In [815]:
#SVM
lr_pipeline_support_vector_machine = Pipeline(
  [("preprocessing", ct),
  #("interaction", ct_inter),
  #use kernel = 'linear' like from practice activity, still want to tune C
  ("support_vector_machine", SVC(C = 1, kernel = 'rbf'))]
).set_output(transform="pandas")

In [816]:
#knn
knn_pipeline = Pipeline([
    ("preprocessing", ct),
    #("interaction", ct_inter),
    ("knn_classifier", KNeighborsClassifier(weights="uniform"))
]).set_output(transform = "pandas")

**Model Testing**

In [817]:
#variable selection
X = train.drop(["political_affiliation", "Q5", "Q7", "Q12", "Q14", "Q16", "Q15", "Q13"], axis = 1)
y = train["political_affiliation"]

In [818]:
#LDA
parameters = {
    "linear_disc_analysis__solver": ["svd", "lsqr", "eigen"],
    "linear_disc_analysis__shrinkage": [None, "auto", 0.1, 0.5, 0.9]}


gscv = GridSearchCV(lr_pipeline_linear_disc, parameters, cv = 5, scoring='f1_macro')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False).head()

25 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/base.py",

Unnamed: 0,linear_disc_analysis__shrinkage,linear_disc_analysis__solver,scores
10,0.5,lsqr,0.624279
11,0.5,eigen,0.624279
4,auto,lsqr,0.619765
5,auto,eigen,0.619765
1,,lsqr,0.614591


In [819]:
#LQA
parameters = {
    "quad_disc_analysis__reg_param": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}


gscv = GridSearchCV(lr_pipeline_quad_disc, parameters, cv = 5, scoring='f1_macro')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False).head()









Unnamed: 0,quad_disc_analysis__reg_param,scores
2,0.2,0.572959
4,0.4,0.568103
1,0.1,0.567455
3,0.3,0.55803
5,0.5,0.554586


In [820]:
#tune c
cs = {"support_vector_class__C": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_support_vector_class, cs, cv = 5, scoring='f1_macro')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

Unnamed: 0,support_vector_class__C,scores
5,100.0,0.634547
4,10.0,0.618004
3,1.0,0.600145
2,0.1,0.577535
1,0.01,0.546142
0,0.001,0.172496


In [821]:
#SVM
cs = {"support_vector_machine__C": [.001, .01, .1, 1, 10, 100]}

gscv = GridSearchCV(lr_pipeline_support_vector_machine, cs, cv = 5, scoring='f1_macro')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False)

Unnamed: 0,support_vector_machine__C,scores
4,10.0,0.643677
5,100.0,0.639855
3,1.0,0.617877
0,0.001,0.172496
1,0.01,0.172496
2,0.1,0.172496


In [822]:
#KNN
ks = {"knn_classifier__n_neighbors": list(range(1, 26))}

gscv = GridSearchCV(knn_pipeline, ks, cv = 5, scoring='f1_macro')

gscv_fitted = gscv.fit(X, y)

params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

results_df = params_df.assign(scores=gscv_fitted.cv_results_['mean_test_score'])

results_df.sort_values(by = 'scores', ascending = False).head()

Unnamed: 0,knn_classifier__n_neighbors,scores
3,4,0.596128
9,10,0.593243
0,1,0.585612
10,11,0.578296
13,14,0.571349


**Prediction Submissions**

In [823]:
#test data
#test = pd.read_csv('data/CAH-201803-test.csv')

**test 1: using SVM with Drop Q5 + Q7 + Q12 + Q14 + Q16**

In [824]:
#variable selection
#X_test = test.drop(["Q5", "Q7", "Q12", "Q14", "Q16", "Q15", "Q13"], axis = 1)


#tuned SVM
lr_pipeline_support_vector_machine = Pipeline(
  [("preprocessing", ct),
  ("interaction", ct_inter),
  #use kernel = 'linear' like from practice activity, still want to tune C
  ("support_vector_machine", SVC(C = 10, kernel = 'rbf'))]
).set_output(transform="pandas")

#fit model
#lr_pipeline_support_vector_machine.fit(X, y)



# predict y's on X_test
final_predictions = pd.DataFrame(
    {"id_num": test['id_num'],
    "political_affiliation_predicted": lr_pipeline_support_vector_machine.predict(X_test)}
)

#export to csv
final_predictions.to_csv("classification_test_2.csv")

**test 2: using LDA**