# CAH Political Affiliation Classification
## Final Exam - GSB 544

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

## Data Loading

In [2]:
# Folder holding the competition CSVs. Set the CAH_DATA_DIR environment
# variable to run this notebook on another machine; the fallback is the
# original author's local folder, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get(
    "CAH_DATA_DIR",
    r"C:\Users\spink\OneDrive\Desktop\Machine Learning\Data\gsb-544-fall-2025-classification (1)",
))
train_path = DATA_DIR / "CAH-201803-train.csv"
test_path = DATA_DIR / "CAH-201803-test.csv"

# Labelled training rows and the unlabelled rows to predict for submission.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Quick sanity check: (rows, columns) of each frame.
train.shape, test.shape

((169, 19), (166, 18))

In [3]:
# Peek at the first five training rows to see the column names and value formats.
train.head(n=5)

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [4]:
# Count rows per target class to check how balanced the labels are.
train.loc[:, 'political_affiliation'].value_counts()

political_affiliation
Democrat       59
Independent    56
Republican     54
Name: count, dtype: int64

## Data Prep

In [5]:
# Target to predict.
y_train = train['political_affiliation']

# Features: everything except the row identifier and (for train) the target.
X_train = train.drop(columns=['political_affiliation', 'id_num'])
X_test = test.drop(columns='id_num')

In [6]:
# Columns treated as numeric and standardized.
num_cols = ['Q2', 'Q15', 'Q16', 'Q17']

# Columns treated as categorical and one-hot encoded.
cat_cols = [
    'Q1',
    'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9',
    'Q10', 'Q11', 'Q12', 'Q13', 'Q14',
    'Q18',
]

numeric_scaler = StandardScaler()

# NOTE(review): per the scikit-learn OneHotEncoder docs, with drop='first' a
# category unseen during fit is encoded as all zeros, which is the same
# encoding as the dropped reference level. Confirm that is acceptable here.
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Shared preprocessing step reused by every model pipeline below.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_scaler, num_cols),
    ('cat', categorical_encoder, cat_cols),
])

## Model 1: SVC

In [7]:
# Linear-kernel SVM on top of the shared preprocessing step.
svc_pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('svc', SVC(kernel='linear', random_state=42)),
])

# Only the regularization strength C is tuned, scored by 5-fold CV accuracy.
param_grid_svc = {'svc__C': [0.1, 1, 10]}
grid_svc = GridSearchCV(
    estimator=svc_pipe,
    param_grid=param_grid_svc,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Best C and its mean cross-validated accuracy.
(grid_svc.best_params_, grid_svc.best_score_)

({'svc__C': 10}, np.float64(0.5919786096256684))

## Model 2: LDA

In [8]:
# Linear discriminant analysis on top of the shared preprocessing step.
lda_pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('lda', LinearDiscriminantAnalysis()),
])

# No hyperparameters are tuned for LDA, so a plain 5-fold CV accuracy is used.
lda_scores = cross_val_score(
    estimator=lda_pipe,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

# Mean accuracy across the five folds.
lda_scores.mean()

np.float64(0.5980392156862744)

## Model 3: Logistic Regression

In [9]:
# Logistic regression on top of the shared preprocessing step; max_iter is
# raised to 1000 for the solver.
lr_pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
])

# Tune the inverse regularization strength C, scored by 5-fold CV accuracy.
param_grid_lr = {'lr__C': [0.01, 0.1, 1, 10]}
grid_lr = GridSearchCV(
    estimator=lr_pipe,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Best C and its mean cross-validated accuracy.
(grid_lr.best_params_, grid_lr.best_score_)

({'lr__C': 1}, np.float64(0.6213903743315508))

## Model Comparison

In [10]:
# Mean 5-fold CV accuracy per model, in the order the models were fitted.
cv_accuracy_by_model = {
    'SVC': grid_svc.best_score_,
    'LDA': lda_scores.mean(),
    'LogReg': grid_lr.best_score_,
}

# One row per model so the scores can be compared side by side.
results = pd.DataFrame({
    'Model': list(cv_accuracy_by_model.keys()),
    'CV Accuracy': list(cv_accuracy_by_model.values()),
})
results

Unnamed: 0,Model,CV Accuracy
0,SVC,0.591979
1,LDA,0.598039
2,LogReg,0.62139


Logistic Regression had the highest cross-validated accuracy (0.621, versus 0.598 for LDA and 0.592 for SVC), so I will use it for my final predictions.

## Final Predictions

In [11]:
# Use the tuned logistic-regression search object as the final model.
# GridSearchCV refits the best pipeline on the full training data by default
# (refit=True), so it can predict directly.
best_model = grid_lr

# Predicted political affiliation for every row of the held-out test set.
predictions = best_model.predict(X=X_test)

In [12]:
# Pair each test id with its predicted label, in the original test row order.
submission = test[['id_num']].assign(
    political_affiliation_predicted=predictions
)

# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Democrat
3,6,Republican
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat
