In [2]:
import numpy as np
import pandas as pd

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor


In [3]:
# Read the cleaned dataset from disk, parsing the three date columns
# into datetimes while loading.
df = pd.read_csv(
    'cleaned_data.csv',
    parse_dates=['birthday', 'start_date', 'end_date'],
)

df


Unnamed: 0,full_name,pct,birthday,gender,party,senate_class,stage,cycle,start_date,end_date
0,Raphael G. Warnock,51.1,1969-07-23,M,Democrat,3.0,runoff,2022,2022-12-03,2022-12-05
1,Raphael G. Warnock,51.0,1969-07-23,M,Democrat,3.0,runoff,2022,2022-12-01,2022-12-05
2,Raphael G. Warnock,50.5,1969-07-23,M,Democrat,3.0,runoff,2022,2022-12-04,2022-12-04
3,Raphael G. Warnock,50.0,1969-07-23,M,Democrat,3.0,runoff,2022,2022-12-04,2022-12-04
4,Raphael G. Warnock,52.2,1969-07-23,M,Democrat,3.0,runoff,2022,2022-12-04,2022-12-04
...,...,...,...,...,...,...,...,...,...,...
3097,Benjamin L. Cardin,49.4,1943-10-05,M,Democrat,1.0,general,2018,2018-10-01,2018-10-06
3098,Benjamin L. Cardin,56.0,1943-10-05,M,Democrat,1.0,general,2018,2018-09-11,2018-09-16
3099,Thomas R. Carper,61.0,1947-01-23,M,Democrat,1.0,general,2018,2018-09-11,2018-09-17
3100,Thomas R. Carper,60.0,1947-01-23,M,Democrat,1.0,general,2018,2018-09-11,2018-09-17


In [1]:
# Calculate age from birthday as (current calendar year - birth year).
# `pd.datetime` was removed in pandas 2.0, so the current year comes from
# pd.Timestamp, and the subtraction is vectorised through the .dt accessor
# instead of a row-by-row apply.
# NOTE(review): this is the age at the time the notebook is run, not at the
# time of each poll, so the value drifts from year to year -- confirm that
# this is the intended definition.
df['age'] = pd.Timestamp.now().year - df['birthday'].dt.year


NameError: name 'df' is not defined

In [None]:
# Plot histogram of maximum age by full_name (one value per distinct name).
# Series.hist() returns the matplotlib Axes, which is used to label the
# figure so it can be read on its own.
ax = df.groupby('full_name')['age'].max().hist()
ax.set_xlabel('Maximum age')
ax.set_ylabel('Number of names')
ax.set_title('Maximum age by full_name');  # trailing ';' hides the repr

In [5]:

# Generate random emotion labels and valence values as placeholders.
# A seeded generator is used so that the placeholder columns -- and every
# split, scaler and model score derived from them -- are identical on each
# Restart & Run All.
rng = np.random.default_rng(0)

emotions = ['engagement', 'happiness', 'anticipation', 'excitement', 'confidence', 'pleasure', 'peace', 'disconnection', 'affection', 'esteem', 'sympathy']
df['top_emotion'] = rng.choice(emotions, size=len(df))

# Integers drawn uniformly from 0..9 (the upper bound 10 is exclusive).
df['valence'] = rng.integers(0, 10, size=len(df))



peace            305
sympathy         301
happiness        291
excitement       285
affection        282
esteem           281
disconnection    280
anticipation     280
pleasure         276
confidence       266
engagement       255
Name: top_emotion, dtype: int64

In [None]:
# Tally how many rows carry each top_emotion label and show the result.
emotion_counts = df['top_emotion'].value_counts()
emotion_counts

In [6]:
## Final Feature Selection
# Column groups for the final feature selection: the target, the
# date-related columns, the categorical columns and the numeric columns.

# Target column
target = ['pct']

# Date-related columns. Is cycle really "date" or categorical?
date_features = ['start_date', 'end_date']

# Categorical features
categorical_features = [
    'top_emotion',
    'gender',
    'party',
    'senate_class',
    'stage',
]

# Numeric features
numeric_features = ['age', 'valence']


In [7]:
### Train/Test Split

# Hold out 20% of the rows for testing. A fixed random_state makes the
# split -- and therefore every downstream score -- reproducible across
# kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=0)

# `target` is a list of column names, so y_train / y_test are
# single-column DataFrames rather than Series.
X_train = train.drop(columns=target)
y_train = train[target]

X_test = test.drop(columns=target)
y_test = test[target]

In [8]:
# Normalize Numeric Features
# StandardScaler is imported in the notebook's import cell.
# The scaler is fitted on the training rows only and the fitted scaler is
# then applied to the test rows, so the test set does not influence the
# scaling.
scaler = StandardScaler()

X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [9]:
# Encode Categorical Features
# OneHotEncoder is imported in the notebook's import cell.
# handle_unknown='ignore' encodes a category that appears only in the test
# split as an all-zero group instead of raising during transform, which the
# default ('error') would do.
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on the training rows only, then reuse the learned categories for test.
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])
# NOTE(review): the modeling cells visible in this notebook use only
# numeric_features; these encoded matrices do not appear to be consumed yet.

In [16]:
# Show the numeric feature columns of the training set.
X_train.loc[:, numeric_features]

Unnamed: 0,age,valence
92,-0.795270,0.882174
198,-0.795270,1.236730
168,-0.795270,-0.181494
2807,0.837029,-1.245162
431,-0.795270,1.591287
...,...,...
1921,1.317117,-1.245162
1579,0.260923,0.882174
910,-0.891288,-0.181494
1330,-0.507218,0.173062


In [17]:
# Supervised Modeling
### Tree Methods
# Decision Tree (and Random Forest for ensembles week)
# Score a decision tree on the numeric training features with 10-fold
# cross-validation and show the per-fold scores.
regressor = DecisionTreeRegressor(random_state=0)
cv_scores = cross_val_score(
    regressor,
    X_train[numeric_features],
    y_train,
    cv=10,
)
cv_scores

array([ 0.11677552,  0.03348785,  0.11207798,  0.10991904,  0.01313548,
        0.17534876,  0.02662745,  0.11926174,  0.03476799, -0.03144767])

In [32]:
# KNN Regressor
# Build a 2-nearest-neighbour regressor and fit it on the numeric training
# features; fit() returns the estimator itself, so it can be chained.
neigh = KNeighborsRegressor(n_neighbors=2).fit(
    X_train[numeric_features],
    y_train,
)
neigh

[[48.1]]




In [None]:
# Predict using KNN for a given data point.
# The model was fitted on a DataFrame, so the query is wrapped in a
# DataFrame with the same column names. This avoids scikit-learn's
# "X does not have valid feature names" warning and makes explicit which
# value belongs to which feature.
# NOTE(review): the values are in the units of X_train[numeric_features]
# (standardized if the scaling cell has run), not raw age/valence.
query_point = pd.DataFrame([[1.5, 1.0]], columns=numeric_features)
print(neigh.predict(query_point))

# Unsupervised Modeling

In [22]:
# KMeans and Hierarchical Clustering

# Toy example on six hand-written 2-D points (not the polling data).
X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X)

# A notebook cell renders only its last expression, so the cluster
# assignments and the predictions for two new points are printed explicitly
# instead of being left as bare expressions whose values are discarded.
print("labels:", kmeans.labels_)
print("predictions:", kmeans.predict([[0, 0], [12, 3]]))

# Cell output: the coordinates of the two fitted cluster centres.
kmeans.cluster_centers_

array([[10.,  2.],
       [ 1.,  2.]])

In [23]:
# Hierarchical Clustering
# Toy example on six hand-written 2-D points: fit an agglomerative
# clustering with its default settings and show the label given to each point.
X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
clustering = AgglomerativeClustering()
clustering.fit(X)
clustering.labels_

array([1, 1, 1, 0, 0, 0])

# Conclusion
In our analysis, we explored the predictability of U.S. Senators' Job Approval Ratings (JAR) using a curated "Persona Profile" that integrates demographic information, party affiliation, professional roles, and emotion scores derived from photographic data. Through clustering techniques, we investigated logical groupings based on emotional characteristics, probing potential associations with party lines, gender, and age. Additionally, we examined the impact of major historical events on Senators' JARs, focusing on a specific significant event to discern its influence. Furthermore, we assessed the effect of Senate composition, particularly the majority party, on Senators' JARs by analyzing a specific time period, offering valuable insights into political dynamics and public perception. Our findings suggest that Senators' JARs are influenced by a combination of factors, including emotional characteristics, party affiliation, and historical events, underscoring the complexity of public opinion and the importance of understanding these dynamics in political analysis. Note that the emotion and valence values used in this notebook are randomly generated placeholders, so the emotion-related conclusions remain provisional until real emotion scores are substituted.