In [None]:
# load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from prince import CA
from scipy.stats import pearsonr
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

In [None]:
# read the tab-separated table of member votes (includes the topic label of each vote)
df_mvt = pd.read_csv(
    'df_member_votes_fv_with_topics_3.tsv',
    sep='\t',
)

In [None]:
# number of yea plus nay votes recorded for the bill
df_mvt['total_votes'] = df_mvt['yea_count'].add(df_mvt['nay_count'])

# size of the smaller of the two sides (yea vs. nay)
df_mvt['minority_vote'] = df_mvt[['yea_count', 'nay_count']].min(axis=1)

# the smaller side expressed as a percentage of all yea/nay votes
df_mvt['minority_percentage'] = df_mvt['minority_vote'].div(df_mvt['total_votes']).mul(100)

# keep only votes where the minority side reaches at least 2.5%
df_mvt = df_mvt.loc[df_mvt['minority_percentage'].ge(2.5)]

In [None]:
# one subset of the filtered votes per topic label
defense = df_mvt.loc[df_mvt['topic'].eq('defense and military')]
environment = df_mvt.loc[df_mvt['topic'].eq('environmental and natural resources')]
congress = df_mvt.loc[df_mvt['topic'].eq('government budget and administration')]
infrastructure = df_mvt.loc[df_mvt['topic'].eq('infrastructure and development')]
foreign = df_mvt.loc[df_mvt['topic'].eq('international relations and government')]
legislation = df_mvt.loc[df_mvt['topic'].eq('legislation and policy')]
social = df_mvt.loc[df_mvt['topic'].eq('social services and public welfare')]

In [None]:
# per topic keep only members with at least 20 recorded votes
# (members with fewer than 20 votes in that topic are dropped)
value_counts = defense['nameparty_id'].value_counts()
defense = defense.loc[defense['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

value_counts = environment['nameparty_id'].value_counts()
environment = environment.loc[environment['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

value_counts = congress['nameparty_id'].value_counts()
congress = congress.loc[congress['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

value_counts = infrastructure['nameparty_id'].value_counts()
infrastructure = infrastructure.loc[infrastructure['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

value_counts = foreign['nameparty_id'].value_counts()
foreign = foreign.loc[foreign['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

value_counts = legislation['nameparty_id'].value_counts()
legislation = legislation.loc[legislation['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

value_counts = social['nameparty_id'].value_counts()
social = social.loc[social['nameparty_id'].isin(value_counts.loc[value_counts.ge(20)].index)]

In [None]:
# create new column with yea vote being 1 and rest being 0
# (cast codes 1, 2 and 3 are treated as yea).
# `assign` returns a new frame instead of writing into a filtered slice of df_mvt,
# which avoids pandas' SettingWithCopyWarning; `isin` replaces the row-wise lambda
# with an equivalent vectorized membership test.
defense = defense.assign(yea_vote=defense['cast_code'].isin([1, 2, 3]).astype(int))
environment = environment.assign(yea_vote=environment['cast_code'].isin([1, 2, 3]).astype(int))
congress = congress.assign(yea_vote=congress['cast_code'].isin([1, 2, 3]).astype(int))
infrastructure = infrastructure.assign(yea_vote=infrastructure['cast_code'].isin([1, 2, 3]).astype(int))
foreign = foreign.assign(yea_vote=foreign['cast_code'].isin([1, 2, 3]).astype(int))
legislation = legislation.assign(yea_vote=legislation['cast_code'].isin([1, 2, 3]).astype(int))
social = social.assign(yea_vote=social['cast_code'].isin([1, 2, 3]).astype(int))

In [None]:
# member-by-bill count tables per topic: rows are nameparty_id, columns are bill_id
defense_matrix = pd.crosstab(index=defense['nameparty_id'], columns=defense['bill_id'])
environment_matrix = pd.crosstab(index=environment['nameparty_id'], columns=environment['bill_id'])
congress_matrix = pd.crosstab(index=congress['nameparty_id'], columns=congress['bill_id'])
infrastructure_matrix = pd.crosstab(index=infrastructure['nameparty_id'], columns=infrastructure['bill_id'])
foreign_matrix = pd.crosstab(index=foreign['nameparty_id'], columns=foreign['bill_id'])
legislation_matrix = pd.crosstab(index=legislation['nameparty_id'], columns=legislation['bill_id'])
social_matrix = pd.crosstab(index=social['nameparty_id'], columns=social['bill_id'])

# function that fills a member-by-bill matrix with 1 for a yea vote and 0 otherwise
def contingency_matrix(df, contingency_matrix):
    """Write the binary votes of ``df`` into ``contingency_matrix``.

    For every row of ``df`` the cell addressed by that row's ``nameparty_id``
    (matrix row) and ``bill_id`` (matrix column) is set to 1 when ``yea_vote``
    equals 1 and to 0 otherwise. Rows are processed in order, so for repeated
    (member, bill) pairs the last row wins. Cells without a matching row in
    ``df`` are left as they are.

    The matrix is modified in place and the same object is returned.
    """
    votes = zip(df['nameparty_id'], df['bill_id'], df['yea_vote'])
    for member, bill, flag in votes:
        contingency_matrix.at[member, bill] = 1 if flag == 1 else 0
    return contingency_matrix

# overwrite every topic matrix with the binary votes: 1 where the member voted yea on the bill, 0 otherwise
defense_matrix = contingency_matrix(df=defense, contingency_matrix=defense_matrix)
environment_matrix = contingency_matrix(df=environment, contingency_matrix=environment_matrix)
congress_matrix = contingency_matrix(df=congress, contingency_matrix=congress_matrix)
infrastructure_matrix = contingency_matrix(df=infrastructure, contingency_matrix=infrastructure_matrix)
foreign_matrix = contingency_matrix(df=foreign, contingency_matrix=foreign_matrix)
legislation_matrix = contingency_matrix(df=legislation, contingency_matrix=legislation_matrix)
social_matrix = contingency_matrix(df=social, contingency_matrix=social_matrix)

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = defense_matrix.ne(0).any(axis=1)
# sub-matrix holding only the members whose row is entirely zero
zero_rows = defense_matrix.loc[defense_matrix.eq(0).all(axis=1)]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = defense_matrix.ne(0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
d_matrix = defense_matrix.loc[non_zero_rows, non_zero_cols]

# seeded 20-component CA model for the defense topic, fitted on the reduced matrix
model_d = CA(n_components=20, random_state=41)
model_d.fit(d_matrix)

In [None]:
# eigenvalues of the fitted defense CA model
eigenvalues = model_d.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot - Defense')
ax.set_xlabel('Dimension')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the defense votes
intersection = set(defense['nameparty_id']) & set(zero_rows)
# defense votes without the members found above
filter_defense = defense.loc[~defense['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered defense votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
defense = filter_defense[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_d.row_coordinates(d_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = (environment_matrix != 0).any(axis=1)
# boolean mask of members (rows) whose entries are all zero
zero_rows = (environment_matrix == 0).all(axis=1)
# sub-matrix containing only the all-zero rows
zero_rows = environment_matrix.loc[zero_rows]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = (environment_matrix != 0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
e_matrix = environment_matrix.loc[non_zero_rows, non_zero_cols]

# create and fit model for environment topic with 20 components;
# seeded with the same random_state as the defense model so the decomposition
# (and the hard-coded sign handling downstream) is reproducible across runs
model_e = CA(n_components=20, random_state=41)
model_e.fit(e_matrix)

In [None]:
# eigenvalues of the fitted environment CA model
eigenvalues = model_e.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot - Environment')
ax.set_xlabel('Dimensions')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the environment votes
intersection = set(environment['nameparty_id']) & set(zero_rows)
# environment votes without the members found above
filter_environment = environment.loc[~environment['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered environment votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
environment = filter_environment[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_e.row_coordinates(e_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = (congress_matrix != 0).any(axis=1)
# boolean mask of members (rows) whose entries are all zero
zero_rows = (congress_matrix == 0).all(axis=1)
# sub-matrix containing only the all-zero rows
zero_rows = congress_matrix.loc[zero_rows]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = (congress_matrix != 0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
c_matrix = congress_matrix.loc[non_zero_rows, non_zero_cols]

# create and fit model for government budget topic with 20 components;
# seeded with the same random_state as the defense model so the decomposition
# (and the sign of its first dimension used downstream) is reproducible across runs
model_c = CA(n_components=20, random_state=41)
model_c.fit(c_matrix)

In [None]:
# eigenvalues of the fitted government budget CA model
eigenvalues = model_c.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot')
ax.set_xlabel('Dimensions')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the government budget votes
intersection = set(congress['nameparty_id']) & set(zero_rows)
# government budget votes without the members found above
filter_congress = congress.loc[~congress['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered government budget votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
congress = filter_congress[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_c.row_coordinates(c_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = (infrastructure_matrix != 0).any(axis=1)
# boolean mask of members (rows) whose entries are all zero
zero_rows = (infrastructure_matrix == 0).all(axis=1)
# sub-matrix containing only the all-zero rows
zero_rows = infrastructure_matrix.loc[zero_rows]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = (infrastructure_matrix != 0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
i_matrix = infrastructure_matrix.loc[non_zero_rows, non_zero_cols]

# create and fit model for infrastructure topic with 20 components;
# seeded with the same random_state as the defense model so the decomposition
# (and the hard-coded sign handling downstream) is reproducible across runs
model_i = CA(n_components=20, random_state=41)
model_i.fit(i_matrix)

In [None]:
# eigenvalues of the fitted infrastructure CA model
eigenvalues = model_i.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot')
ax.set_xlabel('Dimensions')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the infrastructure votes
intersection = set(infrastructure['nameparty_id']) & set(zero_rows)
# infrastructure votes without the members found above
filter_infra = infrastructure.loc[~infrastructure['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered infrastructure votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
infra = filter_infra[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_i.row_coordinates(i_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = (foreign_matrix != 0).any(axis=1)
# boolean mask of members (rows) whose entries are all zero
zero_rows = (foreign_matrix == 0).all(axis=1)
# sub-matrix containing only the all-zero rows
zero_rows = foreign_matrix.loc[zero_rows]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = (foreign_matrix != 0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
f_matrix = foreign_matrix.loc[non_zero_rows, non_zero_cols]

# create and fit model for international relations topic with 20 components;
# seeded with the same random_state as the defense model so the decomposition
# (and the sign of its first dimension used downstream) is reproducible across runs
model_f = CA(n_components=20, random_state=41)
model_f.fit(f_matrix)

In [None]:
# eigenvalues of the fitted international relations CA model
eigenvalues = model_f.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot')
ax.set_xlabel('Dimensions')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the international relations votes
intersection = set(foreign['nameparty_id']) & set(zero_rows)
# international relations votes without the members found above
filter_foreign = foreign.loc[~foreign['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered international relations votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
foreign = filter_foreign[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_f.row_coordinates(f_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = (legislation_matrix != 0).any(axis=1)
# boolean mask of members (rows) whose entries are all zero
zero_rows = (legislation_matrix == 0).all(axis=1)
# sub-matrix containing only the all-zero rows
zero_rows = legislation_matrix.loc[zero_rows]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = (legislation_matrix != 0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
l_matrix = legislation_matrix.loc[non_zero_rows, non_zero_cols]

# create and fit model for legislation topic with 20 components;
# seeded with the same random_state as the defense model so the decomposition
# (and the sign of its first dimension used downstream) is reproducible across runs
model_l = CA(n_components=20, random_state=41)
model_l.fit(l_matrix)

In [None]:
# eigenvalues of the fitted legislation CA model
eigenvalues = model_l.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot - Legislation')
ax.set_xlabel('Dimensions')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the legislation votes
intersection = set(legislation['nameparty_id']) & set(zero_rows)
# legislation votes without the members found above
filter_legis = legislation.loc[~legislation['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered legislation votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
legis = filter_legis[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_l.row_coordinates(l_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# boolean mask of members (rows) that have at least one non-zero entry
non_zero_rows = (social_matrix != 0).any(axis=1)
# boolean mask of members (rows) whose entries are all zero
zero_rows = (social_matrix == 0).all(axis=1)
# sub-matrix containing only the all-zero rows
zero_rows = social_matrix.loc[zero_rows]
# boolean mask of bills (columns) that have at least one non-zero entry
non_zero_cols = (social_matrix != 0).any(axis=0)

# drop all-zero rows and columns, as zero rows do not get accepted by Correspondence Analysis
s_matrix = social_matrix.loc[non_zero_rows, non_zero_cols]

# create and fit model for social services topic with 20 components;
# seeded with the same random_state as the defense model so the decomposition
# (and the hard-coded sign handling downstream) is reproducible across runs
model_s = CA(n_components=20, random_state=41)
model_s.fit(s_matrix)

In [None]:
# eigenvalues of the fitted social services CA model
eigenvalues = model_s.eigenvalues_

# scree plot: eigenvalue against dimension number
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(eigenvalues) + 1), eigenvalues, marker='o', linestyle='--')
ax.set_title('Scree Plot - Social Services')
ax.set_xlabel('Dimensions')
ax.set_ylabel('Eigenvalue')
ax.set_xticks(range(1, len(eigenvalues) + 1))
ax.grid(True)
plt.show()

In [None]:
# ids (index labels) of the members whose matrix row was entirely zero
zero_rows = zero_rows.index.tolist()
# the subset of those ids that occurs in the social services votes
intersection = set(social['nameparty_id']) & set(zero_rows)
# social services votes without the members found above
filter_social = social.loc[~social['nameparty_id'].isin(intersection)]

In [None]:
# reduce the filtered social services votes to the unique (party, nameparty_id) pairs, sorted by nameparty_id
cols = ['party', 'nameparty_id']
social = filter_social[cols].drop_duplicates().sort_values(by='nameparty_id')

# coordinates of the members (matrix rows) from the fitted CA model
row_coordinates = model_s.row_coordinates(s_matrix)

# first-dimension coordinate of every member
dim1_scores_names = row_coordinates[0]

In [None]:
# eigenvalues of the fitted CA model of every topic
eigenvalues_s = model_s.eigenvalues_
eigenvalues_f = model_f.eigenvalues_
eigenvalues_c = model_c.eigenvalues_
eigenvalues_d = model_d.eigenvalues_
eigenvalues_e = model_e.eigenvalues_
eigenvalues_i = model_i.eigenvalues_
eigenvalues_l = model_l.eigenvalues_

# eigenvalue collections in plotting order (must line up with `names` below)
eigenvalues_set = [
    eigenvalues_e,
    eigenvalues_i,
    eigenvalues_c,
    eigenvalues_d,
    eigenvalues_f,
    eigenvalues_l,
    eigenvalues_s,
]

# 4 x 2 grid of axes
fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(10, 13))

# panel titles, in the same order as eigenvalues_set
names = [
    'Environmental and Natural Resources',
    'Infrastructure and Development',
    'Government Budget and Administration',
    'Defense and Military',
    'International Relations and Government',
    'Legislation and Policy',
    'Social Services and Public Welfare',
]

# flatten the grid so panels can be addressed by position
axs = axs.flatten()

# one scree plot per topic
for i, topic in enumerate(eigenvalues_set):
    panel = axs[i]
    df = pd.DataFrame({'Component': range(1, len(topic) + 1), 'Eigenvalue': topic})
    sns.lineplot(data=df, x='Component', y='Eigenvalue', marker='o', ax=panel)
    panel.set_title(names[i])
    panel.set_xlabel('Dimensions')
    panel.set_ylabel('Eigenvalue')

# delete the grid panels that received no plot
for spare in axs[len(eigenvalues_set):]:
    fig.delaxes(spare)

plt.tight_layout()

plt.savefig('screeplots_topics.png')

plt.show()

In [None]:
# cumulative percentage of variance of every topic's CA model
# (the eigenvalues_* names are reused here to hold the cumulative values)
eigenvalues_s = model_s.cumulative_percentage_of_variance_
eigenvalues_f = model_f.cumulative_percentage_of_variance_
eigenvalues_c = model_c.cumulative_percentage_of_variance_
eigenvalues_d = model_d.cumulative_percentage_of_variance_
eigenvalues_e = model_e.cumulative_percentage_of_variance_
eigenvalues_i = model_i.cumulative_percentage_of_variance_
eigenvalues_l = model_l.cumulative_percentage_of_variance_

# cumulative-variance collections in plotting order (must line up with `names` below)
eigenvalues_set = [
    eigenvalues_e,
    eigenvalues_i,
    eigenvalues_c,
    eigenvalues_d,
    eigenvalues_f,
    eigenvalues_l,
    eigenvalues_s,
]

# one colour per topic
palette = sns.color_palette('deep', len(eigenvalues_set))

# legend labels, in the same order as eigenvalues_set
names = ['environment', 'infrastructure', 'government', 'defense', 'international', 'legislation', 'social services']

# draw every topic's cumulative variance curve into the same figure
for i, (eigenvalues, name) in enumerate(zip(eigenvalues_set, names)):
    components = range(1, len(eigenvalues) + 1)
    df = pd.DataFrame({'Component': components, 'Eigenvalue': eigenvalues})
    sns.lineplot(data=df, x='Component', y='Eigenvalue', marker='o', color=palette[i], label=name)

# add axis labels
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Variance')

# tick every second component, up to the longest curve
plt.xticks(np.arange(0, max(map(len, eigenvalues_set)) + 1, 2))
plt.legend(loc='upper left')
plt.tight_layout()
plt.grid(True)

plt.savefig('combined_variance_topics.png')

plt.show()

## NOMINATE & CA

### DEFENSE

In [None]:
# read the tab-separated table with NOMINATE values for each member per topic
df_nom = pd.read_csv(
    'df_member_topic_nominate_4.tsv',
    sep='\t',
)

In [None]:
# NOMINATE rows of the defense topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
def_nom = df_nom.loc[df_nom['topic'] == 'defense and military', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_d.row_coordinates(d_matrix)
def_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_def = (
    pd.merge(def_nom, def_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

# make sure Republicans are more likely to have positive value
df_def[0] = df_def[0].mul(-1)

In [None]:
# normalize CA values to the range [-1, 1]
# The first-dimension CA score lives in column 0 of df_def (the column that is
# sign-flipped when df_def is built); the merged frame has no column 1, so
# selecting [[1]] raised a KeyError.
scaler = MinMaxScaler(feature_range=(-1, 1))
df_def['ca_normal'] = scaler.fit_transform(df_def[[0]])

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_def = df_def[~df_def['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
custom_palette = {'Republican Party': 'red', 'Democratic Party': 'blue'}
sns.jointplot(data=df_def, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('Defense')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_def.png')

plt.show()

### ENVIRONMENTAL

In [None]:
# NOMINATE rows of the environment topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
env_nom = df_nom.loc[df_nom['topic'] == 'environmental and natural resources', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_e.row_coordinates(e_matrix)
env_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_env = (
    pd.merge(env_nom, env_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

# make sure Republicans are more likely to have positive value
df_env[0] = df_env[0].mul(-1)

df_env

In [None]:
# normalize CA values to the range [-1, 1]
# The first-dimension CA score lives in column 0 of df_env (the column that is
# sign-flipped when df_env is built); the merged frame has no column 1, so
# selecting [[1]] raised a KeyError.
scaler = MinMaxScaler(feature_range=(-1, 1))
df_env['ca_normal'] = scaler.fit_transform(df_env[[0]])

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_env = df_env[~df_env['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
sns.jointplot(data=df_env, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('Environment')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_env.png')

plt.show()

### GOVERNMENT BUDGET

In [None]:
# NOMINATE rows of the government budget topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
con_nom = df_nom.loc[df_nom['topic'] == 'government budget and administration', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_c.row_coordinates(c_matrix)
con_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_con = (
    pd.merge(con_nom, con_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

df_con

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_con = df_con[~df_con['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# rescale the first-dimension CA scores (column 0) to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
df_con['ca_normal'] = scaler.fit_transform(df_con.loc[:, [0]])

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
sns.jointplot(data=df_con, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('Government Budget')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_con.png')

plt.show()

### INFRASTRUCTURE

In [None]:
# NOMINATE rows of the infrastructure topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
inf_nom = df_nom.loc[df_nom['topic'] == 'infrastructure and development', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_i.row_coordinates(i_matrix)
inf_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_inf = (
    pd.merge(inf_nom, inf_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

# make sure Republicans are more likely to have positive value
df_inf[0] = df_inf[0].mul(-1)

In [None]:
# rescale the first-dimension CA scores (column 0) to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
df_inf['ca_normal'] = scaler.fit_transform(df_inf.loc[:, [0]])

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_inf = df_inf[~df_inf['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
sns.jointplot(data=df_inf, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('Infrastructure')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_inf.png')

plt.show()

### INTERNATIONAL RELATIONS

In [None]:
# NOMINATE rows of the international relations topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
for_nom = df_nom.loc[df_nom['topic'] == 'international relations and government', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_f.row_coordinates(f_matrix)
for_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_for = (
    pd.merge(for_nom, for_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

In [None]:
# rescale the first-dimension CA scores (column 0) to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
df_for['ca_normal'] = scaler.fit_transform(df_for.loc[:, [0]])

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_for = df_for[~df_for['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
sns.jointplot(data=df_for, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('International Relations')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_for.png')

plt.show()

### LEGISLATION

In [None]:
# NOMINATE rows of the legislation topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
leg_nom = df_nom.loc[df_nom['topic'] == 'legislation and policy', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_l.row_coordinates(l_matrix)
leg_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_leg = (
    pd.merge(leg_nom, leg_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

In [None]:
# rescale the first-dimension CA scores (column 0) to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
df_leg['ca_normal'] = scaler.fit_transform(df_leg.loc[:, [0]])

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_leg = df_leg[~df_leg['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
sns.jointplot(data=df_leg, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('Legislation')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_leg.png')

plt.show()

### SOCIAL SERVICES

In [None]:
# NOMINATE rows of the social services topic, restricted to the columns needed
cols = ['party', 'nameparty_id', 'NOM1D']
soc_nom = df_nom.loc[df_nom['topic'] == 'social services and public welfare', cols]

# first-dimension CA coordinates of the members, as a one-column DataFrame (column 0)
row_coordinates = model_s.row_coordinates(s_matrix)
soc_ca = pd.DataFrame(row_coordinates[0])

# attach the CA coordinate to the NOMINATE rows, then de-duplicate,
# sort by member id and drop rows with missing values
df_soc = (
    pd.merge(soc_nom, soc_ca, on=['nameparty_id'], how='left')
    .drop_duplicates()
    .sort_values(by='nameparty_id')
    .dropna()
)

# make sure Republicans are more likely to have positive value
df_soc[0] = df_soc[0].mul(-1)

In [None]:
# rescale the first-dimension CA scores (column 0) to the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
df_soc['ca_normal'] = scaler.fit_transform(df_soc.loc[:, [0]])

In [None]:
# drop members whose party is 'Independent' or 'Independent Democrat'
df_soc = df_soc[~df_soc['party'].isin(['Independent', 'Independent Democrat'])]

In [None]:
# joint scatter/density plot of NOMINATE (x) against normalized CA (y), coloured by party
sns.jointplot(data=df_soc, x="NOM1D", y="ca_normal", hue="party", palette=custom_palette)

# clear the legend title
legend = plt.legend()
legend.set_title(None)

# title and axis labels
plt.title('Social Services')
plt.xlabel('NOMINATE')
plt.ylabel('CA')

# write the figure to disk before showing it
plt.tight_layout()
plt.savefig('scatter_soc.png')

plt.show()

In [None]:
# function for calculating Bimodality Coefficient
def bimodality_coefficient(data):
    """Return the bimodality coefficient (BC) of a one-dimensional sample.

    BC = (g**2 + 1) / (k + 3 * (n - 1)**2 / ((n - 2) * (n - 3)))

    where ``g`` is the sample skewness, ``k`` the sample excess kurtosis and
    ``n`` the number of observations. The finite-sample term in the
    denominator is the one that pairs with the bias-corrected estimators, so
    both moments are computed with ``bias=False`` (scipy's default is the
    biased estimator).

    Parameters
    ----------
    data : sequence of numbers (list, numpy array, pandas Series, ...)

    Returns
    -------
    float
        The bimodality coefficient.

    Raises
    ------
    ValueError
        If ``data`` has fewer than 4 observations; the correction term divides
        by ``(n - 2) * (n - 3)`` and is undefined or meaningless below that.
    """
    n = len(data)
    if n < 4:
        raise ValueError(
            f"bimodality coefficient needs at least 4 observations, got {n}"
        )
    skewness = skew(data, bias=False)
    excess_kurtosis = kurtosis(data, fisher=True, bias=False)  # excess kurtosis (kurtosis - 3)
    # finite-sample term that turns the corrected excess kurtosis back into a kurtosis
    correction = 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
    bc = (skewness ** 2 + 1) / (excess_kurtosis + correction)
    return bc

In [None]:
# bimodality coefficient of the normalized CA values, one line per topic
# (order: defense, environment, government budget, infrastructure,
# international relations, social services, legislation)
for topic_df in (df_def, df_env, df_con, df_inf, df_for, df_soc, df_leg):
    print(bimodality_coefficient(topic_df['ca_normal']))

In [None]:
# bimodality coefficient of the NOMINATE values, one line per topic
# (order: defense, environment, government budget, infrastructure,
# international relations, social services, legislation)
for topic_df in (df_def, df_env, df_con, df_inf, df_for, df_soc, df_leg):
    print(bimodality_coefficient(topic_df['NOM1D']))

In [None]:
# tag every topic frame with a display name used as the x-axis category
for topic_df, topic_label in (
    (df_def, 'Defense'),
    (df_env, 'Environment'),
    (df_con, 'Government Budget'),
    (df_inf, 'Infrastructure'),
    (df_for, 'International Relations'),
    (df_leg, 'Legislation'),
    (df_soc, 'Social Services'),
):
    topic_df['DataFrame'] = topic_label

# split the topics over two figures: four topics in the first, the remaining
# three in the second
combined_df_1 = pd.concat([df_env, df_inf, df_con, df_def])
combined_df_2 = pd.concat([df_for, df_leg, df_soc])

# one violin plot per group of topics, split by party, each saved to its own file
for combined, out_file in (
    (combined_df_1, 'violin_boxplots_1.png'),
    (combined_df_2, 'violin_boxplots_2.png'),
):
    plt.figure(figsize=(10, 6))
    sns.violinplot(data=combined, x='DataFrame', y='ca_normal', hue='party', palette=custom_palette)

    legend = plt.legend()
    legend.set_title(None)
    plt.xlabel('Topics')
    plt.ylabel('CA')

    plt.tight_layout()
    plt.savefig(out_file)

    plt.show()

In [None]:
# unique (congress, legislator) pairs taken from the NOMINATE dataset
cols = ['congress', 'nameparty_id']
df_congress = df_nom[cols].drop_duplicates()

In [None]:
# attach the congress of each legislator to the Government Budget values;
# rows left incomplete by the left join are dropped again
df_congress_con = (
    df_congress
    .merge(df_con, how='left', on='nameparty_id')
    .dropna()
)
df_congress_con

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (Government Budget topic)
agg_values = (
    df_congress_con
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_c = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_c = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_c['diff'] = (pivot_avg_values_c['Republican Party'] - pivot_avg_values_c['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_c['Republican_std'] = pivot_std_values_c['Republican Party']
pivot_avg_values_c['Democratic_std'] = pivot_std_values_c['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_c, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_c['congress'],
        pivot_avg_values_c[mean_col] - pivot_avg_values_c[std_col],
        pivot_avg_values_c[mean_col] + pivot_avg_values_c[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_c, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - Government Budget')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# attach the congress of each legislator to the Legislation NOMINATE and CA
# values; rows left incomplete by the left join are dropped again
df_congress_leg = (
    df_congress
    .merge(df_leg, how='left', on='nameparty_id')
    .dropna()
)
df_congress_leg

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (Legislation topic)
agg_values = (
    df_congress_leg
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_l = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_l = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_l['diff'] = (pivot_avg_values_l['Republican Party'] - pivot_avg_values_l['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_l['Republican_std'] = pivot_std_values_l['Republican Party']
pivot_avg_values_l['Democratic_std'] = pivot_std_values_l['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_l, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_l['congress'],
        pivot_avg_values_l[mean_col] - pivot_avg_values_l[std_col],
        pivot_avg_values_l[mean_col] + pivot_avg_values_l[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_l, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - Legislation')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# attach the congress of each legislator to the Social Services NOMINATE and CA
# values; rows left incomplete by the left join are dropped again
df_congress_soc = (
    df_congress
    .merge(df_soc, how='left', on='nameparty_id')
    .dropna()
)
df_congress_soc

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (Social Services topic)
agg_values = (
    df_congress_soc
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_s = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_s = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_s['diff'] = (pivot_avg_values_s['Republican Party'] - pivot_avg_values_s['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_s['Republican_std'] = pivot_std_values_s['Republican Party']
pivot_avg_values_s['Democratic_std'] = pivot_std_values_s['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_s, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_s['congress'],
        pivot_avg_values_s[mean_col] - pivot_avg_values_s[std_col],
        pivot_avg_values_s[mean_col] + pivot_avg_values_s[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_s, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - Social Services')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# attach the congress of each legislator to the International Relations NOMINATE
# and CA values; rows left incomplete by the left join are dropped again
df_congress_for = (
    df_congress
    .merge(df_for, how='left', on='nameparty_id')
    .dropna()
)
df_congress_for

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (International Relations topic)
agg_values = (
    df_congress_for
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_f = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_f = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_f['diff'] = (pivot_avg_values_f['Republican Party'] - pivot_avg_values_f['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_f['Republican_std'] = pivot_std_values_f['Republican Party']
pivot_avg_values_f['Democratic_std'] = pivot_std_values_f['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_f, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_f['congress'],
        pivot_avg_values_f[mean_col] - pivot_avg_values_f[std_col],
        pivot_avg_values_f[mean_col] + pivot_avg_values_f[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_f, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - International Relations')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# attach the congress of each legislator to the Defense NOMINATE and CA values;
# rows left incomplete by the left join are dropped again
df_congress_def = (
    df_congress
    .merge(df_def, how='left', on='nameparty_id')
    .dropna()
)
df_congress_def

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (Defense topic)
agg_values = (
    df_congress_def
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_d = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_d = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_d['diff'] = (pivot_avg_values_d['Republican Party'] - pivot_avg_values_d['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_d['Republican_std'] = pivot_std_values_d['Republican Party']
pivot_avg_values_d['Democratic_std'] = pivot_std_values_d['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_d, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_d['congress'],
        pivot_avg_values_d[mean_col] - pivot_avg_values_d[std_col],
        pivot_avg_values_d[mean_col] + pivot_avg_values_d[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_d, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - Defense')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# attach the congress of each legislator to the Infrastructure NOMINATE and CA
# values; rows left incomplete by the left join are dropped again
df_congress_inf = (
    df_congress
    .merge(df_inf, how='left', on='nameparty_id')
    .dropna()
)
df_congress_inf

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (Infrastructure topic)
agg_values = (
    df_congress_inf
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_i = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_i = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_i['diff'] = (pivot_avg_values_i['Republican Party'] - pivot_avg_values_i['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_i['Republican_std'] = pivot_std_values_i['Republican Party']
pivot_avg_values_i['Democratic_std'] = pivot_std_values_i['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_i, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_i['congress'],
        pivot_avg_values_i[mean_col] - pivot_avg_values_i[std_col],
        pivot_avg_values_i[mean_col] + pivot_avg_values_i[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_i, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - Infrastructure')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# attach the congress of each legislator to the Environment NOMINATE and CA
# values; rows left incomplete by the left join are dropped again
df_congress_env = (
    df_congress
    .merge(df_env, how='left', on='nameparty_id')
    .dropna()
)
df_congress_env

In [None]:
# mean and standard deviation of the normalized CA value of each party within
# each congress (Environment topic)
agg_values = (
    df_congress_env
    .groupby(['congress', 'party'])['ca_normal']
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'mean_ca', 'std': 'std_ca'})
    .reset_index()
)

# wide tables: one row per congress, one column per party
pivot_avg_values_e = agg_values.pivot(index='congress', columns='party', values='mean_ca').reset_index()
pivot_std_values_e = agg_values.pivot(index='congress', columns='party', values='std_ca').reset_index()

# absolute distance between the two party means
pivot_avg_values_e['diff'] = (pivot_avg_values_e['Republican Party'] - pivot_avg_values_e['Democratic Party']).abs()

# keep the standard deviations next to the means; both tables are pivoted from
# the same agg_values, so their rows line up
pivot_avg_values_e['Republican_std'] = pivot_std_values_e['Republican Party']
pivot_avg_values_e['Democratic_std'] = pivot_std_values_e['Democratic Party']

# line plot of both party means (shaded band = mean +/- one standard deviation)
# together with the distance between them
fig, ax = plt.subplots(figsize=(10, 6))

for mean_col, std_col, marker, color, label in (
    ('Republican Party', 'Republican_std', '^', 'red', 'Republican'),
    ('Democratic Party', 'Democratic_std', 'o', 'blue', 'Democratic'),
):
    sns.lineplot(data=pivot_avg_values_e, x='congress', y=mean_col, marker=marker, color=color, label=label, ax=ax)
    ax.fill_between(
        pivot_avg_values_e['congress'],
        pivot_avg_values_e[mean_col] - pivot_avg_values_e[std_col],
        pivot_avg_values_e[mean_col] + pivot_avg_values_e[std_col],
        color=color, alpha=0.1
    )

sns.lineplot(data=pivot_avg_values_e, x='congress', y='diff', marker='s', color='grey', linestyle='--', label='Difference', ax=ax)

# the values are aggregated per congress, so the title says "per Congress"
ax.set_xlabel('Congress')
ax.set_ylabel('Average CA')
ax.set_title('Average CA per Party per Congress with Difference - Environment')
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# Pearson correlation between the NOMINATE and the normalized CA positions of
# the legislators, computed separately for every topic
x, y = df_soc['NOM1D'], df_soc['ca_normal']
correlation_s, p_value_s = pearsonr(x, y)

x, y = df_for['NOM1D'], df_for['ca_normal']
correlation_f, p_value_f = pearsonr(x, y)

x, y = df_con['NOM1D'], df_con['ca_normal']
correlation_c, p_value_c = pearsonr(x, y)

x, y = df_inf['NOM1D'], df_inf['ca_normal']
correlation_i, p_value_i = pearsonr(x, y)

x, y = df_env['NOM1D'], df_env['ca_normal']
correlation_e, p_value_e = pearsonr(x, y)

x, y = df_def['NOM1D'], df_def['ca_normal']
correlation_d, p_value_d = pearsonr(x, y)

x, y = df_leg['NOM1D'], df_leg['ca_normal']
correlation_l, p_value_l = pearsonr(x, y)

# report coefficient and p-value under one heading per topic
for heading, coefficient, p_value in (
    ("Social", correlation_s, p_value_s),
    ("Foreign", correlation_f, p_value_f),
    ("Congress", correlation_c, p_value_c),
    ("Infrastructure", correlation_i, p_value_i),
    ("Defense", correlation_d, p_value_d),
    ("Environment", correlation_e, p_value_e),
    ("Legislation", correlation_l, p_value_l),
):
    print(f"\n{heading} Correlation")
    print(f"Pearson correlation coefficient: {coefficient}")
    print(f"P-value: {p_value}")

In [None]:
# per-topic pivot tables, in the order in which the topics are plotted below
dfs = [
    pivot_avg_values_e,
    pivot_avg_values_i,
    pivot_avg_values_c,
    pivot_avg_values_d,
    pivot_avg_values_f,
    pivot_avg_values_l,
    pivot_avg_values_s,
]
dfs

In [None]:
# 4 x 2 grid of panels, one panel per topic
fig, axs = plt.subplots(4, 2, figsize=(10, 13))

# panel titles, in the same order as the tables in dfs
names = [
    'Environmental and Natural Resources',
    'Infrastructure and Development',
    'Government Budget and Administration',
    'Defense and Military',
    'International Relations and Government',
    'Legislation and Policy',
    'Social Services and Public Welfare',
]

axs = axs.flatten()

# draw one panel per topic
for ax, topic, name in zip(axs, dfs, names):

    # party means, each with a shaded band of +/- one standard deviation
    for mean_col, std_col, marker, color, label in (
        ('Republican Party', 'Republican_std', '^', 'red', 'R'),
        ('Democratic Party', 'Democratic_std', 'o', 'blue', 'D'),
    ):
        sns.lineplot(data=topic, x='congress', y=mean_col, ax=ax, marker=marker, color=color, label=label)
        ax.fill_between(
            topic['congress'],
            topic[mean_col] - topic[std_col],
            topic[mean_col] + topic[std_col],
            color=color, alpha=0.1
        )

    # absolute difference between the party means
    sns.lineplot(data=topic, x='congress', y='diff', marker='s', ax=ax, color='grey', linestyle='--', label='Diff')

    ax.set_xlabel('Congress')
    ax.set_ylabel('Mean CA')
    ax.set_title(name)
    ax.legend()

# delete the panels of the grid that received no topic
for unused_ax in axs[len(dfs):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

plt.savefig('polarization_ca_topics.png')

plt.show()

In [None]:
# start from the defense table and keep only its 'congress' and 'diff' columns;
# the column axis is relabelled 'topic'
columns_to_drop = ['Democratic Party', 'Republican Party', 'Republican_std', 'Democratic_std']
pivot_avg = (
    pivot_avg_values_d
    .rename_axis(columns='topic')
    .drop(columns=columns_to_drop)
)
pivot_avg

In [None]:
# Collect the party-mean difference of every topic in one table with one row
# per congress and one column per topic.
# The per-topic series are aligned on the congress number, not on row position:
# each pivot table carries its own 0..k-1 index after reset_index(), so a
# column-by-column assignment would attach values to the wrong congress as soon
# as one topic lacks a congress that another one has. Congresses missing for a
# topic show up as NaN.
topic_diffs = {
    'defense': pivot_avg_values_d,
    'environment': pivot_avg_values_e,
    'government': pivot_avg_values_c,
    'infrastructure': pivot_avg_values_i,
    'international': pivot_avg_values_f,
    'legislation': pivot_avg_values_l,
    'social services': pivot_avg_values_s,
}
pivot_avg = (
    pd.concat(
        {topic: table.set_index('congress')['diff'] for topic, table in topic_diffs.items()},
        axis=1,
    )
    .sort_index()
    .rename_axis(index='congress', columns='topic')
    .reset_index()
)
pivot_avg

In [None]:
# put the topic columns into the order used by the figures
new_order = [
    'congress',
    'environment',
    'infrastructure',
    'government',
    'defense',
    'international',
    'legislation',
    'social services',
]
pivot_avg = pivot_avg.loc[:, new_order]

In [None]:
# reshape to long format: one row per (congress, topic) pair
df_melted = pivot_avg.melt(id_vars='congress', var_name='topic', value_name='difference')

# colour palette for the topic lines
palette = sns.color_palette('deep')

# one line per topic showing the party difference across congresses
ax = sns.lineplot(data=df_melted, x='congress', y='difference', hue='topic', marker='o', palette=palette)

legend = ax.legend()
legend.set_title(None)
ax.set_xlabel('Congress')
ax.set_ylabel('Mean CA Difference')
ax.grid(True)
plt.savefig('difference_topics_ca.png')
plt.show()