In [1]:
import math
from itertools import combinations

import numpy as np
import pandas as pd
import sklearn.cluster
import sklearn.decomposition
from bokeh.io import show, output_file, output_notebook, push_notebook
from bokeh.models import GraphRenderer, StaticLayoutProvider, Circle, MultiLine
from bokeh.palettes import Spectral8
from bokeh.plotting import figure
output_notebook()

# Read Data
Data source: Voteview. Field documentation for the CSV files: https://voteview.com/static/docs/csv_docs.html#member

In [10]:
# Voteview bulk-download locations for the member and vote CSV files.
_VOTEVIEW_DATA_ROOT = "https://voteview.com/static/data/out"
url_members = _VOTEVIEW_DATA_ROOT + "/members/Sall_members.csv"
url_votes = _VOTEVIEW_DATA_ROOT + "/votes/Sall_votes.csv"

In [11]:
# Earliest congress kept for the analysis; everything before it is dropped.
MIN_CONGRESS = 95

# Download the member table and keep only Senate rows from MIN_CONGRESS onwards.
# .copy() detaches the filtered frame from the full download so that later
# column assignments do not raise SettingWithCopyWarning.
members = pd.read_csv(url_members)
members = members[
    (members["chamber"] == "Senate") & (members["congress"] >= MIN_CONGRESS)
].copy()

# Same filter for the individual roll-call votes.
votes = pd.read_csv(url_votes)
votes = votes[
    (votes["chamber"] == "Senate") & (votes["congress"] >= MIN_CONGRESS)
].copy()

# Preprocessing

In [12]:
def cast_code(x):
    """Collapse a raw Voteview cast code into a numeric vote position.

    Parameters
    ----------
    x : number
        Raw ``cast_code`` value from the votes file.

    Returns
    -------
    int or float
        1 for codes 1-3 (yea-type), 0 for codes above 3 up to 6 (nay-type),
        and 0.5 for anything else: codes above 6, NaN, and codes below 1.
        Code 0 previously fell into the ``x<=3`` branch and was counted as a
        yea; per the Voteview documentation it is not a vote, so it now maps
        to the neutral 0.5.
    """
    if 1 <= x <= 3:
        return 1
    if 3 < x <= 6:
        return 0
    # NaN fails every comparison above and therefore also lands here.
    return 0.5
def party_code(x):
    """Translate a numeric party code into a readable party label.

    Codes 100, 200 and 328 map to "Democrat", "Republican" and "Independent";
    every other value is returned as "Unknown:" followed by its string form.
    """
    known_parties = (
        (100, "Democrat"),
        (200, "Republican"),
        (328, "Independent"),
    )
    for code, label in known_parties:
        if x==code:
            return label
    return "Unknown:"+str(x)
# Preserve the raw Voteview codes the first time this cell runs and always
# recode from those raw columns. Recoding the already-recoded values (which is
# what happens if the cell is simply re-run on overwritten columns) would
# silently corrupt them, e.g. "Democrat" would become "Unknown:Democrat".
if "cast_code_raw" not in votes.columns:
    votes["cast_code_raw"] = votes["cast_code"]
if "party_code_raw" not in members.columns:
    members["party_code_raw"] = members["party_code"]
votes["cast_code"] = votes["cast_code_raw"].apply(cast_code)
members["party_code"] = members["party_code_raw"].apply(party_code)

In [13]:
# Boolean mask of members whose label is none of the three named parties,
# followed by a display of those rows.
index = ~members["party_code"].isin(["Democrat", "Republican", "Independent"])
members[index]

Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,occupancy,last_means,bioname,...,died,nominate_dim1,nominate_dim2,nominate_log_likelihood,nominate_geo_mean_probability,nominate_number_of_votes,nominate_number_of_errors,conditional,nokken_poole_dim1,nokken_poole_dim2


In [14]:
# Preview the first rows of the filtered votes table after cast_code recoding.
votes.head()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,prob
2705134,95,Senate,1,660,1.0,
2705135,95,Senate,1,1252,1.0,
2705136,95,Senate,1,1366,1.0,
2705137,95,Senate,1,1482,1.0,
2705138,95,Senate,1,1569,1.0,


In [15]:
def senator_df(congress_num):
    """Build a senator-by-roll-call vote matrix for one congress.

    Reads the module-level ``votes`` and ``members`` frames.

    Parameters
    ----------
    congress_num : int
        Congress number to select from ``votes`` and ``members``.

    Returns
    -------
    pandas.DataFrame
        One row per ``icpsr`` id (sorted ascending). Columns are ``icpsr``,
        then one ``roll_<rollnumber>`` column per roll call in order of first
        appearance, then ``party_code`` (left-joined from ``members``).
        Roll calls a senator has no record for are filled with 0.5.

    Raises
    ------
    ValueError
        If ``votes`` has no rows for ``congress_num``.
    """
    member = members[members["congress"] == congress_num]
    vote = votes[votes["congress"] == congress_num]
    if vote.empty:
        raise ValueError("no votes found for congress " + str(congress_num))

    roll = vote["rollnumber"].unique()

    # Pivot once instead of appending one row per senator. Values are aligned
    # by roll number itself, so the result does not depend on row order or on
    # roll numbers being a contiguous 1..n range.
    wide = (
        vote.pivot_table(index="icpsr", columns="rollnumber",
                         values="cast_code", aggfunc="first")
        .reindex(columns=roll)
        .fillna(0.5)  # missing roll calls get the neutral value
    )
    wide.columns = ["roll_" + str(number) for number in roll]

    df = wide.reset_index()
    df["icpsr"] = df["icpsr"].astype(int)
    df = df.merge(member[["icpsr", "party_code"]], how="left", on="icpsr")
    return df

# senator_plot

In [16]:
def senator_plot(congress_num, similarity_threshold=0.6, random_state=0):
    """Cluster one congress's senators by vote and draw their similarity graph.

    The function prints a crosstab of 2-cluster k-means labels against party,
    then shows a Bokeh graph in which each Democrat/Republican senator is a
    node placed at its first two PCA components, and two senators are joined
    by an edge when the share of roll calls on which their recoded votes are
    identical exceeds ``similarity_threshold``.

    Parameters
    ----------
    congress_num : int
        Congress number passed to ``senator_df``.
    similarity_threshold : float, optional
        Minimum (exclusive) fraction of identical roll-call values needed to
        draw an edge. Defaults to 0.6.
    random_state : int or None, optional
        Seed for k-means so the printed cluster labels are reproducible
        between runs. Defaults to 0.

    Returns
    -------
    None
        The plot is displayed with ``show``.
    """
    df = senator_df(congress_num)

    # Cluster on the roll-call columns only (everything between icpsr and party_code).
    kmeans_model = sklearn.cluster.KMeans(
        n_clusters=2, n_init=10, random_state=random_state
    ).fit(df.iloc[:, 1:-1])
    labels = kmeans_model.labels_
    print("clustering result")
    print(pd.crosstab(labels, df["party_code"]))

    # Only the two major parties are drawn.
    df = df[df["party_code"].isin(["Democrat", "Republican"])].reset_index(drop=True)
    vote_matrix = df.iloc[:, 1:-1].to_numpy()
    parties = df["party_code"].tolist()
    N, D = vote_matrix.shape
    node_indices = list(range(N))

    # 2-D layout from PCA of the vote matrix.
    pca_2 = sklearn.decomposition.PCA(2)
    plot_columns = pca_2.fit_transform(vote_matrix)
    x_max, y_max = np.max(plot_columns, axis=0)
    x_min, y_min = np.min(plot_columns, axis=0)
    plot = figure(title="Senator Plot of Congress "+str(congress_num),
                  x_range=(x_min-1, x_max+1), y_range=(y_min-1, y_max+1),
                  tools="", toolbar_location=None)

    party_fill = {"Democrat": Spectral8[0],    # blue
                  "Republican": Spectral8[7]}  # red
    cross_party_color = Spectral8[5]           # yellow for lines between D and R

    # Nodes: one per senator, coloured by party.
    graph = GraphRenderer()
    node_color = [party_fill[party] for party in parties]
    graph.node_renderer.data_source.data = dict(index=node_indices, fill_color=node_color)
    graph.node_renderer.glyph = Circle(fill_color="fill_color")

    # Edges: compare rows by position so node ids and edge endpoints always
    # refer to the same numbering. Two 0.5 entries also count as a match.
    start_index = []
    end_index = []
    edge_color = []
    for i, j in combinations(range(N), 2):
        similar_ratio = np.count_nonzero(vote_matrix[i] == vote_matrix[j]) / D
        if similar_ratio > similarity_threshold:
            start_index.append(i)
            end_index.append(j)
            if parties[i] == parties[j]:
                edge_color.append(party_fill[parties[i]])
            else:
                edge_color.append(cross_party_color)

    graph.edge_renderer.data_source.data = dict(start=start_index, end=end_index,
                                                line_color=edge_color)
    graph.edge_renderer.glyph = MultiLine(line_width=0.5, line_color="line_color")

    # Pin every node to its PCA coordinates.
    graph_layout = dict(zip(node_indices, zip(plot_columns[:, 0], plot_columns[:, 1])))
    graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)
    plot.renderers.append(graph)
    show(plot)

Bokeh

In [17]:
# 100th Congress (1987-1989): print the k-means/party crosstab and show the similarity graph.
senator_plot(100) 

clustering result
party_code  Democrat  Republican
row_0                           
0                 54           9
1                  1          37


E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color [renderer: GlyphRenderer(id='e7338304-8994-473c-b15c-27519a8fc7fc', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_color [renderer: GlyphRenderer(id='feebb319-a5b0-4364-984e-9bfa1d286010', ...)]


In [10]:
# 105th Congress (1997-1999): print the k-means/party crosstab and show the similarity graph.
senator_plot(105) 

clustering result
party_code  Democrat  Republican
row_0                           
0                  0          54
1                 45           1


E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color [renderer: GlyphRenderer(id='2279a7a1-1973-47ab-b378-f3e692f58c0b', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_color [renderer: GlyphRenderer(id='3e40aba6-2cfa-4b6c-a893-761347d2fce8', ...)]


In [11]:
# 110th Congress (2007-2009): print the k-means/party crosstab and show the similarity graph.
senator_plot(110) 

clustering result
party_code  Democrat  Independent  Republican
row_0                                        
0                  0            0          50
1                 50            1           1


E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color [renderer: GlyphRenderer(id='c8950600-d7c4-4aa5-9818-bfd16b142bad', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_color [renderer: GlyphRenderer(id='24bad4ec-3fb4-4c3e-a074-c9b2a695bbd5', ...)]


In [13]:
# 113th Congress (2013-2015): print the k-means/party crosstab and show the similarity graph.
senator_plot(113) 

clustering result
party_code  Democrat  Independent  Republican
row_0                                        
0                 55            2           2
1                  2            0          44


E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color [renderer: GlyphRenderer(id='006bf01b-3bc9-497b-bcdb-d70c59c7dde3', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_color [renderer: GlyphRenderer(id='87e4aab4-e5c9-4543-b990-c1542279ab36', ...)]


In [12]:
# 115th Congress (2017-2019): print the k-means/party crosstab and show the similarity graph.
senator_plot(115) 

clustering result
party_code  Democrat  Independent  Republican
row_0                                        
0                 48            2           1
1                  0            0          52


E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color [renderer: GlyphRenderer(id='1d59b044-afba-4ecf-9dea-7c557c076dec', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: line_color [renderer: GlyphRenderer(id='dd451d3b-6b38-4165-b551-7f19de0abccf', ...)]


From the above plots, we can see that there are fewer and fewer yellow lines over time. A yellow line joins a Democrat and a Republican whose recorded votes match on more than 60% of roll calls, so this suggests that the two parties have become increasingly divergent in recent years.