In [None]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

from pathlib import Path
from ds100_utils import fetch_and_cache
from datetime import datetime
from IPython.display import display

import yaml

# Apply seaborn's default theme, then switch to the 'talk' plotting context.
sns.set()
sns.set_context('talk')
# Summarize long arrays and print floats with 2 decimals, no scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 8)
# Use the fully qualified key: the bare 'precision' pattern is ambiguous on
# newer pandas (it also matches 'styler.format.precision') and raises
# OptionError there.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Sam's special methods to display large DataFrames
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df

    Args:
        df: DataFrame to browse.
        nrows: number of rows shown at once; also the row slider's step.
        ncols: number of columns shown at once; also the column slider's step.

    A slider is only created along an axis that has more entries than fit
    in one window; otherwise that offset is pinned to 0.
    '''
    def peek(row=0, col=0):
        return df.iloc[row:row + nrows, col:col + ncols]

    n_total_rows, n_total_cols = df.shape
    # The largest useful offset is the last valid position (length - 1).
    # Using the length itself lets the slider land on an offset whose window
    # is empty whenever the length is a multiple of the step.
    row_arg = ((0, n_total_rows - 1, nrows)
               if n_total_rows > nrows else fixed(0))
    col_arg = ((0, n_total_cols - 1, ncols)
               if n_total_cols > ncols else fixed(0))

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(n_total_rows, n_total_cols))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporarily overridden row and column display limits.

    The defaults are the values pandas' display.max_rows and
    display.max_columns had when this function was defined.
    '''
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

In [None]:
def plot_vec(x_dir, y_dir):
    '''
    Draws a red arrow anchored at the origin with components
    (x_dir, y_dir) using plt.quiver.
    '''
    arrow_style = dict(scale_units='xy', angles='xy', scale=0.1, color='red')
    plt.quiver(0, 0, x_dir, y_dir, **arrow_style)

## PCA Intuition: Going from 2D to 1D

In [None]:
# Downloads from https://www.gapminder.org/data/
cm_path = 'child_mortality_0_5_year_olds_dying_per_1000_born.csv'
fe_path = 'children_per_woman_total_fertility.csv'

# Keep only the 2017 column of each table, indexed by country.
# Mortality is divided by 10 (the file name says "per 1000 born").
cm = pd.read_csv(cm_path).set_index('country')[['2017']] / 10
fe = pd.read_csv(fe_path).set_index('country')[['2017']]

# Join on country, then drop countries missing either value.
data = pd.merge(cm, fe, left_index=True, right_index=True).dropna()
data.columns = ['mortality', 'fertility']
data.head()

In [None]:
def scatter():
    '''
    Scatter plot of fertility against mortality from the global `data`
    frame, with both axes limited to [0, 14] and ticks every 2 units.
    '''
    # x and y must be passed by keyword: seaborn 0.12+ accepts only `data`
    # positionally, so the positional form raises a TypeError there.
    sns.scatterplot(x='mortality', y='fertility', data=data)
    plt.xlim([0, 14])
    plt.ylim([0, 14])
    plt.xticks(np.arange(0, 14, 2))
    plt.yticks(np.arange(0, 14, 2))


scatter()  # draw the base mortality/fertility scatter plot

In [None]:
# Fit fertility (y) as a linear function of mortality (x).
x, y = data['mortality'], data['fertility']
slope_x, intercept_x = np.polyfit(x, y, 1) # simple linear regression

scatter()
plt.plot(x, slope_x * x + intercept_x)
# Red segments: vertical distance from 20 randomly sampled points to the line.
# Distinct loop names keep the x / y Series above from being overwritten with
# single values once the cell finishes.
for _, row in data.sample(20).iterrows():
    xi, yi = row['mortality'], row['fertility']
    plt.plot([xi, xi], [slope_x * xi + intercept_x, yi], c='red')

In [None]:
# Fit mortality (x) as a linear function of fertility (y); the earlier
# slope_x / intercept_x line is drawn alongside for comparison.
x, y = data['mortality'], data['fertility']
slope_y, intercept_y = np.polyfit(y, x, 1) # simple linear regression

scatter()
plt.plot(x, slope_x * x + intercept_x)
plt.plot(slope_y * y + intercept_y, y)
# Red segments: horizontal distance from 20 randomly sampled points to the
# new line. Distinct loop names keep the x / y Series above from being
# overwritten with single values once the cell finishes.
for _, row in data.sample(20).iterrows():
    xi, yi = row['mortality'], row['fertility']
    plt.plot([xi, slope_y * yi + intercept_y], [yi, yi], c='red')

In [None]:
from sklearn.decomposition import PCA

plt.figure(figsize=(10, 10))

D = data.values
means = np.mean(D, axis=0)
first_pc = PCA(n_components=1).fit(D).components_[0]

# Find the projection of each point onto the
#   first principal component (first_pc).
line = np.outer((D - means) @ first_pc, first_pc) + means

x, y = data['mortality'], data['fertility']
scatter()
plt.plot(x, slope_x * x + intercept_x) # regression line for x
plt.plot(slope_y * y + intercept_y, y) # regression line for y
plt.plot(line[:, 0], line[:, 1])       # principal component line

# reset_index() yields a 0..n-1 index, so `i` is the row position in `line`.
# Distinct loop names keep the x / y Series above from being overwritten with
# single values once the cell finishes.
for i, row in data.reset_index().sample(20).iterrows():
    xi, yi = row['mortality'], row['fertility']
    xp, yp = line[i, :]
    plt.plot([xi, xp], [yi, yp], c='red')

## PCA on Legislator Votes

In [None]:
# From Lecture 4
base_url = 'https://github.com/unitedstates/congress-legislators/raw/master/'
legislators_path = 'legislators-current.yaml'
f = fetch_and_cache(base_url + legislators_path, legislators_path)
# Open inside a context manager so the file handle is closed after parsing.
# The encoding is pinned so decoding does not depend on the platform default
# (assumes the downloaded YAML is UTF-8 -- TODO confirm).
with open(f, encoding='utf-8') as legislators_file:
    legislators_data = yaml.safe_load(legislators_file)

def to_date(s):
    '''Parses a 'YYYY-MM-DD' string into a datetime object.'''
    date_format = '%Y-%m-%d'
    return datetime.strptime(s, date_format)

def _legislator_row(leg):
    '''
    Flattens one legislator record into a row for the legs table.
    State, chamber and party are read from the last entry of leg['terms'].
    '''
    return [leg['id']['bioguide'],
            leg['name']['first'],
            leg['name']['last'],
            leg['bio']['gender'],
            leg['terms'][-1]['state'],
            leg['terms'][-1]['type'],
            leg['terms'][-1]['party'],
            to_date(leg['bio']['birthday'])]

legs = pd.DataFrame(
    data=[_legislator_row(leg) for leg in legislators_data],
    columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'])

legs.head(3)

In [None]:
# February 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
# See the lec09 house votes notebook for details.
# Columns read elsewhere in this notebook: 'member', 'roll call', 'vote'.
votes = pd.read_csv('votes.csv')
votes.head()

In [None]:
# Attach legislator details to each vote row and peek at 5 random rows.
votes_with_legs = pd.merge(votes, legs, left_on='member', right_on='leg_id')
votes_with_legs.sample(5)

In [None]:
# Keep only the 'Yes' rows, then build a member x roll-call table whose cells
# count the 'Yes' rows for that pair (0 where there are none).
is_yes = votes['vote'] == 'Yes'
yes_no = votes[is_yes]
vote_pivot = pd.pivot_table(yes_no,
                            index='member',
                            columns='roll call',
                            values='vote',
                            aggfunc=len,
                            fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()

## Select Attributes

In [None]:
# Population variance (ddof=0, numpy's default) of each roll-call column,
# sorted ascending.
vote_pivot.var(axis=0, ddof=0).sort_values()

In [None]:
# Plot column 69 against column 80 of vote_pivot. x and y are passed by
# keyword because seaborn 0.12+ accepts only `data` positionally.
sns.scatterplot(x=69, y=80, data=vote_pivot);

In [None]:
# Add uniform noise in [0, 0.1) to every cell so overlapping points separate.
jittered = vote_pivot + np.random.random(vote_pivot.shape) * 0.1
# x and y are passed by keyword because seaborn 0.12+ accepts only `data`
# positionally.
sns.scatterplot(x=69, y=80, data=jittered);

## PCA

In [None]:
# Pair each member with its first two component scores and attach
# legislator details.
# NOTE(review): `t` is not assigned by any cell above this one; this
# presumably expects one row of `t` per row of vote_pivot -- confirm where
# it is computed.
vote2d = pd.DataFrame({
    'member': vote_pivot.index,
    'pc1': t[:, 0],
    'pc2': t[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')

...

In [None]:
# pc1 against pc2 for every row of vote2d, colored by party.
party_order = ['Democrat', 'Republican', 'Independent']
plt.figure(figsize=(10, 8))
sns.scatterplot(data=vote2d, x='pc1', y='pc2',
                hue='party', hue_order=party_order);

What's going on here?

In [None]:
# Identifier of the member to inspect -- presumably a bioguide ID matching
# legs['leg_id']; confirm.
special_id = 'P000197'
...

In [None]:
# Vote-type counts for the members whose pc2 score is below -0.1.
low_pc2_members = vote2d.loc[vote2d['pc2'] < -0.1, 'member']
df = votes[votes['member'].isin(low_pc2_members)]
vote_counts = df.groupby(['member', 'vote']).size()
display_df(vote_counts, 17)

In [None]:
# First two columns of t on the axes, third column as the hue.
first_col, second_col, third_col = t[:, 0], t[:, 1], t[:, 2]
plt.figure(figsize=(12, 8))
sns.scatterplot(x=first_col, y=second_col, hue=third_col);

In [None]:
# Keep the members that have more than 12 rows in yes_no.
yes_no_count = yes_no.groupby('member').size()
is_regular = yes_no_count > 12
regulars = vote_pivot[is_regular]

In [None]:
# Party breakdown of the regular members.
regulars_with_legs = pd.merge(regulars, legs, left_index=True, right_on='leg_id')
regulars_with_legs['party'].value_counts()

### PCA on the Regular Members

In [None]:
# Rebuild vote2d for the regular members only and attach legislator details.
# NOTE(review): `t` is not assigned in this section before this cell; this
# presumably expects one row of `t` per row of regulars -- confirm where it
# is computed.
vote2d = pd.DataFrame({
    'member': regulars.index,
    'pc1': t[:, 0],
    'pc2': t[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')
vote2d

In [None]:
# pc1 against pc2 for the regular members, colored by party.
plt.figure(figsize=(8, 5))
# x and y are passed by keyword because seaborn 0.12+ accepts only `data`
# positionally.
sns.scatterplot(x='pc1', y='pc2', hue='party',
                hue_order=['Democrat', 'Republican'], data=vote2d);

In [None]:
# pc1 against pc2 for the regular members, colored by party.
plt.figure(figsize=(8, 5))
# x and y are passed by keyword because seaborn 0.12+ accepts only `data`
# positionally.
sns.scatterplot(x='pc1', y='pc2', hue='party',
                hue_order=['Democrat', 'Republican'], data=vote2d);

Barbara Lee is the House Member for Berkeley's district. Any guesses as to her party affiliation?

In [None]:
plt.figure(figsize=(8, 5))
# x and y are passed by keyword because seaborn 0.12+ accepts only `data`
# positionally.
sns.scatterplot(x='pc1', y='pc2', hue='party',
                hue_order=['Democrat', 'Republican'], data=vote2d)
# NOTE(review): `b_lee` is not assigned in any cell above; presumably it is
# the vote2d row label for Barbara Lee -- confirm where it is defined.
plt.annotate('Barbara Lee',
             (vote2d.loc[b_lee, 'pc1'], vote2d.loc[b_lee, 'pc2']),
             xytext=(-0.075, 0.05),
             arrowprops=dict(facecolor='black'))

The first principal component seems to capture party affiliation. What about the second?

## Variance

So PCA with 2 dimensions captures about 5 times as much variance as just picking two columns.

## Interpretation

In [None]:
plt.figure(figsize=(8, 5))

num_votes = vt.shape[1]
# Use a dedicated name for the roll-call labels: rebinding `votes` here would
# replace the votes DataFrame that other cells read from.
roll_calls = regulars.columns

def plot_pc(k):
    '''
    Bar chart of row k of vt, with one bar per label in regulars.columns.

    NOTE(review): assumes vt has one column per roll call in regulars --
    confirm against the cell that computes vt.
    '''
    plt.bar(roll_calls, vt[k, :], alpha=0.7)
    plt.xlim((min(roll_calls)-1, max(roll_calls)+1))
    plt.xticks(roll_calls, rotation=90);

plot_pc(0)
plot_pc(1)

## Analyzing Votes

In [None]:
# Transpose the pivot table, center each column, scale by 1/sqrt(row count),
# then take the reduced SVD.
D = vote_pivot.values.T
m = len(D)
column_means = D.mean(axis=0)
X = (D - column_means) / np.sqrt(m)
u, s, vt = np.linalg.svd(X, full_matrices=False)

In [None]:
# Squared singular values against their position in s, as a line with markers.
component_idx = np.arange(len(s))
squared_sv = s**2
plt.figure(figsize=(8, 5))
plt.plot(component_idx, squared_sv);
plt.scatter(component_idx, squared_sv);
plt.xticks(component_idx);

In [None]:
# Scale each column of u by the matching entry of s (assuming u and s are the
# np.linalg.svd outputs), then plot the first two columns against each other.
t = u * s
sns.scatterplot(x=t[:, 0], y=t[:, 1]);