# Setup

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.options.display.max_colwidth=300
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Colour palettes shared by the plots in this notebook.
colors = ['r', 'b']  # two colours: red, blue
colors_3 = ['r', 'b', '#808080']  # red, blue, grey - passed to the three-group vote charts
colorm = ['C2', 'gold', 'darkorange', 'orchid', 'c', 'C7', 'C6', 'C5', 'C9', 'm', 'g', 'y', 'dimgrey']  # longer palette passed to the multi-category stacked bar charts

In [None]:
# Set the default matplotlib font size to 16 for all figures created afterwards.
plt.rc('font', size=16)

# Reading Data

Cooperative Congressional Election Study (CCES) 2016 <br>
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi%3A10.7910/DVN/GDF6Z0

In [None]:
# Load the survey with categorical variables kept as their integer codes.
# The reader is used as a context manager so the file handle is closed as soon
# as the data are in memory (it was previously left open until garbage collection).
with pd.io.stata.StataReader('CCES16_Common_OUTPUT_Feb2018_VV.dta') as cces:
    df = cces.read(convert_categoricals=False) # categorical variables are coded with integers

In [None]:
# Load the same file again, this time converting labelled variables to pandas
# categoricals. The reader is used as a context manager so the file handle is
# closed once the data are in memory.
# NOTE(review): `cces` is queried for variable/value labels in later cells, after
# the file has been closed - confirm those lookups are served from metadata
# already loaded by read() in the installed pandas version.
with pd.io.stata.StataReader('CCES16_Common_OUTPUT_Feb2018_VV.dta') as cces:
    dfc = cces.read(convert_categoricals=True) # categorical variables are coded with strings

In [None]:
# Dimensions of the integer-coded frame: (rows, columns).
df.shape

In [None]:
# Variable labels as returned by the Stata reader; later cells look them up
# by variable name with var_label.get(<name>).
var_label = cces.variable_labels()
var_label

In [None]:
# some systematic overview: survey variables collected into named lists so that
# a whole group of columns can be selected at once (e.g. df[v_dem]).

# presumably demographic / background variables - verify against the codebook
v_dem = ['birthyr', 'gender', 'educ', 'race', 'immstat', 'CC16_361', 'citylength_1',
         'employ', 'hadjob', 'industryclass', 'ownhome', 'faminc', 'investor', 'edloan',
         'child18', 'child18num', 'marstat',
         'pew_bornagain', 'pew_religimp', 'pew_churatd', 'pew_prayer', 'religpew',
         'milstat_1', 'milstat_2', 'milstat_3', 'milstat_4', 'milstat_5',
         'union', 'unionhh', 'newsint', 'internethome',
         'healthins_1', 'healthins_2', 'healthins_3', 'healthins_4', 'healthins_5', 'healthins_6']

# presumably perception items (CC16_302 is described further down as the
# perception of the national economy) - verify the others against the codebook
v_perception = ['CC16_302', 'CC16_304',
                'CC16_427_a', 'CC16_427_b', 'CC16_427_c', 'CC16_427_d', 'CC16_427_e', 'CC16_427_f']

# presumably items about the past four years - verify against the codebook
v_past_four = ['CC16_303', 'CC16_305_1', 'CC16_305_2', 'CC16_305_3', 'CC16_305_4', 'CC16_305_5',
                  'CC16_305_6', 'CC16_305_7', 'CC16_305_8', 'CC16_305_9', 'CC16_305_10', 'CC16_305_11']

# presumably opinion / policy-preference items - verify against the codebook
v_opinion = ['CC16_307', 'CC16_330a', 'CC16_330b', 'CC16_330d', 'CC16_330e',
             'CC16_331_1', 'CC16_331_2', 'CC16_331_3', 'CC16_331_7', 'CC16_331_9',
             'CC16_332a', 'CC16_332b', 'CC16_332c', 'CC16_332d', 'CC16_332e', 'CC16_332f',
             'CC16_333a', 'CC16_333b', 'CC16_333c', 'CC16_333d',
             'CC16_334a', 'CC16_334b', 'CC16_334c', 'CC16_334d', 'CC16_335',
             'CC16_337_1', 'CC16_337_2', 'CC16_337_3',
             'CC16_351B', 'CC16_351E', 'CC16_351F', 'CC16_351G', 'CC16_351H', 'CC16_351I', 'CC16_351K',
             'CC16_414_1', 'CC16_414_2', 'CC16_414_3', 'CC16_414_4', 'CC16_414_5', 'CC16_414_6', 'CC16_414_7',
             'CC16_415r', 'CC16_416r', 'CC16_422c', 'CC16_422d', 'CC16_422e', 'CC16_422f',
             'CC16_426_1', 'CC16_426_2', 'CC16_426_3', 'CC16_426_4', 'CC16_426_5']

In [None]:
# Look up the label of a single variable by its name.
var_label.get('CC16_302')

In [None]:
# First three rows of the v_dem columns in the frame read with categorical conversion.
dfc[v_dem].head(3)

In [None]:
# The same columns in the integer-coded frame, for comparison.
df[v_dem].head()

# Unweighted Data

## Labeled

dataset with numerical variables (df)

In [None]:
# Label of CC16_410a, the variable the `vote` column is derived from below.
var_label.get('CC16_410a')

In [None]:
# Number of respondents per integer code, ordered by code.
df.CC16_410a.value_counts().sort_index()

In [None]:
# Value labels from the Stata file. Later cells fetch one entry with
# value_labels.get(<label-set name>) and use it as a code -> text mapping.
value_labels = cces.value_labels()
value_labels

In [None]:
# Per-column list that later cells index with df.columns.get_loc(<column>) to
# obtain the key into `value_labels` for that column.
# NOTE(review): `lbllist` is not part of the documented StataReader API - confirm
# the attribute exists under this name in the installed pandas version.
var_values = cces.lbllist
var_values

In [None]:
# Value labels of CC16_410a: column position -> entry of var_values -> entry of value_labels.
value_labels.get(var_values[df.columns.get_loc('CC16_410a')])

In [None]:
# create variable with vote (Trump - Clinton - Other)

# Recode CC16_410a into `vote`: codes 4, 5 and 8 are merged into code 3 ("Other"),
# codes 6, 7, 98 and 99 become missing; codes 1 and 2 are kept as they are.
# The recode is a single non-inplace replace whose result is assigned back:
# a chained `df.vote.replace(..., inplace=True)` operates on a temporary Series
# under pandas copy-on-write and would leave `df` unchanged.
vote_recode = {4: 3, 5: 3, 8: 3, 6: np.nan, 7: np.nan, 98: np.nan, 99: np.nan}
df = df.assign(vote=df.CC16_410a.replace(vote_recode))

In [None]:
# Text labels for the three recoded vote groups.
vote_label = {
    1: 'Trump',
    2: 'Clinton',
    3: 'Other',
}
# Counts per group, shown in the order in which the labels are defined.
labelled_votes = df.vote.map(vote_label)
labelled_votes.value_counts().reindex(vote_label.values())

In [None]:
# Level of Education (educ)

# Fetch the code -> text mapping once, then count respondents per label
# in the order in which the labels are defined.
educ_labels = value_labels.get(var_values[df.columns.get_loc('educ')])
df.educ.map(educ_labels).value_counts().reindex(educ_labels.values())

In [None]:
# Religion (religpew)

# Translate the integer codes to their text labels and count, most frequent first.
religpew_labels = value_labels.get(var_values[df.columns.get_loc('religpew')])
df.religpew.map(religpew_labels).value_counts()

In [None]:
# Perception of National Economy in past year

economy_col = 'CC16_302'
economy_labels = value_labels.get(var_values[df.columns.get_loc(economy_col)])

# Variable label, raw counts per code, then counts per text label in label order.
print(var_label.get(economy_col))
display(df[economy_col].value_counts().sort_index())
df[economy_col].map(economy_labels).value_counts().reindex(economy_labels.values())

---

--> Pick two to four variables that interest you and inspect their labels and distributions. Have a look at the codebook!

---

## Categorical

dataset with categorical variables (dfc)

In [None]:
# Data type of every column in the frame read with categorical conversion.
dfc.dtypes

In [None]:
# Data types restricted to the v_dem columns.
dfc[v_dem].dtypes

In [None]:
# Counts per answer; the values already carry their text labels here.
dfc.CC16_410a.value_counts().sort_index() # mapping is not needed

In [None]:
# The Series footer shows the categorical dtype, including whether it is ordered.
dfc.CC16_410a.head() # note that categories should be unordered for this variable

In [None]:
# create vote variable like above

# Map every answer label onto one of the three vote groups. Labels that are not
# listed (including "I didn't vote in this election" and "I'm not sure") become
# missing, mirroring the recode of the integer-coded frame.
# The column is built with non-inplace operations and assigned back: `inplace=True`
# on the categorical accessor methods is no longer supported by current pandas,
# and chained in-place calls on `dfc.vote` may act on a temporary copy.
vote_groups = {
    'Donald Trump (Republican)': 'Trump',
    'Hillary Clinton (Democrat)': 'Clinton',
    'Gary Johnson (Libertarian)': 'Other',
    'Jill Stein (Green)': 'Other',
    'Evan McMullin (Independent)': 'Other',
    'Other': 'Other',  # assumes the raw variable has its own 'Other' answer - TODO confirm in codebook
}
vote_grouped = dfc.CC16_410a.astype(object).map(vote_groups)

# Unordered categorical with a fixed category order (Trump, Clinton, Other), so
# sorted counts line up with the three-colour palette used in the plots.
dfc = dfc.assign(vote=pd.Categorical(vote_grouped, categories=['Trump', 'Clinton', 'Other'], ordered=False))
dfc.vote.value_counts().sort_index()

In [None]:
# Check the categories of the new vote column and whether they are ordered.
dfc.vote.dtype

In [None]:
# Categories (and ordering flag) of the education variable.
dfc.educ.dtype

In [None]:
display(dfc.religpew.dtype)
# Store religion as an unordered categorical. The converted column is assigned
# back explicitly because `inplace=True` on categorical accessor methods is no
# longer supported by current pandas.
dfc = dfc.assign(religpew=dfc.religpew.cat.as_unordered())

In [None]:
# Categories (and ordering flag) of CC16_302.
dfc.CC16_302.dtype

In [None]:
# Box plot of the education code per vote group, drawn from the integer-coded frame.
sns.catplot(x='vote', y='educ', kind='box', data=df)

In [None]:
# Box plot of birth year per education code, split by vote group (integer-coded frame).
sns.catplot(x='educ', y='birthyr', hue='vote', kind='box', data=df)

In [None]:
# Violin plot of the CC16_302 code per vote group (integer-coded frame).
sns.catplot(x='vote', y='CC16_302', kind='violin', data=df)

---

Look for other categorical plots with seaborn and visualize the variables you chose above.

---

# Weighted Data

## vote

In [None]:
# Unweighted share of each vote group in percent (missing values are excluded by value_counts).
dfc.vote.value_counts(normalize=True).sort_index()*100

In [None]:
# Pie chart of the unweighted vote shares, wedges drawn clockwise from 12 o'clock.
vote_share = dfc.vote.value_counts(normalize=True).sort_index()
fig, ax = plt.subplots(figsize=(7, 7))
patches, texts, autotexts = ax.pie(
    vote_share,
    labels=dfc.vote.cat.categories,
    colors=colors_3,
    autopct='%1.0f%%',
    startangle=90,
    counterclock=False,
)
# Percentage annotations sit inside the coloured wedges, so draw them in white.
for pct_text in autotexts:
    pct_text.set_color('white')

The popular vote in 2016:
- 46.2% for Trump --> large difference from the unweighted frequency above (42%)
- 48.2% for Clinton

In [None]:
# Compare the sum of the weights with the number of rows that have a weight.
print(dfc.commonweight_vv_post.sum()) # commonweight_vv_post: validated, post-election
dfc[dfc.commonweight_vv_post.notna()].shape # sum of weights equals number of observations

In [None]:
# pivot-table, alternative groupby with weight
# The aggregation is named by string: passing the builtin `sum` is deprecated in pandas.
display(pd.pivot_table(dfc, values='commonweight_vv_post', index='vote', aggfunc='sum')) # some error with categories

# Drop the two non-vote answers if they are still among the categories; removing
# a category also sets the affected rows to missing. The result is assigned back
# instead of mutating `dfc.vote` through a chained in-place call.
non_votes = ["I didn't vote in this election", "I'm not sure"]
stale_categories = [label for label in non_votes if label in dfc.vote.cat.categories]
dfc = dfc.assign(vote=dfc.vote.cat.remove_categories(stale_categories))

In [None]:
# Sum of survey weights per vote group ('sum' by name; the builtin callable is deprecated as aggfunc).
pd.pivot_table(dfc, values='commonweight_vv_post', index='vote', aggfunc='sum')

In [None]:
# Weighted vote totals as a one-dimensional Series: plt.pie expects 1-D input,
# and taking the labels from the same index guarantees that every wedge is
# labelled with the group it actually represents.
weighted_votes = pd.pivot_table(dfc, values='commonweight_vv_post', index='vote', aggfunc='sum')['commonweight_vv_post']

plt.figure(figsize = (7,7))
patches, texts, autotexts = plt.pie(weighted_votes,
        colors=colors_3, counterclock=False, startangle=90, autopct='%1.0f%%', labels=weighted_votes.index)
# Percentage annotations sit inside the coloured wedges, so draw them in white.
for autotext in autotexts:
    autotext.set_color('white')

In [None]:
# relative frequencies

# Weighted totals per vote group plus an 'All' margin row, then each row
# expressed as a percentage of that margin.
# ('sum' is given by name; the builtin callable is deprecated as aggfunc.)
totals = pd.pivot_table(dfc, values='commonweight_vv_post', index='vote', aggfunc='sum', margins=True)
totals.div(totals.loc['All'], axis=1) *100

## other variables

In [None]:
# distribution of education levels within each vote group (column percentages)

# Weighted education x vote cross-table with 'All' margins; dividing by the
# 'All' row makes every column sum to 100.
# ('sum' is given by name; the builtin callable is deprecated as aggfunc.)
totals = pd.pivot_table(dfc, values='commonweight_vv_post', index='educ', columns='vote', aggfunc='sum', margins=True)
totals.div(totals.loc['All'], axis=1) *100

In [None]:
# vote shares within each level of education (row percentages):
# every row of the `totals` cross-table is divided by its 'All' margin.

totals.div(totals.All, axis=0) *100

In [None]:
def weighted_frequency(colm, data=None):
    """Return the weighted cross-table of `colm` against `vote`.

    Parameters
    ----------
    colm : str
        Column whose values form the rows of the table.
    data : pandas.DataFrame, optional
        Frame containing `colm`, `vote` and `commonweight_vv_post`.
        Defaults to the notebook-level frame `dfc`.

    Returns
    -------
    pandas.DataFrame
        Sum of `commonweight_vv_post` per (colm, vote) cell, with an 'All'
        margin row and column.
    """
    if data is None:
        data = dfc
    # 'sum' is passed by name: the builtin callable is deprecated as aggfunc.
    return pd.pivot_table(data, values='commonweight_vv_post', index=colm, columns='vote',
                          aggfunc='sum', margins=True)

def weighted_vote(colm, data=None):
    """Return row percentages: the vote shares within each value of `colm`.

    Every row of the weighted cross-table is divided by its 'All' margin, so
    the vote columns of a row add up to 100. See `weighted_frequency` for the
    meaning of `colm` and `data`.
    """
    table = weighted_frequency(colm, data=data)
    return table.div(table.All, axis=0) *100

def weighted_feature(colm, data=None):
    """Return column percentages: the distribution of `colm` within each vote group.

    Every column of the weighted cross-table is divided by its 'All' margin row,
    so the values of `colm` add up to 100 within each vote group. See
    `weighted_frequency` for the meaning of `colm` and `data`.
    """
    table = weighted_frequency(colm, data=data)
    return table.div(table.loc['All'], axis=1) *100

In [None]:
# Weighted vote shares (row percentages) within each religpew value.
weighted_vote('religpew')

In [None]:
# Weighted distribution of religpew within each vote group (column percentages), sorted by the overall share.
weighted_feature('religpew').sort_values('All')

## Visualization

In [None]:
# Stacked horizontal bars of the weighted vote shares within each education level.
# The 'All' margin column is dropped before plotting: it is not a vote group and
# was previously stacked behind the real bars, hidden only by the x-axis limit.
weighted_vote('educ').drop(columns='All').plot(kind='barh', stacked=True, color=colors_3, legend=False, xlim=(0,100), width=0.8, figsize=(8,5))
plt.xlabel('Percentage')
plt.ylabel('')

In [None]:
# Education mix (column percentages) of each vote group and of all respondents:
# drop the 'All' margin row, order the groups, and put one group per bar.
educ_profile = weighted_feature('educ').iloc[:-1][['Other', 'Clinton', 'Trump', 'All']].transpose()

# For horizontal bars the percentages run along the x-axis, so the 0-100 limit
# belongs on xlim; a ylim would rescale the category axis and squash the bars.
ax = educ_profile.plot(kind='barh', stacked=True, color=colorm, rot=360, xlim=(0,100), width=0.8, figsize=(8,5))
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
# Religion mix (column percentages) per vote group: drop the 'All' margin row
# and stack the categories from largest to smallest overall share.
religion_shares = weighted_feature('religpew').iloc[:-1].sort_values('All', ascending=False)
ax = religion_shares.transpose().plot(
    kind='bar', stacked=True, color=colorm, rot=360,
    ylim=(0,100), width=0.8, figsize=(8,5),
)

# Reverse the legend entries so they read top-down in the same order as the stack.
handles, labels2 = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels2[::-1], loc='center left', bbox_to_anchor=(1.0, 0.5))

In [None]:
# Distribution of CC16_302 answers (column percentages) for each vote group and
# for all respondents: drop the 'All' margin row, order the groups, one group per bar.
economy_profile = weighted_feature('CC16_302').iloc[:-1][['Other', 'Clinton', 'Trump', 'All']].transpose()

# For horizontal bars the percentages run along the x-axis, so the 0-100 limit
# belongs on xlim; a ylim would rescale the category axis and squash the bars.
ax = economy_profile.plot(kind='barh', stacked=True, color=colorm, rot=360, xlim=(0,100), width=0.8, figsize=(8,5))
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

---

Calculate the relative frequencies by voting outcome for the variables that interest you and visualize them.

---