# Data preparation script for Avoin_data_eduskuntavaalit_2019_valintatiedot

This script prepares the YLE 2019 election compass data for clustercards generation.

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from datetime import datetime
from collections import defaultdict
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from yamlconfig import read_config

## Read and define variables

In [None]:
# Load the configuration and select the section used by this notebook.
config = read_config()
c = config['vaalit19']
# Show the active settings: a heading line followed by the section itself.
print('Settings:', c, sep='\n')

In [None]:
# File locations and naming settings read from the 'vaalit19' config section.
DATADIR, DATAFILE, METAFILE = c['datadir'], c['datafile'], c['metafile']
OUTPREFIX, FILTER_REGION = c['outprefix'], c['filter_region']

# Options passed to pd.read_csv when the data file is loaded.
INDEX_COL, CSV_SEP = None, ','
# A variable is dropped when its NaN count exceeds NAN_LIMIT * number of rows.
NAN_LIMIT = 0.1

In [None]:
# Full paths of the data file and the variable-metadata file.
datafilename = f"{DATADIR}/{DATAFILE}"
metafilename = f"{DATADIR}/{METAFILE}"

# Stop early if the data file is not where the configuration says it is.
assert os.path.isfile(datafilename), "File missing"
# NOTE(review): the matching check for the metadata file is disabled, although
# metafilename is read with pd.read_excel further down -- confirm whether it
# should be re-enabled.
#assert os.path.isfile(metafilename), "File missing"

## Read data

### Metadata for variables

In [None]:
# Variable metadata: the first sheet column becomes the row index.
df_vars = pd.read_excel(metafilename, index_col=0)

# The 'Taustamuuttuja' row flags background variables with a positive value;
# collect the labels of the flagged variables.
is_background = df_vars.loc['Taustamuuttuja'] > 0
background = is_background[is_background].index.values

In [None]:
# Number of variables described in the metadata.
df_vars.shape[1]

### Actual data variables

In [None]:
# Load the data file using the separator and index settings defined above.
df = pd.read_csv(
    datafilename,
    index_col=INDEX_COL,
    sep=CSV_SEP,
)
df

## Process data

### Drop variables

In [None]:
# Remove three position-based ranges of columns. Each slice is taken from the
# frame as it stands after the previous drop, so the positions are relative
# to the already reduced column list.
df2 = df.drop(columns=df.columns[32:61])
df2 = df2.drop(columns=df2.columns[33:36])
df2 = df2.drop(columns=df2.columns[41:180])

In [None]:
# Print the full per-column summary of df2 after the column drops.
df2.info(verbose=True)

### Convert to floats

In [None]:
# Strip the literal characters "v", "+", "ä" and "p" from the age answers.
# Every token is removed as plain text, one after the other.
age_col = 'Kuinka vanha olet?'
cleaned_age = df2[age_col]
for token in ("v", "+", "ä", "p"):
    cleaned_age = cleaned_age.str.replace(token, "", regex=False)
df2[age_col] = cleaned_age

In [None]:
# Turn the placeholder strings '-', 'NaN', '  ' and '---' into real NaN values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there. The placeholders never map onto each other, so one
# call is equivalent to the four sequential replacements.
df2 = df2.replace(['-', 'NaN', '  ', '---'], np.nan)

In [None]:
# Print the position and name of every column, and cast the columns at
# positions 3..32 to float.
# The loop variable is `col` rather than `c` so that it does not overwrite the
# configuration section stored in `c`; re-running an earlier cell that reads
# c['...'] would otherwise fail once this loop has run.
for i, col in enumerate(df2.columns):
    print(i, col)
    if 2 < i < 33:
        df2[col] = df2[col].astype(float)

### NaNs

#### Drop candidates

In [None]:
# Keep only the rows that have at least 36 non-NaN values.
df2 = df2.dropna(axis='index', thresh=36)
# Remaining NaN count per row, smallest first.
df2.isna().sum(axis='columns').sort_values()

#### Drop variables

In [None]:
# Largest NaN count a variable may have before it is removed.
nan_threshold = NAN_LIMIT * len(df2)

# NaN count per variable, and the subset that exceeds the threshold.
ser_nans = df2.isna().sum()
ser_too_many_nans = ser_nans.loc[ser_nans > nan_threshold]
too_many_nans = [*ser_too_many_nans.index.values]

print('Removing variables with more than {} NaNs:'.format(nan_threshold))
print(ser_too_many_nans, end='\n\n')

# Remove the offending variables and print a short summary of what is left.
df2 = df2.drop(columns=ser_too_many_nans.index)
df2.info(verbose=False)

In [None]:
# Plot the NaN count of every remaining variable, largest first.
nan_counts = df2.isna().sum()
nan_counts.sort_values(ascending=False).plot()

In [None]:
# Fill the remaining NaN values.
#
# Changes compared with the previous version of this cell:
# * The first row of DataFrame.mode() is used. The earlier
#   transpose().squeeze() only produced a Series when every column had a
#   single mode; any column with tied values broke the per-column lookup.
# * mean() is restricted to numeric columns; on pandas >= 2.0 a mixed-dtype
#   frame raises TypeError without numeric_only=True.
# * Results of fillna() are assigned back to the column instead of calling
#   df2[col].fillna(..., inplace=True), which operates on a temporary object
#   and leaves df2 unchanged when Copy-on-Write is active.
# * The loop variable is `col`, so the configuration section `c` is not
#   overwritten.
ser_mode = df2.mode().iloc[0]
ser_mean = df2.mean(numeric_only=True)

# These two columns get their most frequent value.
for col in ['vaalipiiri', 'puolue']:
    df2[col] = df2[col].fillna(ser_mode[col])

# A missing gender answer is filled with the 'En halua kertoa' option.
df2['Sukupuolesi'] = df2['Sukupuolesi'].fillna('En halua kertoa')

# Every numeric column gets its own mean.
for col in ser_mean.index:
    df2[col] = df2[col].fillna(ser_mean[col])

In [None]:
# NaN count per variable after filling, smallest first.
df2.isna().sum(axis='index').sort_values()

In [None]:
# Distinct values present in the 'Sukupuolesi' column.
df2['Sukupuolesi'].unique()

### Party variables

In [None]:
# Add one 0/1 indicator column per party. Columns are appended in order of
# first appearance in 'puolue'; the positional column renaming that follows
# relies on this order.
party_labels = df2['puolue'].unique()
for party in party_labels:
    is_member = df2['puolue'].eq(party)
    df2[party] = is_member.astype(int)

#### Replace column names

In [None]:
# The metadata must describe exactly as many variables as the data holds,
# because the names are assigned by position.
assert df2.shape[1] == df_vars.shape[1]
df2.columns = df_vars.columns
df2

### Background variables

In [None]:
# Move the background variables into their own frame and remove them from
# the actual data.
df_bg = df2[background]
df2 = df2.drop(columns=background)
print(len(background), 'background variables removed:', background)

# Report the resulting sizes of both frames.
data_rows, data_cols = df2.shape
print('Actual data has', data_rows, 'rows,', data_cols, 'columns')
bg_rows, bg_cols = df_bg.shape
print('Background data has', bg_rows, 'rows,', bg_cols, 'columns')

## Save results

In [None]:
# Write the actual data to a CSV file named with today's date, without the
# row index.
now = datetime.now()
todaystr = f"{now:%Y-%m-%d}"
outfile = f"{DATADIR}/{OUTPREFIX}-data-{todaystr}.csv"
print(outfile)
df2.to_csv(outfile, index=False)

In [None]:
# Write the background variables to a CSV file named with today's date,
# without the row index.
now = datetime.now()
todaystr = f"{now:%Y-%m-%d}"
outfile = f"{DATADIR}/{OUTPREFIX}-bg-{todaystr}.csv"
print(outfile)
df_bg.to_csv(outfile, index=False)

In [None]:
# Display the background-variable frame that was just written to disk.
df_bg