### Setup

In [None]:
import pandas as pd
import ast
import os
from dotenv import load_dotenv
from sqlalchemy import (create_engine, Table, Column, Integer, String, Float, VARCHAR, MetaData)

In [None]:
# Load the crawler's local and foreign territory code tables and stack them
# into a single frame; the 'name' column is exposed as 'freguesia'.
local = pd.read_csv('datasets/crawler/local_codes.csv')
foreign = pd.read_csv('datasets/crawler/foreign_codes.csv')
codes = (
    pd.concat([local, foreign])
    .rename(columns={'name': 'freguesia'})
    .reset_index(drop=True)
)

# Database credentials come from the environment (optionally via a .env file).
# NOTE(review): 'USER' is usually already set by the shell to the OS login
# name, and load_dotenv() does not override existing variables by default, so
# a USER entry in .env may be ignored -- confirm this is the intended account.
load_dotenv()
user = os.getenv('USER')
password = os.getenv('PW')
host = os.getenv('SQL')
db = os.getenv('DB')

# Fail early with a readable message instead of a TypeError from `str + None`
# when a variable is not defined.
_missing_env = [
    name
    for name, value in (('USER', user), ('PW', password), ('SQL', host), ('DB', db))
    if value is None
]
if _missing_env:
    raise RuntimeError(
        'Missing environment variable(s): ' + ', '.join(_missing_env)
        + '. Define them in the environment or in a .env file.'
    )

# NOTE(review): the credentials are inserted into the URL verbatim; a password
# containing characters such as '@', ':' or '/' must be URL-escaped first.
# echo=True makes the engine log every SQL statement it emits.
eng = (
    create_engine(
        'mysql+pymysql://' + user + ':' + password + '@' + host + '/' + db,
        pool_recycle=3600, echo=True)
)
meta = MetaData()

### Methods

In [None]:
# Returns results table df
def getResultsTable(file):
    """Return the per-party results table stored in a crawler file.

    Args:
        file: DataFrame whose row ``'resultsParty'`` / column
            ``'currentResults'`` holds the string form of a list of
            per-party dicts.

    Returns:
        DataFrame indexed by ``'party'`` (the former ``'acronym'`` field)
        with the presentation-only fields removed.
    """
    unused_fields = [
        'absoluteMajority',
        'constituenctyCounter',
        'imageKey',
        'mandates',
        'presidents',
    ]

    # The cell contains a Python literal, so parse it without evaluating code.
    party_rows = ast.literal_eval(file.loc['resultsParty', 'currentResults'])

    table = pd.DataFrame(party_rows)
    table = table.drop(columns=unused_fields)
    table = table.rename(columns={'acronym': 'party'})
    return table.set_index('party')

# Returns results, metadata and location dfs for one territory
def getResults(file, row):
    """Build the votes, metadata and location rows for one territory.

    Args:
        file: crawler DataFrame with a ``'currentResults'`` column, indexed by
            field name (it must contain a ``'resultsParty'`` row).
        row: mapping with the keys ``'parish'``, ``'county'``, ``'district'``
            and ``'territoryKey'``.

    Returns:
        Tuple ``(r, md, loc)`` of DataFrames, each indexed by
        ``'territoryKey'``: the party table transposed so parties are columns,
        the remaining ``currentResults`` fields, and the location names.
    """
    territory = row['territoryKey']

    # Metadata: every 'currentResults' field except the ones listed below,
    # laid out as a single row.
    md = pd.Series(file['currentResults'])
    md.index.name = None
    dropped_fields = [
        'availableMandates', 'compensation', 'displayMessage', 'hasNoVoting', 'resultsParty',
        'tie', 'tieMessage', 'totalBoycotts', 'totalForeignBoycotts', 'totalLocalBoycotts',
        'totalMandates', 'totalParishesApproved', 'blankVotesPercentage', 'nullVotesPercentage',
        'percentageVoters', 'numberParishes', 'totalVoters',
    ]
    md = md.drop(dropped_fields).to_frame().T
    md = md.assign(territoryKey=territory).set_index('territoryKey')

    # Votes: party table without the percentage columns, parties as columns.
    r = (
        getResultsTable(file)
        .drop(columns=['percentage', 'validVotesPercentage'])
        .T
        .assign(territoryKey=territory)
        .set_index('territoryKey')
    )
    r.columns.name = None

    # Location: the administrative names taken from the codes row.
    loc_fields = ['parish', 'county', 'district', 'territoryKey']
    loc = pd.DataFrame([row[field] for field in loc_fields], index=loc_fields).T
    loc = loc.set_index('territoryKey')

    return r, md, loc

# Creates CSV files
def create_csv():
    """Aggregate every crawler file into the two result CSVs.

    For each row of the module-level ``codes`` frame, the matching crawler
    file ``datasets/crawler/<Region>/<district>/<county>/<parish>.csv`` is
    read and converted with ``getResults``. Votes and metadata are joined
    column-wise and written to ``datasets/results/results.csv``; location
    names go to ``datasets/results/location.csv``.
    """
    results_parts, metadata_parts, location_parts = [], [], []

    for _, row in codes.iterrows():
        # '<REGION>-<code>' -> 'Region', the first folder below the crawler root.
        region = row['territoryKey'].split('-')[0].capitalize()
        csv_path = (
            'datasets/crawler/' + region + '/' + row['district'] + '/'
            + row['county'] + '/' + row['parish'] + '.csv'
        )
        file = pd.read_csv(csv_path).set_index('index')

        r, md, loc = getResults(file, row)
        results_parts.append(r)
        metadata_parts.append(md)
        location_parts.append(loc)

    def _stack(parts):
        # Concatenate once instead of growing a frame inside the loop;
        # pd.concat rejects an empty list, so fall back to an empty frame.
        return pd.concat(parts) if parts else pd.DataFrame()

    results = pd.concat([_stack(results_parts), _stack(metadata_parts)], axis=1)
    location = _stack(location_parts)

    mkdir()

    results.to_csv('datasets/results/results.csv')
    location.to_csv('datasets/results/location.csv')

# Creates directories
def mkdir():
    """Create ``datasets/results/`` under the current working directory.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of testing the path first.
    os.makedirs(os.path.join(os.getcwd(), 'datasets', 'results'), exist_ok=True)

# Returns results and location dfs
def load_files():
    """Read the generated CSVs back into DataFrames.

    Returns:
        Tuple ``(results, location)`` loaded from
        ``datasets/results/results.csv`` and ``datasets/results/location.csv``,
        each indexed by ``'territoryKey'``.
    """
    def _read_indexed(csv_path):
        # Load one CSV and promote its 'territoryKey' column to the index.
        return pd.read_csv(csv_path).set_index('territoryKey')

    results = _read_indexed('datasets/results/results.csv')
    location = _read_indexed('datasets/results/location.csv')
    return results, location

# Saves to database
def save_db(eng, results, location):
    """Write both frames to the database, replacing existing tables.

    Args:
        eng: connection/engine passed to ``DataFrame.to_sql`` as ``con``.
        results: frame written to the ``results`` table.
        location: frame written to the ``location`` table; its string lengths
            (via ``get_strmax``) size the text columns of both tables.
    """
    strmax = get_strmax(location)

    # Columns of the results table that are stored as SQL integers.
    integer_columns = (
        'PPD/PSD.CDS-PP.PPM', 'PS', 'CH', 'B.E.', 'IL', 'ADN', 'L', 'PAN',
        'PCP-PEV', 'VP', 'E', 'JPP', 'R.I.R.', 'ND', 'PCTP/MRPP', 'MPT.A',
        'NC', 'PTP', 'PPD/PSD.CDS-PP', 'PPM',
        'blankVotes', 'nullVotes', 'numberVoters', 'subscribedVoters',
    )
    results_types = {'territoryKey': VARCHAR(strmax['territoryKey'])}
    results_types.update({column: Integer for column in integer_columns})
    results.to_sql('results', con=eng, if_exists='replace', dtype=results_types)

    # Text columns are sized to the longest value found in `location`.
    location_types = {'territoryKey': VARCHAR(strmax['territoryKey'])}
    for column in ('parish', 'county', 'district'):
        location_types[column] = String(strmax[column])
    location.to_sql('location', con=eng, if_exists='replace', dtype=location_types)

# Loads from database
def load_db(eng):
    """Read the ``results`` and ``location`` tables from the database.

    Args:
        eng: connection/engine handed to ``pandas.read_sql``.

    Returns:
        Tuple ``(results, location)``, each indexed by ``'territoryKey'``.
    """
    queries = ('SELECT * FROM results', 'SELECT * FROM location')
    results, location = (
        pd.read_sql(query, eng).set_index('territoryKey') for query in queries
    )
    return results, location

# Returns max length of each column
def get_strmax(location):
    """Return the longest string length of the index and of each text column.

    Args:
        location: DataFrame with a string index; only its string-typed
            (object or pandas string dtype) columns are measured.

    Returns:
        Integer ``pd.Series`` keyed by ``'territoryKey'`` (for the index) and
        by each measured column name. Missing values are ignored, and a
        column with no measurable value yields 0.
    """
    def _longest(values):
        # .str.len() produces NaN for missing entries, which would turn the
        # maximum into a float (or NaN); always hand back a plain int so the
        # value can be used directly as a SQL column length.
        longest = values.str.len().max()
        return 0 if pd.isna(longest) else int(longest)

    lengths = {'territoryKey': _longest(location.index)}
    for col in location.columns:
        # Covers both the classic object dtype and pandas' string dtypes.
        if pd.api.types.is_string_dtype(location[col].dtype):
            lengths[col] = _longest(location[col])

    return pd.Series(lengths, dtype='int64')

### Create Files and DBs

##### Create Results CSV

In [None]:
# Build datasets/results/results.csv and datasets/results/location.csv
# from the crawler files listed in `codes`.
create_csv()

##### SQL Database

In [None]:
# Read the two CSVs from datasets/results/ and store them as the 'results'
# and 'location' SQL tables (existing tables are replaced).
results, location = load_files()
save_db(eng, results, location)

In [None]:
# Reload both tables from the database, indexed by territoryKey.
results, location = load_db(eng)