### Setup

In [1]:
import ast
import os
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import (create_engine, Table, Column, Integer, String, Float, VARCHAR, MetaData)

In [2]:
# Crawler code tables: the local and foreign lists stacked into one frame,
# with the 'name' column renamed to 'freguesia'.
local = pd.read_csv('datasets/crawler/local_codes.csv')
foreign = pd.read_csv('datasets/crawler/foreign_codes.csv')
codes = (
    pd.concat([local, foreign])
    .rename(columns={'name': 'freguesia'})
    .reset_index(drop=True)
)

# Database settings are read from the environment (optionally populated from a .env file).
# NOTE(review): by default load_dotenv() does not override variables that already exist
# in the environment, and USER is commonly pre-set by the shell — confirm the .env value
# is the one actually picked up here.
load_dotenv()
user = os.getenv('USER')
password = os.getenv('PW')
host = os.getenv('SQL')
db = os.getenv('DB')

# Fail early with a readable message instead of a TypeError from `str + None`.
missing = [
    name
    for name, value in (('USER', user), ('PW', password), ('SQL', host), ('DB', db))
    if value is None
]
if missing:
    raise RuntimeError('Missing environment variables: ' + ', '.join(missing))

# Percent-encode the credentials so characters such as '@', ':' or '/' cannot break
# the connection URL. This assumes the values in the environment are stored raw
# (not already percent-encoded) — TODO confirm.
eng = (
    create_engine(
        'mysql+pymysql://' + quote(user, safe='') + ':' + quote(password, safe='')
        + '@' + host + '/' + db,
        pool_recycle=3600, echo=True)
)
meta = MetaData()

### Methods

In [3]:
# Returns results table df
def getResultsTable(file):
    """Build the per-party results table from one crawled file.

    Parameters:
        file: DataFrame whose cell at row 'resultsParty', column 'currentResults'
            holds Python-literal text; it is parsed with ast.literal_eval and
            handed to the DataFrame constructor.

    Returns:
        DataFrame indexed by 'party' (the renamed 'acronym' column), without the
        columns listed in `unused_columns`.
    """
    unused_columns = [
        'absoluteMajority', 'constituenctyCounter', 'imageKey', 'mandates', 'presidents']

    raw_records = file.loc['resultsParty', 'currentResults']

    party_table = pd.DataFrame(ast.literal_eval(raw_records))
    party_table = party_table.drop(unused_columns, axis=1)
    party_table = party_table.rename(columns={'acronym': 'party'})

    return party_table.set_index('party')

# Returns results, metadata and location dfs for one file
def getResults(file, row):
    """Split one crawled file into results, metadata and location frames.

    Parameters:
        file: DataFrame with a 'currentResults' column, in the shape
            getResultsTable() expects.
        row: mapping-like record providing 'parish', 'county', 'district' and
            'territoryKey'.

    Returns:
        (r, md, loc), each indexed by 'territoryKey':
        r   - transposed results table without the two percentage columns;
        md  - the remaining 'currentResults' entries as a single row;
        loc - parish / county / district of the row.
    """
    dropped_entries = [
        'availableMandates', 'compensation', 'displayMessage', 'hasNoVoting', 'resultsParty',
        'tie', 'tieMessage', 'totalBoycotts', 'totalForeignBoycotts', 'totalLocalBoycotts',
        'totalMandates', 'totalParishesApproved']

    # Clearing the index name keeps it from becoming the columns' name after the transpose.
    summary = pd.Series(file['currentResults'])
    summary.index.name = None
    md = summary.drop(dropped_entries).to_frame().T
    md = md.assign(territoryKey = row['territoryKey'])
    md = md.set_index('territoryKey')

    place_fields = ['parish', 'county', 'district', 'territoryKey']
    loc = pd.DataFrame([row[field] for field in place_fields], index=place_fields).T
    loc = loc.set_index('territoryKey')

    r = getResultsTable(file).drop(['percentage', 'validVotesPercentage'], axis=1).T
    r['territoryKey'] = row['territoryKey']
    r.columns.name = None
    r = r.set_index('territoryKey')

    return r, md, loc

# Creates CSV files
def create_csv():
    """Build the results, metadata and location CSVs from the crawler files.

    For every row of the module-level `codes` frame, reads the matching file under
    'datasets/crawler/<Scope>/<district>/<county>/<parish>.csv' (Scope is the
    capitalised prefix of 'territoryKey' before the first '-'), splits it with
    getResults() and stacks the pieces.

    Writes 'results.csv', 'metadata.csv' and 'location.csv' under 'datasets/results/'.
    Returns None.
    """
    # Collect the per-file frames and concatenate once: calling pd.concat inside
    # the loop re-copies everything accumulated so far on every iteration.
    parts = {'results': [], 'metadata': [], 'location': []}

    for _, row in codes.iterrows():
        scope = row['territoryKey'].split('-')[0].capitalize()
        source = '/'.join(
            ['datasets/crawler', scope, row['district'], row['county'], row['parish']]) + '.csv'
        file = pd.read_csv(source).set_index('index')

        r, md, loc = getResults(file, row)

        parts['results'].append(r)
        parts['metadata'].append(md)
        parts['location'].append(loc)

    mkdir()

    for name, frames in parts.items():
        # pd.concat([]) raises ValueError; with no input rows an empty CSV is still written.
        table = pd.concat(frames) if frames else pd.DataFrame()
        table.to_csv('datasets/results/' + name + '.csv')

# Creates directories
def mkdir():
    """Create 'datasets/results/' under the current working directory if it is absent.

    Does nothing when the path already exists. Returns None.
    """
    target = os.getcwd() + '/datasets/results/'

    if os.path.exists(target):
        return

    os.makedirs(target)

# Returns results, metadata and location dfs
def load_files():
    """Read the three CSVs under 'datasets/results/' back into DataFrames.

    Returns:
        (results, metadata, location), each indexed by its 'territoryKey' column.
    """
    sources = (
        'datasets/results/results.csv',
        'datasets/results/metadata.csv',
        'datasets/results/location.csv',
    )

    results, metadata, location = (
        pd.read_csv(source).set_index('territoryKey') for source in sources
    )

    return results, metadata, location

# Saves to database
def save_db(eng, results, metadata, location):
    """Write the three frames to the 'results', 'metadata' and 'location' tables.

    Parameters:
        eng: SQLAlchemy engine/connection passed to DataFrame.to_sql.
        results, metadata, location: DataFrames keyed by 'territoryKey'
            (as returned by load_files()).

    Existing tables are replaced (if_exists='replace'). String column widths are
    taken from the longest value found in `location`.

    Raises:
        KeyError: if `location` has neither a 'territoryKey' column nor a
            'territoryKey' index level, or lacks 'parish' / 'county' / 'district'.
    """
    # load_files() makes 'territoryKey' the index, while get_strmax() is written to
    # walk columns; expose the key as a column so its length is measured as well
    # (otherwise strmax['territoryKey'] raises KeyError).
    measured = location if 'territoryKey' in location.columns else location.reset_index()
    strmax = get_strmax(measured)

    # Plain ints: a numpy or float length would not render as a valid VARCHAR(n).
    key_len = int(strmax['territoryKey'])

    party_columns = [
        'PPD/PSD.CDS-PP.PPM', 'PS', 'CH', 'B.E.', 'IL', 'ADN', 'L', 'PAN', 'PCP-PEV', 'VP',
        'E', 'JPP', 'R.I.R.', 'ND', 'PCTP/MRPP', 'MPT.A', 'NC', 'PTP', 'PPD/PSD.CDS-PP', 'PPM']

    results_types = {'territoryKey': VARCHAR(key_len)}
    results_types.update({party: Integer for party in party_columns})

    results.to_sql(
        'results',
        con=eng,
        if_exists='replace',
        dtype=results_types
    )
    metadata.to_sql(
        'metadata',
        con=eng,
        if_exists='replace',
        dtype={
            'territoryKey': VARCHAR(key_len),
            'blankVotes': Integer,
            'blankVotesPercentage': Float,
            'nullVotes': Integer,
            'nullVotesPercentage': Float,
            'numberParishes': Integer,
            'numberVoters': Integer,
            'percentageVoters': Float,
            'subscribedVoters': Integer,
            'totalVoters': Integer
        }
    )
    location.to_sql(
        'location',
        con=eng,
        if_exists='replace',
        dtype={
            'territoryKey': VARCHAR(key_len),
            'parish': String(int(strmax['parish'])),
            'county': String(int(strmax['county'])),
            'district': String(int(strmax['district']))
        }
    )

# Loads from database
def load_db(eng):
    """Read the 'results', 'metadata' and 'location' tables through `eng`.

    Returns:
        (results, metadata, location), each indexed by its 'territoryKey' column.
    """
    def fetch(query):
        # One full-table read, keyed by territoryKey.
        return pd.read_sql(query, eng).set_index('territoryKey')

    results = fetch('SELECT * FROM results')
    metadata = fetch('SELECT * FROM metadata')
    location = fetch('SELECT * FROM location')

    return results, metadata, location

# Returns max length of each text column (and named text index level)
def get_strmax(location):
    """Return the longest string length found in each text column of `location`.

    Named index levels are measured too, so a frame indexed by 'territoryKey'
    still reports a length for it. When a column and an index level share a
    name, the column's length is the one reported.

    Parameters:
        location: DataFrame to inspect.

    Returns:
        Series of ints indexed by column / index-level name. Non-text columns and
        columns with no measurable value (all missing) are omitted.
    """
    def is_text(dtype):
        # Accept both classic object columns and pandas' dedicated string dtypes.
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    candidates = [
        (name, location.index.get_level_values(level).to_series())
        for level, name in enumerate(location.index.names)
        if name is not None
    ]
    candidates += [(col, location[col]) for col in location.columns]

    lengths = {}
    for name, values in candidates:
        if not is_text(values.dtype):
            continue
        longest = values.str.len().max()
        # max() is a float (or NaN) when values are missing; keep plain ints only.
        if pd.notna(longest):
            lengths[name] = int(longest)

    return pd.Series(lengths, dtype='int64')

### Create Files and DBs

##### Create Results CSV

In [None]:
# Rebuild the three CSVs under datasets/results/ from the crawler files listed in `codes`.
create_csv()

##### SQL Database

In [None]:
# Load the CSVs from datasets/results/ and write them to the database.
# save_db() uses if_exists='replace', so existing tables are overwritten.
results, metadata, location = load_files()
save_db(eng, results, metadata, location)

### Explore Data

In [5]:
# Read the three tables back from the database, each indexed by territoryKey.
results, metadata, location = load_db(eng)

2024-05-01 13:35:19,163 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-01 13:35:19,163 INFO sqlalchemy.engine.Engine DESCRIBE `legislativas`.`SELECT * FROM results`
2024-05-01 13:35:19,164 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-01 13:35:19,166 INFO sqlalchemy.engine.Engine SELECT * FROM results
2024-05-01 13:35:19,167 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-01 13:35:19,305 INFO sqlalchemy.engine.Engine ROLLBACK
2024-05-01 13:35:19,307 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-01 13:35:19,307 INFO sqlalchemy.engine.Engine DESCRIBE `legislativas`.`SELECT * FROM metadata`
2024-05-01 13:35:19,308 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-01 13:35:19,309 INFO sqlalchemy.engine.Engine SELECT * FROM metadata
2024-05-01 13:35:19,309 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-01 13:35:19,363 INFO sqlalchemy.engine.Engine ROLLBACK
2024-05-01 13:35:19,365 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-01 13:35:19,365 INFO sqlalc

In [6]:
# Per-party results table, one row per territoryKey.
results

Unnamed: 0_level_0,PPD/PSD.CDS-PP.PPM,PS,CH,B.E.,IL,ADN,L,PAN,PCP-PEV,VP,E,JPP,R.I.R.,ND,PCTP/MRPP,MPT.A,NC,PTP,PPD/PSD.CDS-PP,PPM
territoryKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
LOCAL-430101,199.0,158,56,10,9,4,4,3,1,1.0,0,0.0,0,,,,,,,
LOCAL-430102,599.0,624,196,85,47,13,33,36,29,2.0,1,0.0,5,,,,,,,
LOCAL-430103,438.0,447,136,28,46,5,22,21,18,0.0,0,2.0,0,,,,,,,
LOCAL-430104,862.0,577,162,80,70,8,39,22,20,1.0,0,1.0,0,,,,,,,
LOCAL-430105,302.0,153,66,16,18,5,15,12,10,1.0,0,1.0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOREIGN-930299,2306.0,745,330,87,110,60,62,152,81,31.0,4,23.0,21,10.0,,23.0,47.0,,,
FOREIGN-920799,2194.0,1188,1219,137,151,51,113,112,54,17.0,5,9.0,8,23.0,,1.0,17.0,,,
FOREIGN-919999,1084.0,803,646,78,105,49,49,29,20,5.0,2,7.0,3,7.0,,0.0,6.0,,,
FOREIGN-929999,1184.0,453,216,24,88,135,19,66,26,9.0,4,13.0,1,15.0,,11.0,12.0,,,


In [7]:
# Voter, blank-vote and null-vote figures, one row per territoryKey.
metadata

Unnamed: 0_level_0,blankVotes,blankVotesPercentage,nullVotes,nullVotesPercentage,numberParishes,numberVoters,percentageVoters,subscribedVoters,totalVoters
territoryKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
LOCAL-430101,13,2.82,3,0.65,1,461,54.36,848,461
LOCAL-430102,31,1.80,19,1.10,1,1720,46.24,3720,1720
LOCAL-430103,33,2.73,14,1.16,1,1210,52.95,2285,1210
LOCAL-430104,35,1.85,10,0.53,1,1887,56.96,3313,1887
LOCAL-430105,7,1.14,9,1.46,1,615,55.91,1100,615
...,...,...,...,...,...,...,...,...,...
FOREIGN-930299,128,2.08,1937,31.46,1,6157,12.57,48982,6157
FOREIGN-920799,19,0.17,5592,51.26,1,10910,16.72,65242,10910
FOREIGN-919999,28,0.80,562,16.14,1,3483,5.48,63584,3483
FOREIGN-929999,92,2.68,1068,31.08,1,3436,5.47,62856,3436


In [None]:
# Parish, county and district for each territoryKey.
location

### Test

In [None]:
# Rebuild the results row for the first entry of `codes`, step by step.
row = codes.iloc[0]
file = pd.read_csv(
    '/'.join([
        'datasets/crawler',
        row['territoryKey'].split('-')[0].capitalize(),
        row['district'],
        row['county'],
        row['parish'],
    ]) + '.csv'
).set_index('index')

# Same transformation getResults() applies to obtain `r`.
r = getResultsTable(file).drop(['percentage', 'validVotesPercentage'], axis=1).T
r['territoryKey'] = row['territoryKey']
r.columns.name = None
r = r.set_index('territoryKey')

In [None]:
r
## TODO: convert PPD/PSD.CDS-PP + PPM to an aggregate

### Create Views

In [None]:
class Parish:
    """Container for one parish: its territory key, its results and where it sits.

    `county` and `district` are optional; they stay None until a value is given,
    so reading them never fails on a freshly built instance.
    """

    def __init__(self, territoryKey, results, county=None, district=None):
        """Store the key and results; county/district default to None."""
        self.territoryKey = territoryKey
        self.results = results
        self.county = county
        self.district = district

    @property
    def territoryKey(self):
        """Territory key identifying the parish."""
        return self._territoryKey

    @territoryKey.setter
    def territoryKey(self, territoryKey):
        self._territoryKey = territoryKey

    @property
    def results(self):
        """Results object attached to the parish (stored as given)."""
        return self._results

    @results.setter
    def results(self, results):
        self._results = results

    @property
    def county(self):
        """County of the parish, or None when not set."""
        return self._county

    @county.setter
    def county(self, county):
        self._county = county

    @property
    def district(self):
        """District of the parish, or None when not set."""
        return self._district

    @district.setter
    def district(self, district):
        self._district = district

In [None]:
# Party column names, kept separately so they can be told apart from the metadata columns.
parties = list(results.columns)

# Results and metadata side by side, aligned on the shared territoryKey index.
r_full = pd.concat([results, metadata], axis='columns')
r_full

In [None]:
# NOTE(review): self-assignment is a no-op; looks like a placeholder for a pending transformation.
r_full = r_full