In [1]:
import pandas as pd
import numpy as np
from settings import Config
from mysql_db import Database
import pdcast as pdc
import s3_upload_download as s3con
import os

# NOTEBOOK DESCRIPTION: 
1. Transfers tables from MySQL to Python; 
2. Downcasts variable types wherever possible;

The following tables from MySQL are used: 
'att1_type', 'att2_age', 'att3_judet', 'att4_localitate', 'att5_sex', 'att6_category_type',
'att7_make', 'att8_capacity', 'att9_power', 'att10_seats', 'att11_year', 'att12_weight',
'att13_bonus_malus', 'att14_no_rates', 'daune', 'events_no', and 'polite_exp'.

Downcasting is done using the automation tool pdcast (pandas downcasting).

The resulting tables are saved in an S3 bucket.

In [2]:
# initialise the Database object to establish a connection to the MySQL database
db = Database(Config)

# initialise the s3_connector object needed to read/write files into an S3 bucket
s3con = s3con.s3_connector()

# list of all table names to be transferred
table_names = ['att1_type', 'att2_age', 'att3_judet', 'att4_localitate', 'att5_sex', 'att6_category_type',
                'att7_make', 'att8_capacity', 'att9_power', 'att10_seats', 'att11_year', 'att12_weight',
                'att13_bonus_malus', 'att14_no_rates', 'daune', 'events_no', 'polite_exp']



## a. Attributes

In [3]:
df = db.run_view(f'SELECT * FROM {table_names[0]}')
df.head()

Unnamed: 0,idPolita,tip
0,2230446,PF
1,2230447,PF
2,2230448,PF
3,2230449,PF
4,2230450,PF


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27782469 entries, 0 to 27782468
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   idPolita  int64 
 1   tip       object
dtypes: int64(1), object(1)
memory usage: 423.9+ MB


In [5]:
df_downcast = pdc.downcast(df)
df_downcast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27782469 entries, 0 to 27782468
Data columns (total 2 columns):
 #   Column    Dtype   
---  ------    -----   
 0   idPolita  uint32  
 1   tip       category
dtypes: category(1), uint32(1)
memory usage: 132.5 MB


In [6]:
df_downcast.head()

Unnamed: 0,idPolita,tip
0,2230446,PF
1,2230447,PF
2,2230448,PF
3,2230449,PF
4,2230450,PF


In [7]:
df_downcast.to_feather(f'{table_names[0]}.feather')
s3con.write(f'{table_names[0]}.feather')
os.remove(f'{table_names[0]}.feather')

In [None]:
# automate the rest of the attribute tables since they are similar to each other
# NOTE: some of the bigger files had to be broken down into smaller files due to insufficient RAM

for table in range(12, len(table_names)-1):
    df = db.run_view(f'SELECT * FROM {table_names[table]}')
    df = pdc.downcast(df)
    df.to_feather(f'{table_names[table]}.feather')
    del df
    s3con.write(f'{table_names[table]}.feather')
    os.remove(f'{table_names[table]}.feather')

    print('Table done')

## Exposures

In [3]:
# the exposures table is too big and thus needs to be sectioned into smaller parts
section_start = [0, 4999999, 9999998, 14999997, 19999996, 24999995]

for start in range(len(section_start)):
    df = db.run_view(f'SELECT * FROM polite_exp LIMIT {section_start[start]}, 5000000')
    df = pdc.downcast(df)
    df.to_feather(f'polite_exp_{start}.feather')
    del df

    s3con.write(f'polite_exp_{start}.feather')
    os.remove(f'polite_exp_{start}.feather')

    print('Table done')
    

Table done
Table done
Table done
Table done
Table done
Table done


In [4]:
# recombine into 2 parts and save in S3
df1 = s3con.read('polite_exp_0.feather')
df2 = s3con.read('polite_exp_1.feather')
con = pd.concat([df1, df2], axis = 0, ignore_index= True)
del df1, df2

df3 = s3con.read('polite_exp_2.feather')
con = pd.concat([con, df3], axis = 0, ignore_index= True)
del df3

df4 = s3con.read('polite_exp_3.feather')
con = pd.concat([con, df4], axis = 0, ignore_index= True)
del df4

df5 = s3con.read('polite_exp_4.feather')
con = pd.concat([con, df5], axis = 0, ignore_index= True)
del df5

df6 = s3con.read('polite_exp_5.feather')
con = pd.concat([con, df6], axis = 0, ignore_index= True)
del df6

con = pdc.downcast(con)
con.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27598537 entries, 0 to 27598536
Data columns (total 12 columns):
 #   Column     Dtype   
---  ------     -----   
 0   idPolita   uint32  
 1   dataStart  category
 2   dataEnd    category
 3   maturity   uint8   
 4   exp_2015   category
 5   exp_2016   category
 6   exp_2017   category
 7   exp_2018   category
 8   exp_2019   category
 9   exp_2020   category
 10  exp_2021   category
 11  exp_2022   category
dtypes: category(10), uint32(1), uint8(1)
memory usage: 658.3 MB


In [5]:
con.to_feather('polite_exp.feather')
s3con.write('polite_exp.feather')
os.remove('polite_exp.feather')