In [1]:
import pandas as pd
import numpy as np
from settings import Config
from mysql_db import Database
import pdcast as pdc
import s3_upload_download as s3con
import os
from datetime import datetime

# NOTEBOOK DESCRIPTION: 

In the CLAIMS table (daune), a group of policies were found, which did not appear in any of the other tables. The missing information was later acquired and stored in an S3 bucket., 

Here, the newly acquired data must be:
1. Reviewed and cleaned;
2. Integrated into the existing tables.

In [2]:
# initialise the s3_connector object needed to read/write files into an S3 bucket
s3con = s3con.s3_connector()

In [3]:
# load the files
new_pol = pd.read_csv('left_polite.csv', sep = ';')
new_obj = pd.read_csv('left_obiecte.csv', sep = ';')
new_cli = pd.read_csv('left_clienti.csv', sep = ';')

## CLIENTS

In [13]:
new_cli.head()

Unnamed: 0,id,tp,Tip,Varsta,Judet,Localitate,ID_polita,TipPolita,sex,ID_client
0,,ASIGURAT,PJ,,CLUJ,DEJ,15634668,RCA,,56f98853094ccb2602e9353e4a1c5a5c
1,,ASIGURAT,PJ,,ARGES,CAMPULUNG,21625659,RCA,,c549c436477e4f32a896676179ecfbd7
2,,ASIGURAT,PJ,,CLUJ,CLUJ NAPOCA,19294623,RCA,,fd372796873e1de997eb69d3bad0dc2b
3,,ASIGURAT,PJ,,CLUJ,CLUJ NAPOCA,18622563,RCA,,fd372796873e1de997eb69d3bad0dc2b
4,,ASIGURAT,PJ,,BISTRITA NASAUD,BISTRITA,201700161,RCA,,759341262e3ce21e218caeaf49a1c6e3


In [4]:
new_cli.isnull().sum()

id            6350
tp               0
Tip              0
Varsta        4198
Judet            0
Localitate       0
ID_polita        0
TipPolita        0
sex           4198
ID_client        0
dtype: int64

In [20]:
clean_cli = (
    new_cli
    .filter(items = ['Tip',	'Varsta', 'Judet', 'Localitate', 'ID_polita', 'sex'])
    .rename(columns = {'Tip':'tip', 'Varsta':'varsta', 'Judet':'judet', 'Localitate':'localitate',
                            'ID_polita':'idPolita'}) # to have same col heads as the original tables
    .fillna(value = {'varsta': 321, 'sex' : 'PJ'}) # nans here come from legal persons and must be replaced
    .drop_duplicates(subset = ['idPolita'], keep = 'last') # prefer to keep USER, which tends to be the second in line
    .query("idPolita > 2230446") # policies with smaller IDs are from before 2015
)

clean_cli = pdc.downcast(clean_cli)
clean_cli.head()

Unnamed: 0,tip,varsta,judet,localitate,idPolita,sex
3176,PF,63,TIMIS,GIARMATA,2272279,M
3177,PF,37,BUCURESTI,BUCURESTI SECTORUL 3,2356008,F
3178,PF,37,CONSTANTA,CONSTANTA,2465166,M
3179,PF,61,SIBIU,JINA,2567957,F
3180,PF,55,PRAHOVA,TARGSORU VECHI,2691786,M


In [16]:
print('Complete duplicates', clean_cli.duplicated().sum())
print('Policy ID duplicates', clean_cli.duplicated('idPolita').sum())
print('Nans', clean_cli.duplicated().sum())

Complete duplicates 0
Policy ID duplicates 0
Nans 0


NOTE: The new policies were found to actually already be present in the original CLIENTS table (clienti), and as such there was no need to add them. The new policies were missing only in the OBJECTS and POLICIES tables.

## OBJECTS

In [30]:
new_obj.head()

Unnamed: 0,ID,TIP,TipVehiculAsigurat,Categ,Marca,Model,Capacitate,Putere,Nr_locuri,An_fabr,Masa_max,0
0,32972,RCA,Inmatriculat,Autoturism,DAEWOO,MATIZ,796,38,5,2001,1210,0
1,2272279,RCA,Inmatriculat,Autoturism,FORD,FOCUS,1753,66,5,2000,1755,0
2,2356008,RCA,Inmatriculat,Autoturism,PEUGEOT,207,1360,55,5,2007,1450,0
3,2465166,RCA,Inmatriculat,Autoturism,SEAT,LEON,1598,77,5,2003,1717,0
4,2567957,RCA,Inmatriculat,Autoturism,OPEL,CORSA,973,40,5,2000,1390,0


In [33]:
print('Row count', new_obj.shape[0])
print('Complete duplicates', new_obj.duplicated().sum())
print('Policy ID duplicates', new_obj.duplicated('ID').sum())
print('Nans', new_obj.duplicated().sum())

Row count 3175
Complete duplicates 0
Policy ID duplicates 0
Nans 0


In [34]:
clean_obj = (
    new_obj
    .filter(items = ['ID', 'TipVehiculAsigurat', 'Categ', 'Marca', 'Model',
                    'Capacitate', 'Putere', 'Nr_locuri', 'An_fabr', 'Masa_max'])
    .rename(columns = {'ID':'idPolita', 'TipVehiculAsigurat':'tipVehicul', 'Categ':'categorie', 
                        'Marca':'marca', 'Model':'model', 'Capacitate':'capacitate', 'Putere':'putere', 
                        'Nr_locuri':'locuri', 'An_fabr':'anFabricatie', 'Masa_max':'masaTehnica'}) # to have same col heads as the original tables
    .query("idPolita > 2230446") # policies with smaller IDs are from before 2015
)

clean_obj = pdc.downcast(clean_obj)
clean_obj.head()

Unnamed: 0,idPolita,tipVehicul,categorie,marca,model,capacitate,putere,locuri,anFabricatie,masaTehnica
1,2272279,Inmatriculat,Autoturism,FORD,FOCUS,1753,66,5,2000,1755
2,2356008,Inmatriculat,Autoturism,PEUGEOT,207,1360,55,5,2007,1450
3,2465166,Inmatriculat,Autoturism,SEAT,LEON,1598,77,5,2003,1717
4,2567957,Inmatriculat,Autoturism,OPEL,CORSA,973,40,5,2000,1390
5,2691786,Inmatriculat,Autoturism,DACIA,LOGAN,1461,48,5,2004,1540


In [36]:
# Attribute 6
df = s3con.read('att6_category_type.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print(df.shape[0])
print(con.shape[0])

27598519
27601693


In [41]:
# save to S3 bucket and remove
con.to_feather('att6_category_type.feather')
s3con.write('att6_category_type.feather')
os.remove('att6_category_type.feather')

In [46]:
# Attribute 7
df = s3con.read('att7_make.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598519
Size after concat: 27601693
Complete duplicates: 0
Policy ID duplicates: 0


In [47]:
# save to S3 bucket and remove
con.to_feather('att7_make.feather')
s3con.write('att7_make.feather')
os.remove('att7_make.feather')

In [48]:
# Attribute 8
df = s3con.read('att8_capacity.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598519
Size after concat: 27601693
Complete duplicates: 0
Policy ID duplicates: 0


In [51]:
# save to S3 bucket and remove
con.to_feather('att8_capacity.feather')
s3con.write('att8_capacity.feather')
os.remove('att8_capacity.feather')

In [52]:
# Attribute 9
df = s3con.read('att9_power.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598519
Size after concat: 27601693
Complete duplicates: 0
Policy ID duplicates: 0


In [53]:
# save to S3 bucket and remove
con.to_feather('att9_power.feather')
s3con.write('att9_power.feather')
os.remove('att9_power.feather')

In [54]:
# Attribute 10
df = s3con.read('att10_seats.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598519
Size after concat: 27601693
Complete duplicates: 0
Policy ID duplicates: 0


In [55]:
# save to S3 bucket and remove
con.to_feather('att10_seats.feather')
s3con.write('att10_seats.feather')
os.remove('att10_seats.feather')

In [56]:
# Attribute 11
df = s3con.read('att11_year.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598519
Size after concat: 27601693
Complete duplicates: 0
Policy ID duplicates: 0


In [57]:
# save to S3 bucket and remove
con.to_feather('att11_year.feather')
s3con.write('att11_year.feather')
os.remove('att11_year.feather')

In [58]:
# Attribute 12
df = s3con.read('att12_weight.feather')
con = pd.concat([df, clean_obj], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598519
Size after concat: 27601693
Complete duplicates: 0
Policy ID duplicates: 0


In [59]:
# save to S3 bucket and remove
con.to_feather('att12_weight.feather')
s3con.write('att12_weight.feather')
os.remove('att12_weight.feather')

## POLICIES

In [62]:
new_pol.head()

Unnamed: 0,ID,tip,data_Start,data_End,BM,Prima,valuta,sumaAsig,factorReducere,factorMajorare,...,NULL.14,NULL.15,NULL.16,NULL.17,NULL.18,NULL.19,NULL.20,NULL.21,NULL.22,NULL.23
0,32972,RCA,2012-01-03,2012-07-02,B0,186.0,RON,0,Fara,,...,,,,,,,,,,
1,2272279,RCA,2015-01-24,2016-01-23,B4,260.17,RON,0,Pensionar,,...,,,,,,,,,,
2,2356008,RCA,2015-03-05,2016-03-04,B2,436.2,RON,0,Fara,,...,,,,,,,,,,
3,2465166,RCA,2015-04-06,2015-05-05,B0,70.85,RON,0,Fara,,...,,,,,,,,,,
4,2567957,RCA,2015-05-08,2016-05-07,B7,263.38,RON,0,Rural,,...,,,,,,,,,,


In [70]:
print('Row count', new_pol.shape[0])
print('Complete duplicates', new_pol.duplicated().sum())
print('Policy ID duplicates', new_pol.duplicated('ID').sum())
print('Nans', new_pol.duplicated().sum())

Row count 3175
Complete duplicates 0
Policy ID duplicates 0
Nans 0


In [72]:
new_pol.columns

Index(['ID', 'tip', 'data_Start', 'data_End', 'BM', 'Prima', 'valuta',
       'sumaAsig', 'factorReducere', 'factorMajorare', 'data_emitere',
       'nrRate', 'NULL', 'NULL.1', 'NULL.2', 'NULL.3', 'NULL.4', 'NULL.5',
       'NULL.6', 'NULL.7', 'NULL.8', 'NULL.9', 'NULL.10', 'NULL.11', 'NULL.12',
       'NULL.13', 'NULL.14', 'NULL.15', 'NULL.16', 'NULL.17', 'NULL.18',
       'NULL.19', 'NULL.20', 'NULL.21', 'NULL.22', 'NULL.23'],
      dtype='object')

In [73]:
clean_pol = (
    new_pol
    .filter(items = ['ID', 'data_Start', 'data_End', 'BM', 'data_emitere','nrRate'])
    .rename(columns = {'ID':'idPolita', 'BM':'clasaBM'}) # to have same col heads as the original tables
    .query("idPolita > 2230446") # policies with smaller IDs are from before 2015
)

clean_pol = pdc.downcast(clean_pol)
clean_pol.head()

Unnamed: 0,idPolita,data_Start,data_End,clasaBM,data_emitere,nrRate
1,2272279,2015-01-24,2016-01-23,B4,2015-01-21,0
2,2356008,2015-03-05,2016-03-04,B2,2015-03-04,0
3,2465166,2015-04-06,2015-05-05,B0,2015-04-06,0
4,2567957,2015-05-08,2016-05-07,B7,2015-05-07,0
5,2691786,2015-06-18,2016-06-17,B12,2015-06-17,0


In [78]:
# Attribute 13
df = s3con.read('att13_bonus_malus.feather')
con = pd.concat([df, clean_pol], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598532
Size after concat: 27601706
Complete duplicates: 0
Policy ID duplicates: 0


In [80]:
# save to S3 bucket and remove
con.to_feather('att13_bonus_malus.feather')
s3con.write('att13_bonus_malus.feather')
os.remove('att13_bonus_malus.feather')

In [81]:
# Attribute 14
df = s3con.read('att14_no_rates.feather')
con = pd.concat([df, clean_pol], join = 'inner', ignore_index = True)
print('Original size:', df.shape[0])
print('Size after concat:', con.shape[0])
print('Complete duplicates:', con.duplicated().sum())
print('Policy ID duplicates:', con.duplicated('idPolita').sum())

Original size: 27598532
Size after concat: 27601706
Complete duplicates: 0
Policy ID duplicates: 0


In [84]:
# save to S3 bucket and remove
con.to_feather('att14_no_rates.feather')
s3con.write('att14_no_rates.feather')
os.remove('att14_no_rates.feather')

### Exposures

First, create a table with the maturity and yearly exposures for the new policies.

In [4]:
# convert date columns to a date format
new_pol['data_Start'] = [datetime.strptime(new_pol.data_Start.iloc[x], "%Y-%m-%d") for x in range(len(new_pol.data_Start))]
new_pol['data_End'] = [datetime.strptime(new_pol.data_End.iloc[x], "%Y-%m-%d") for x in range(len(new_pol.data_End))]

In [5]:
# create a new, clean policies table
# create new columns for maturity and exposures
# NOTE: for exposures spanning 2 years I will first only add the exposure of the star year
clean_pol = (
    new_pol
    .filter(items = ['ID', 'data_Start', 'data_End'])
    .rename(columns = {'ID':'idPolita', 'data_Start':'dataStart', 
                        'data_End':'dataEnd'}) # to have same col heads as the original tables
    .assign(maturity = [round(((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/30.5) 
                        for x in range(new_pol.shape[0])],

            exp_2015 = [0 if new_pol.data_Start.iloc[x].year != 2015 
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2015
                        else ((datetime.strptime('2015-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])],

            exp_2016 = [0 if new_pol.data_Start.iloc[x].year != 2016
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2016
                        else ((datetime.strptime('2016-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])],

            exp_2017 = [0 if new_pol.data_Start.iloc[x].year != 2017
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2017
                        else ((datetime.strptime('2017-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])],

            exp_2018 = [0 if new_pol.data_Start.iloc[x].year != 2018
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2018
                        else ((datetime.strptime('2018-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])],

            exp_2019 = [0 if new_pol.data_Start.iloc[x].year != 2019
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2019
                        else ((datetime.strptime('2019-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])],

            exp_2020 = [0 if new_pol.data_Start.iloc[x].year != 2020
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2020
                        else ((datetime.strptime('2020-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])],

            exp_2021 = [0 if new_pol.data_Start.iloc[x].year != 2021
                        else ((new_pol.data_End.iloc[x] - new_pol.data_Start.iloc[x]).days)/365 if new_pol.data_End.iloc[x].year == 2021
                        else ((datetime.strptime('2021-12-31', "%Y-%m-%d") - new_pol.data_Start.iloc[x]).days)/365
                        for x in range(new_pol.shape[0])]                 
            )
    .query("idPolita > 2230446") # policies with smaller IDs are from before 2015
            )
clean_pol.reset_index(drop = True, inplace = True)
clean_pol.head()

Unnamed: 0,idPolita,dataStart,dataEnd,maturity,exp_2015,exp_2016,exp_2017,exp_2018,exp_2019,exp_2020,exp_2021
0,2272279,2015-01-24,2016-01-23,12,0.934247,0.0,0.0,0.0,0.0,0.0,0.0
1,2356008,2015-03-05,2016-03-04,12,0.824658,0.0,0.0,0.0,0.0,0.0,0.0
2,2465166,2015-04-06,2015-05-05,1,0.079452,0.0,0.0,0.0,0.0,0.0,0.0
3,2567957,2015-05-08,2016-05-07,12,0.649315,0.0,0.0,0.0,0.0,0.0,0.0
4,2691786,2015-06-18,2016-06-17,12,0.536986,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# for policies spanning 2 years, the part from the second year will be added here
clean_pol.exp_2016 = clean_pol.exp_2016 + [((clean_pol.dataEnd.iloc[x] - datetime.strptime('2016-01-01', "%Y-%m-%d")).days)/365 
        if (clean_pol.dataEnd.iloc[x].year == 2016) & (clean_pol.dataStart.iloc[x].year != 2016) 
        else 0 for x in range(clean_pol.shape[0])]

clean_pol.exp_2017 = clean_pol.exp_2017 + [((clean_pol.dataEnd.iloc[x] - datetime.strptime('2017-01-01', "%Y-%m-%d")).days)/365 
        if (clean_pol.dataEnd.iloc[x].year == 2017) & (clean_pol.dataStart.iloc[x].year != 2017) 
        else 0 for x in range(clean_pol.shape[0])]

clean_pol.exp_2018 = clean_pol.exp_2018 + [((clean_pol.dataEnd.iloc[x] - datetime.strptime('2018-01-01', "%Y-%m-%d")).days)/365 
        if (clean_pol.dataEnd.iloc[x].year == 2018) & (clean_pol.dataStart.iloc[x].year != 2018) 
        else 0 for x in range(clean_pol.shape[0])]

clean_pol.exp_2019 = clean_pol.exp_2019 + [((clean_pol.dataEnd.iloc[x] - datetime.strptime('2019-01-01', "%Y-%m-%d")).days)/365 
        if (clean_pol.dataEnd.iloc[x].year == 2019) & (clean_pol.dataStart.iloc[x].year != 2019) 
        else 0 for x in range(clean_pol.shape[0])]

clean_pol.exp_2020 = clean_pol.exp_2020 + [((clean_pol.dataEnd.iloc[x] - datetime.strptime('2020-01-01', "%Y-%m-%d")).days)/365 
        if (clean_pol.dataEnd.iloc[x].year == 2020) & (clean_pol.dataStart.iloc[x].year != 2020) 
        else 0 for x in range(clean_pol.shape[0])]

clean_pol.exp_2021 = clean_pol.exp_2021 + [((clean_pol.dataEnd.iloc[x] - datetime.strptime('2021-01-01', "%Y-%m-%d")).days)/365 
        if (clean_pol.dataEnd.iloc[x].year == 2021) & (clean_pol.dataStart.iloc[x].year != 2021) 
        else 0 for x in range(clean_pol.shape[0])]

In [7]:
clean_pol.sample(10)

Unnamed: 0,idPolita,dataStart,dataEnd,maturity,exp_2015,exp_2016,exp_2017,exp_2018,exp_2019,exp_2020,exp_2021
1240,15291990,2019-05-03,2020-05-02,12,0.0,0.0,0.0,0.0,0.663014,0.334247,0.0
127,4815538,2016-12-02,2017-12-01,12,0.0,0.079452,0.915068,0.0,0.0,0.0,0.0
2460,23953684,2020-11-24,2021-11-23,12,0.0,0.0,0.0,0.0,0.0,0.10137,0.893151
2904,27252491,2021-05-05,2022-05-04,12,0.0,0.0,0.0,0.0,0.0,0.0,0.657534
586,10508101,2018-05-18,2019-05-17,12,0.0,0.0,0.0,0.621918,0.372603,0.0,0.0
952,13491605,2019-01-01,2019-12-31,12,0.0,0.0,0.0,0.0,0.99726,0.0,0.0
1870,19425126,2020-02-11,2021-02-10,12,0.0,0.0,0.0,0.0,0.0,0.887671,0.109589
2561,24725148,2020-12-16,2021-12-15,12,0.0,0.0,0.0,0.0,0.0,0.041096,0.953425
534,10038012,2018-04-27,2019-04-26,12,0.0,0.0,0.0,0.679452,0.315068,0.0,0.0
2050,21232826,2020-05-25,2021-05-24,12,0.0,0.0,0.0,0.0,0.0,0.60274,0.391781


In [8]:
# save as feather
clean_pol = pdc.downcast(clean_pol)
clean_pol.to_feather('leftover_clients_exposures.feather')

Combine the new table with the original one.

In [14]:
exp = s3con.read('polite_exp.feather')
exp.head()

Unnamed: 0,idPolita,dataStart,dataEnd,maturity,exp_2015,exp_2016,exp_2017,exp_2018,exp_2019,exp_2020,exp_2021,exp_2022
0,2230446,2015-01-04,2015-07-03,6,0.4932,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2230447,2015-01-02,2015-07-01,6,0.4932,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2230448,2015-01-04,2016-01-03,12,0.989,0.0055,0.0,0.0,0.0,0.0,0.0,0.0
3,2230449,2015-01-19,2016-01-18,12,0.9479,0.0466,0.0,0.0,0.0,0.0,0.0,0.0
4,2230450,2015-01-02,2016-01-01,12,0.9945,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
exp.drop('exp_2022', axis = 1, inplace=True)

In [6]:
clean_pol = pd.read_feather('leftover_clients_exposures.feather')
exp_com = pd.concat([exp, clean_pol], axis = 0, ignore_index = True)

In [8]:
exp_com = pdc.downcast(exp_com)
exp_com.info()

  uniques = table.unique(values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27601706 entries, 0 to 27601705
Data columns (total 11 columns):
 #   Column     Dtype   
---  ------     -----   
 0   idPolita   uint32  
 1   dataStart  category
 2   dataEnd    category
 3   maturity   uint8   
 4   exp_2015   category
 5   exp_2016   category
 6   exp_2017   category
 7   exp_2018   category
 8   exp_2019   category
 9   exp_2020   category
 10  exp_2021   category
dtypes: category(9), uint32(1), uint8(1)
memory usage: 605.7 MB


In [23]:
exp_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27601706 entries, 0 to 27601705
Data columns (total 11 columns):
 #   Column     Dtype   
---  ------     -----   
 0   idPolita   uint32  
 1   dataStart  category
 2   dataEnd    category
 3   maturity   uint8   
 4   exp_2015   category
 5   exp_2016   category
 6   exp_2017   category
 7   exp_2018   category
 8   exp_2019   category
 9   exp_2020   category
 10  exp_2021   category
dtypes: category(9), uint32(1), uint8(1)
memory usage: 605.7 MB


In [29]:
exp_com.exp_2015 = pd.to_numeric(exp_com.exp_2015, downcast="float")
exp_com.exp_2016 = pd.to_numeric(exp_com.exp_2016, downcast="float")
exp_com.exp_2017 = pd.to_numeric(exp_com.exp_2017, downcast="float")
exp_com.exp_2018 = pd.to_numeric(exp_com.exp_2018, downcast="float")
exp_com.exp_2019 = pd.to_numeric(exp_com.exp_2019, downcast="float")
exp_com.exp_2020 = pd.to_numeric(exp_com.exp_2020, downcast="float")
exp_com.exp_2021 = pd.to_numeric(exp_com.exp_2021, downcast="float")

exp_com.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27601706 entries, 0 to 27601705
Data columns (total 11 columns):
 #   Column     Dtype   
---  ------     -----   
 0   idPolita   uint32  
 1   dataStart  category
 2   dataEnd    category
 3   maturity   uint8   
 4   exp_2015   float32 
 5   exp_2016   float32 
 6   exp_2017   float32 
 7   exp_2018   float32 
 8   exp_2019   float32 
 9   exp_2020   float32 
 10  exp_2021   float32 
dtypes: category(2), float32(7), uint32(1), uint8(1)
memory usage: 974.1 MB


In [30]:
exp_com.to_feather('exposures_com.feather')
s3con.write('exposures_com.feather')
os.remove('exposures_com.feather')