In [1]:
# Load modules for data manipulation
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os

In [2]:
load_dotenv()

True

# Read and split data

In [4]:
# clt = pd.read_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ38.csv', encoding='ISO-8859-1', sep='\t', usecols=['COMAX'])
data = pd.read_csv(os.getenv('META_DATA_PATH') + '/' + 'tr35-01.csv', encoding='ISO-8859-1', sep=';', low_memory=False)

In [5]:
# Get only the physical clients, that are libaled by 1
data = data[data['CTPE'] == 1]

In [6]:
for file in [str(i).zfill(2) for i in range(2, 13)]:
    print('tr35-{}.csv'.format(file))
    tr = pd.read_csv(os.getenv('META_DATA_PATH') + '/' + 'tr35-{}.csv'.format(file), encoding='ISO-8859-1', sep=';', low_memory=False)
    tr = tr[tr['CTPE'] == 1]
    data = pd.concat([data, tr])
    
    del tr

tr35-02.csv
tr35-03.csv
tr35-04.csv
tr35-05.csv
tr35-06.csv
tr35-07.csv
tr35-08.csv
tr35-09.csv
tr35-10.csv
tr35-11.csv
tr35-12.csv


In [7]:
# delete duplicated matricules
data = data.drop_duplicates(subset="COMAX")

In [8]:
# take only the age less or equal to 110 for demographoc statistc reasons
data = data[data['QTAGCL'] <= 110]

In [9]:
# we replace the sex with numbers, 0 for M and 1 for F
data['COSEXE'].replace('M', 0, inplace=True)
data['COSEXE'].replace('F', 1, inplace=True)

In [10]:
# The non predictif data, all zeros data, codevi data 
cols = ['QCPRPO', 'QTEQUI', 'PSGASS', 'MTUTCT', 'MTAUCT', 'QCTFEFCT', 'QCFEFMLT', 'QCTFEFXFC', 'QCFEISAS',
        'QCFEINTR', 'QCFETRES', 'QFAMESGR', 'QCPEAICF', 'QCPEAIBQ', 'QCPEAIFC', 'QCPEAIFL', 'QCPEAITR',
        'QCPEAIIR', 'QCPEAIAP', 'QCPEAIDI', 'QFPEAISR', 'QCPEAIBP', 'COETB', 'COPFCI', 'COEM',
        'COESPF', 'CTPE', 'COGRRB', 'COSGPA', 'COCINS', 'COPOAG', 'CORIPA', 'CESITC', 'CERCPT',
        'CERCPE', 'CERCEP', 'CERCPS', 'CERCPL', 'CERCPC', 'CERCPP', 'COHAVI', 'QCLDD', 'QCCSL', 'QCLIVJ',
        'QCLEP', 'QCLIVR', 'QCCONT', 'MTELDD', 'MTELIJ', 'QCBPFA']

In [11]:
# delete all the none relative data
for col in cols:
    try:
        del data[col]
    except:
        print(col)

COETB
CESITC
COHAVI


# Add the post data

In [12]:
# Read the post codes data
poste =  pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj24.csv', encoding='ISO-8859-1', sep=';')
pc = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'post_codes.csv', encoding='ISO-8859-1', sep='\t')

In [13]:
# It was int, convert it to str
pc['COPOST'] = pc['COPOST'].astype(str)

In [14]:
# concatenate clients with the latitide longtitue data
temp = pd.merge(poste, pc, on='COPOST', how='left')
data = pd.merge(data, temp, on='COMAX', how='left')

In [15]:
# Clean the dataset
del data['COPOST_y']
data.rename(columns={'COPOST_x': 'COPOST'}, inplace=True)

In [16]:
# get the list of miss typed data, CTSCPI
lol = list()
for i in data['CTSCPI'].unique():
    if not i.isdigit():
        lol.append(i)
        
# replce the missed data, and miss created one with nan values
data['CTSCPI'].replace(lol, np.nan, inplace=True)

In [17]:
# get the list of miss typed data, for COPOST
lol = list()
for i in data['COPOST'].unique():
    if not str(i).isdigit():
        lol.append(i)
        
# replce the missed data, and miss created one with nan values
data['COPOST'].replace(lol, np.nan, inplace=True)
data['LAT'].replace(lol, np.nan, inplace=True)
data['LON'].replace(lol, np.nan, inplace=True)

In [18]:
# delete duplicated matricules
data = data.drop_duplicates(subset="COMAX")

In [19]:
# in some cases there was an input mistake
# so instead of typing the age they type the
# birth year, so we correct that
data['QTAGCL'] = data['QTAGCL'].replace(2019, 2021-2019)

## Data Info

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580778 entries, 0 to 2773641
Columns: 116 entries, COMAX to LON
dtypes: float64(70), int64(1), object(45)
memory usage: 518.4+ MB


In [25]:
data['COMAX'].nunique()

580778

# Save the data

In [26]:
# save into csv
data.to_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TR35.csv', index=False, sep="\t")

In [21]:
# consult final data set shape
data.shape

(580778, 116)

In [22]:
# no data is missing
for i in zip(data.isna().any().index, data.isna().any().values):
    if i[1]:
        print(i)

('COPOST', True)
('CTSCPI', True)
('MTCDIM', True)
('MTRECD', True)
('MTRETT', True)
('QCPRDI', True)
('QCPRCP', True)
('QCPRQU', True)
('QCPRMO', True)
('QCPRFI', True)
('QCPRTR', True)
('QCPRAS', True)
('QCPRPR', True)
('MTECEL', True)
('MTEPEL', True)
('MTECSL', True)
('MTEPEP', True)
('MTELEP', True)
('MTECIM', True)
('MTECCS', True)
('MTECEQ', True)
('MTECTR', True)
('MTEEML', True)
('MTEASV', True)
('MTECTO', True)
('MTECPA', True)
('MTE19D', True)
('MTE21D', True)
('MTEEMC', True)
('MTESOC', True)
('MCTOTA', True)
('MCTOTE', True)
('QTCLIF', True)
('QCPREV', True)
('QCIARD', True)
('MTFLPR', True)
('MTFLCO', True)
('MTRSFI', True)
('MTRSMO', True)
('QCFPRORB', True)
('QFPROSGR', True)
('LAT', True)
('LON', True)
