In [1]:
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os

In [2]:
load_dotenv()

True

In [3]:
clients = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj38.csv', encoding='ISO-8859-1', sep=';')
poste =  pd.read_csv(os.getenv('DATA_PATH') + '/' + 'tj24.csv', encoding='ISO-8859-1', sep=';')
pc = pd.read_csv(os.getenv('DATA_PATH') + '/' + 'post_codes.csv', encoding='ISO-8859-1', sep='\t')

## Shapes

In [4]:
clients.shape

(2159265, 7)

In [5]:
poste.shape

(859128, 2)

In [6]:
pc.shape

(51667, 3)

## Info

In [10]:
clients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159265 entries, 0 to 2159264
Data columns (total 7 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   COMAX   object
 1   AGE     int64 
 2   COSEXE  object
 3   CTSCPI  object
 4   CESITC  int64 
 5   DDVALE  object
 6   DFVALE  object
dtypes: int64(2), object(5)
memory usage: 115.3+ MB


In [12]:
poste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859128 entries, 0 to 859127
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   COMAX   859128 non-null  object
 1   COPOST  859128 non-null  object
dtypes: object(2)
memory usage: 13.1+ MB


In [13]:
pc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51667 entries, 0 to 51666
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   COPOST  51667 non-null  int64  
 1   LAT     51667 non-null  float64
 2   LON     51667 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 1.2 MB


In [16]:
pc['COPOST'] = pc['COPOST'].astype(str)

## Inspect

In [41]:
poste.head(1)

Unnamed: 0,COMAX,COPOST
0,001932f4aa,66420


In [42]:
clients.head(1)

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE
0,6e3a2b9fa1,55,F,4600,1,2013-02-12,9999-01-01


In [7]:
pc.head(1)

Unnamed: 0,COPOST,LAT,LON
0,75000,48.8534,2.3488


## Merge

In [17]:
# concatenate clients with the latitide longtitue data
data = pd.merge(poste, pc, on='COPOST', how='left')

In [18]:
data.head(1)

Unnamed: 0,COMAX,COPOST,LAT,LON
0,001932f4aa,66420,42.7877,3.0366


In [19]:
# concatenate clients with the code post
data = pd.merge(clients, data, on='COMAX', how='left')

In [20]:
data.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE,COPOST,LAT,LON
0,6e3a2b9fa1,55,F,4600,1,2013-02-12,9999-01-01,34170,43.6333,3.9
1,1b44a67f61,41,M,4700,1,2019-05-29,9999-01-01,34490,43.3846,3.1689
2,1b44a67f61,41,M,4700,1,2019-05-29,9999-01-01,34490,43.5091,3.0763
3,1b44a67f61,41,M,4700,1,2019-05-29,9999-01-01,34490,43.4742,3.0863
4,1b44a67f61,41,M,4700,1,2019-05-29,9999-01-01,34490,43.4333,3.1333


In [24]:
data.shape

(854706, 10)

## Corp The Data

In [21]:
# delete duplicated matricules
data = data.drop_duplicates(subset="COMAX")

In [22]:
# in some cases there was an input mistake
# so instead of typing the age they type the
# birth year, so we correct that
data = data.replace(2019, 2021-2019)

In [23]:
# take only the age less or equal to 110 for demographoc statistc reasons
data = data[data['AGE'] <= 110]

In [25]:
# get the list of miss typed data
lol = list()
for i in data['CTSCPI'].unique():
    if not i.isdigit():
        lol.append(i)

# replce the missed data, and miss created one with nan values
data['CTSCPI'].replace(lol, np.nan, inplace=True)

In [26]:
# add the zero, since in the corresponding table of topologies
# it is an unkown category
data['CTSCPI'].replace(0., np.nan, inplace=True)

# delete the rows with nan values
data.dropna(subset=['CTSCPI'], inplace=True)

In [27]:
data["CTSCPI"] = data["CTSCPI"].astype(int)

In [28]:
# we replace the sex with numbers, 0 for M and 1 for F
data['COSEXE'].replace('M', 0, inplace=True)
data['COSEXE'].replace('F', 1, inplace=True)

In [29]:
data.shape

(831367, 10)

In [30]:
data.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE,COPOST,LAT,LON
0,6e3a2b9fa1,55,1,4600,1,2013-02-12,9999-01-01,34170.0,43.6333,3.9
1,1b44a67f61,41,0,4700,1,2019-05-29,9999-01-01,34490.0,43.3846,3.1689
36,7e33583438,74,1,8600,0,2011-12-22,2017-10-16,73100.0,45.695,5.9537
69,b65dd1ba1d,41,0,8400,0,2011-12-22,2017-09-13,,,
70,bdf1f63a98,32,1,3500,1,2017-12-14,9999-01-01,79800.0,46.373,-0.1872


In [31]:
lol = list()

for i in data['COPOST'].unique():
    if not str(i).isdigit():
        lol.append(i)

In [32]:
# replce the missed data, and miss created one with nan values
data['COPOST'].replace(lol, np.nan, inplace=True)
data['LAT'].replace(lol, np.nan, inplace=True)
data['LON'].replace(lol, np.nan, inplace=True)


# delete the rows with nan values
data.dropna(subset=['COPOST'], inplace=True)
data.dropna(subset=['LAT'], inplace=True)
data.dropna(subset=['LON'], inplace=True)

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600870 entries, 0 to 9661354
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   COMAX   600870 non-null  object 
 1   AGE     600870 non-null  int64  
 2   COSEXE  600870 non-null  int64  
 3   CTSCPI  600870 non-null  int64  
 4   CESITC  600870 non-null  int64  
 5   DDVALE  600870 non-null  object 
 6   DFVALE  600870 non-null  object 
 7   COPOST  600870 non-null  object 
 8   LAT     600870 non-null  float64
 9   LON     600870 non-null  float64
dtypes: float64(2), int64(4), object(4)
memory usage: 50.4+ MB


In [35]:
data['CTSCPI'] = data['CTSCPI'].astype(str)

In [36]:
data.shape

(600870, 10)

In [37]:
data.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE,COPOST,LAT,LON
0,6e3a2b9fa1,55,1,4600,1,2013-02-12,9999-01-01,34170,43.6333,3.9
1,1b44a67f61,41,0,4700,1,2019-05-29,9999-01-01,34490,43.3846,3.1689
36,7e33583438,74,1,8600,0,2011-12-22,2017-10-16,73100,45.695,5.9537
70,bdf1f63a98,32,1,3500,1,2017-12-14,9999-01-01,79800,46.373,-0.1872
91,bc9f2a4a5c,23,1,8400,0,2019-09-09,2020-07-09,66400,42.4853,2.748


## Save The Data

In [38]:
data.to_csv(os.getenv('CLEANED_DATA_PATH') + '/' + 'TJ38.csv', index=False, sep="\t")