In [1]:
import numpy as np
import pandas
import matplotlib.pyplot as plt

In [2]:
# Load the raw export: semicolon-separated, ISO-8859-1 encoded.
df = pandas.read_csv(
    "data/tj38.csv",
    sep=";",
    encoding="ISO-8859-1",
)

## Inspect the data

In [3]:
# Summarise the loaded frame: number of rows, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159265 entries, 0 to 2159264
Data columns (total 7 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   COMAX   object
 1   AGE     int64 
 2   COSEXE  object
 3   CTSCPI  object
 4   CESITC  int64 
 5   DDVALE  object
 6   DFVALE  object
dtypes: int64(2), object(5)
memory usage: 115.3+ MB


In [5]:
# Display the frame; the rendering is truncated to the first and last rows.
df

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE
0,6e3a2b9fa1,55,F,4600,1,2013-02-12,9999-01-01
1,1b44a67f61,41,M,4700,1,2019-05-29,9999-01-01
2,1b44a67f61,41,M,4700,0,2019-02-07,2019-03-21
3,1b44a67f61,41,M,4700,0,2019-02-04,2019-02-05
4,1b44a67f61,41,M,4700,0,2019-03-21,2019-05-29
...,...,...,...,...,...,...,...
2159260,156b68c630,67,M,5400,1,2020-05-15,9999-01-01
2159261,b8469fdb86,59,F,5200,0,2011-12-22,2017-09-13
2159262,d006611100,81,M,7300,1,2011-04-11,9999-01-01
2159263,2ac6fa94b8,46,F,4300,1,2020-06-05,9999-01-01


## Crop the data

In [6]:
# Keep a single row per matricule (COMAX); the first occurrence is the one retained.
df = df.drop_duplicates(subset=["COMAX"], keep="first")

In [7]:
# Number of non-null COMAX values currently in the frame.
df["COMAX"].count()

857419

In [8]:
# Some records were entered with the birth year in the AGE field instead of
# the age. Convert those values to an age relative to REFERENCE_YEAR.
#
# The correction is restricted to the AGE column: a frame-wide
# `df.replace(2019, ...)` would also rewrite any other numeric cell that
# happens to equal 2019. Every year-like value is handled, not just 2019.
REFERENCE_YEAR = 2021
EARLIEST_BIRTH_YEAR = 1900  # anything from here up to REFERENCE_YEAR is treated as a year

age_is_birth_year = df["AGE"].between(EARLIEST_BIRTH_YEAR, REFERENCE_YEAR)
df = df.assign(
    AGE=df["AGE"].where(~age_is_birth_year, REFERENCE_YEAR - df["AGE"])
)

In [9]:
# List the distinct AGE values above 100 that are present in the frame.
df.loc[df["AGE"] > 100, "AGE"].unique()

array([101, 111, 103, 107, 121, 106, 105, 104, 108, 114, 110, 115, 102,
       112, 109, 118, 113, 122, 138, 119, 124, 116, 127, 117, 120, 125,
       123, 136, 137, 129, 131, 126])

In [10]:
# Flag the CTSCPI values that are not made only of digits (mistyped codes).
# Casting to str first keeps the check safe when the column contains NaN or
# non-string values, which would make a plain `value.isdigit()` call raise.
ctscpi_is_numeric = df["CTSCPI"].astype(str).str.isdigit()

# Distinct invalid codes, in order of first appearance (kept for inspection;
# the name `lol` is preserved in case other cells refer to it).
lol = df.loc[~ctscpi_is_numeric, "CTSCPI"].unique().tolist()

# Replace the mistyped codes with NaN. The column is assigned back explicitly:
# a chained `df['CTSCPI'].replace(..., inplace=True)` may act on a temporary
# copy and leave df unchanged.
df["CTSCPI"] = df["CTSCPI"].where(ctscpi_is_numeric)

In [11]:
# Code zero is also treated as missing, since in the corresponding reference
# table it is an unknown category. The codes are still stored as strings at
# this point, so they are compared numerically; replacing the float 0.0
# directly would never match "0".
ctscpi_numeric = pandas.to_numeric(df["CTSCPI"], errors="coerce")
df = df.assign(CTSCPI=df["CTSCPI"].where(ctscpi_numeric != 0))

# Delete the rows whose CTSCPI is missing.
df = df.dropna(subset=["CTSCPI"])

In [12]:
# Cast the cleaned CTSCPI codes to integers, then list the distinct values.
df = df.assign(CTSCPI=df["CTSCPI"].astype(int))
df["CTSCPI"].unique()

array([4600, 4700, 8600, 8400, 3500, 7600, 6200, 8500, 6300, 4300, 6700,
       5400, 2300, 3700, 5500, 4200, 4500, 6400, 3100, 3300, 4800, 5200,
       7300, 5600, 3400, 2200, 2100, 5300, 6900, 1100, 3800, 7200, 7400,
       6800, 8100, 7800, 7700, 7100, 6500, 7500, 4400, 1200, 8200, 3600,
       6600, 1300, 6100, 5100, 8300, 4100, 1000, 3200])

## Save only the data that will serve us

In [13]:
# Encode the sex as a number: 0 for M and 1 for F. Any other value is left
# untouched. The result is assigned back to the column because a chained
# `df['COSEXE'].replace(..., inplace=True)` may act on a temporary copy and
# leave df unchanged.
df["COSEXE"] = df["COSEXE"].replace({"M": 0, "F": 1})

In [14]:
# Preview the first rows to check the result of the transformations so far.
df.head()

Unnamed: 0,COMAX,AGE,COSEXE,CTSCPI,CESITC,DDVALE,DFVALE
0,6e3a2b9fa1,55,1,4600,1,2013-02-12,9999-01-01
1,1b44a67f61,41,0,4700,1,2019-05-29,9999-01-01
6,7e33583438,74,1,8600,0,2011-12-22,2017-10-16
9,b65dd1ba1d,41,0,8400,0,2011-12-22,2017-09-13
10,bdf1f63a98,32,1,3500,1,2017-12-14,9999-01-01


In [16]:
# Write only the selected columns to a tab-separated file, without the index.
EXPORT_COLUMNS = ["COMAX", "AGE", "COSEXE", "CTSCPI"]
df.to_csv("physical-clients.csv", columns=EXPORT_COLUMNS, sep="\t", index=False)