In [1]:
import pandas as pd

# Load client and invoice data
client_df = pd.read_csv("Data/client.csv")
invoice_df = pd.read_csv("Data/invoice.csv")

# Preview
print(client_df.head())
print(invoice_df.head())

   region        date  dis    id  catg  target
0     101  31/12/1994   60     0    11       0
1     107   29/5/2002   69     1    11       0
2     301   13/3/1986   62    10    11       0
3     105   11/7/1996   69   100    11       0
4     303  14/10/2014   62  1000    11       0
   id        date  tarif_type  counter_statue  reading_remarque  \
0   0   24/3/2014          11               0                 8   
1   0   29/3/2013          11               0                 6   
2   0   23/3/2015          11               0                 8   
3   0   13/7/2015          11               0                 8   
4   0  17/11/2016          11               0                 9   

   consommation_level_4  months_number counter_type  counter_coefficient  \
0                     0              4         ELEC                    1   
1                     0              4         ELEC                    1   
2                     0              4         ELEC                    1   
3          

Now, we parse dates as datetime objects. 
This ensures date data is processed correctly afterwards.

In [2]:
# Convert date columns to datetime
client_df['date'] = pd.to_datetime(client_df['date'])
invoice_df['date'] = pd.to_datetime(invoice_df['date'])

print(client_df['date'].head())
print(invoice_df['date'].head())

  client_df['date'] = pd.to_datetime(client_df['date'])
  invoice_df['date'] = pd.to_datetime(invoice_df['date'])


0   1994-12-31
1   2002-05-29
2   1986-03-13
3   1996-07-11
4   2014-10-14
Name: date, dtype: datetime64[ns]
0   2014-03-24
1   2013-03-29
2   2015-03-23
3   2015-07-13
4   2016-11-17
Name: date, dtype: datetime64[ns]


Now we will work on the client dataset individually, checking for missing data and duplicates. 

In [3]:
# Check the columns and data types
print(client_df.info())

# Check for duplicates
print(client_df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21652 entries, 0 to 21651
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   region  21652 non-null  int64         
 1   date    21652 non-null  datetime64[ns]
 2   dis     21652 non-null  int64         
 3   id      21652 non-null  int64         
 4   catg    21652 non-null  int64         
 5   target  21652 non-null  int64         
dtypes: datetime64[ns](1), int64(5)
memory usage: 1015.1 KB
None
             region                           date           dis  \
count  21652.000000                          21652  21652.000000   
mean     204.851284  2004-04-01 03:40:36.135229952     63.505034   
min      101.000000            1977-02-05 00:00:00     60.000000   
25%      101.000000            1996-07-11 00:00:00     62.000000   
50%      107.000000            2008-03-06 00:00:00     62.000000   
75%      307.000000            2013-05-29 00:00:00     69.000000   

Summary of our client dataset:

There is 1 column of date-time data and 5 columns of integer data type, including our target column.There are no missing value at all. There are no duplicate rows of data either.

Next, we will correct all data types to ensure our catergorical data are read accordingly, preventing misinterpretation of our categorical data.

In [5]:
# Convert categorical columns to category type 
columns_to_convert = ['region','dis','id','catg','target']
client_df[columns_to_convert] = client_df[columns_to_convert].astype('category')

# Checking that our ategorical columns have been converted
print(client_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21652 entries, 0 to 21651
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   region  21652 non-null  category      
 1   date    21652 non-null  datetime64[ns]
 2   dis     21652 non-null  category      
 3   id      21652 non-null  category      
 4   catg    21652 non-null  category      
 5   target  21652 non-null  category      
dtypes: category(5), datetime64[ns](1)
memory usage: 982.5 KB
None
