We start by importing pandas.
Next, we load the invoice dataset.
Finally, we check the first few rows of the dataset.

In [2]:
import pandas as pd

# Read the raw invoice records (one row per invoice) from the CSV file.
invoice_df = pd.read_csv(filepath_or_buffer="Data/invoice.csv")

# Quick look at the first five rows to check the columns and values.
print(invoice_df.head(n=5))

   id        date  tarif_type  counter_statue  reading_remarque  \
0   0   24/3/2014          11               0                 8   
1   0   29/3/2013          11               0                 6   
2   0   23/3/2015          11               0                 8   
3   0   13/7/2015          11               0                 8   
4   0  17/11/2016          11               0                 9   

   consommation_level_4  months_number counter_type  counter_coefficient  \
0                     0              4         ELEC                    1   
1                     0              4         ELEC                    1   
2                     0              4         ELEC                    1   
3                     0              4         ELEC                    1   
4                     0             12         ELEC                    1   

   consommation_level_1  consommation_level_2  consommation_level_3  
0                    82                     0                     0  


Next, we parse the `date` column into datetime objects.
This ensures that dates are handled correctly in later steps.

In [3]:
# Convert the 'date' strings to datetime64 values.
# The invoice dates are written as day/month/year (e.g. 24/3/2014).
# An explicit format is used instead of dayfirst=True: dayfirst is only a
# parsing preference, not a strict rule, so a value in another layout could be
# silently misread. With a format, any value that is not day/month/year raises
# a ValueError instead.
invoice_df['date'] = pd.to_datetime(invoice_df['date'], format='%d/%m/%Y')

print(invoice_df['date'].head())

0   2014-03-24
1   2013-03-29
2   2015-03-23
3   2015-07-13
4   2016-11-17
Name: date, dtype: datetime64[ns]


We intend to classify clients, so we summarise the invoice data by client. For each client, we compute summary statistics (sum, mean, maximum and standard deviation) of each consumption level, along with the number of invoices.

In [4]:
# Summarise the invoice rows into one row per client 'id'.
#
# Named aggregation is used so that every output column gets its final name
# right next to the statistic that produces it; no positional renaming of the
# columns is needed afterwards.
# Output columns: cons_level_<n>_<stat> for n in 1..4 and stat in
# sum/mean/max/std, in that order, followed by num_invoices.
consumption_aggs = {
    f'cons_level_{level}_{stat}': (f'consommation_level_{level}', stat)
    for level in (1, 2, 3, 4)
    for stat in ('sum', 'mean', 'max', 'std')
}

invoice_by_id = (
    invoice_df
    .groupby('id')
    .agg(
        **consumption_aggs,
        # 'size' counts every invoice row of the client; 'count' would skip
        # rows where counter_statue is missing and under-report the total.
        num_invoices=('counter_statue', 'size'),
    )
    .reset_index()
)
# NOTE(review): 'std' is the sample standard deviation, so it is NaN for a
# client with a single invoice; those NaNs are written to the CSV as empty
# fields. Confirm that downstream steps handle them.

# Preview the data
print(invoice_by_id.head())

# Save as new file
invoice_by_id.to_csv("Data/invoice_cleaned.csv", index=False)


    id  cons_level_1_sum  cons_level_1_mean  cons_level_1_max  \
0    0             12334         352.400000              1200   
1    1             20629         557.540541              1207   
2   10             14375         798.611111              2400   
3   12              5724         357.750000               925   
4  100                24           1.200000                15   

   cons_level_1_std  cons_level_2_sum  cons_level_2_mean  cons_level_2_max  \
0        310.343472               370          10.571429               186   
1        197.935960                 0           0.000000                 0   
2        513.841374               682          37.888889               682   
3        232.908995              1740         108.750000               422   
4          3.607011                 0           0.000000                 0   

   cons_level_2_std  cons_level_3_sum  cons_level_3_mean  cons_level_3_max  \
0         43.568935                 0              0.000      

Now we merge the two cleaned datasets (client data and per-client invoice summary) on the client `id`. Only clients that appear in both datasets are kept.

In [7]:
# Load the client table and the per-client invoice summary saved earlier.
# NOTE: `invoice_df` is rebound here from the raw invoices to the aggregated
# per-client summary read back from disk.
client_df = pd.read_csv("Data/client.csv")
invoice_df = pd.read_csv("Data/invoice_cleaned.csv")

# Inner join on 'id': only clients present in both tables are kept.
# validate='one_to_one' makes pandas raise a MergeError if 'id' is duplicated
# in either table, instead of silently multiplying rows in the result.
merged_df = pd.merge(
    client_df,
    invoice_df,
    on='id',
    how='inner',
    validate='one_to_one',
)

print(merged_df.head())

merged_df.to_csv("Data/merged_data.csv", index=False)

   region        date  dis    id  catg  target  cons_level_1_sum  \
0     101  31/12/1994   60     0    11       0             12334   
1     107   29/5/2002   69     1    11       0             20629   
2     301   13/3/1986   62    10    11       0             14375   
3     105   11/7/1996   69   100    11       0                24   
4     303  14/10/2014   62  1000    11       0              9292   

   cons_level_1_mean  cons_level_1_max  cons_level_1_std  ...  \
0         352.400000              1200        310.343472  ...   
1         557.540541              1207        197.935960  ...   
2         798.611111              2400        513.841374  ...   
3           1.200000                15          3.607011  ...   
4         663.714286               800        224.831365  ...   

   cons_level_2_std  cons_level_3_sum  cons_level_3_mean  cons_level_3_max  \
0         43.568935                 0           0.000000                 0   
1          0.000000                 0       