Started with pandas.
Imported client and invoice datasets.
Check first few rows of these datasets.

In [2]:
import pandas as pd

# Load client and invoice data
client_df = pd.read_csv("Data/client.csv")
invoice_df = pd.read_csv("Data/invoice.csv")

# Preview
print(client_df.head())
print(invoice_df.head())

   region        date  dis    id  catg  target
0     101  31/12/1994   60     0    11       0
1     107   29/5/2002   69     1    11       0
2     301   13/3/1986   62    10    11       0
3     105   11/7/1996   69   100    11       0
4     303  14/10/2014   62  1000    11       0
   id        date  tarif_type  counter_statue  reading_remarque  \
0   0   24/3/2014          11               0                 8   
1   0   29/3/2013          11               0                 6   
2   0   23/3/2015          11               0                 8   
3   0   13/7/2015          11               0                 8   
4   0  17/11/2016          11               0                 9   

   consommation_level_4  months_number counter_type  counter_coefficient  \
0                     0              4         ELEC                    1   
1                     0              4         ELEC                    1   
2                     0              4         ELEC                    1   
3          

Now, we parse dates as datetime objects. 
This ensures date data is processed correctly afterwards.

In [3]:
# Convert date columns to datetime
client_df['date'] = pd.to_datetime(client_df['date'])
invoice_df['date'] = pd.to_datetime(invoice_df['date'])

print(client_df['date'].head())
print(invoice_df['date'].head())

0   1994-12-31
1   2002-05-29
2   1986-03-13
3   1996-07-11
4   2014-10-14
Name: date, dtype: datetime64[ns]
0   2014-03-24
1   2013-03-29
2   2015-03-23
3   2015-07-13
4   2016-11-17
Name: date, dtype: datetime64[ns]


  client_df['date'] = pd.to_datetime(client_df['date'])
  invoice_df['date'] = pd.to_datetime(invoice_df['date'])


We intend to classify clients, so we tried to summarise invoice data by client. For each client, we computed rough statistics such as the mean and sum for our focus on consumption levels.

In [14]:
# Aggregating invoice data by client 'id'
invoice_by_id = invoice_df.groupby('id').agg({
    'consommation_level_1': ['sum', 'mean', 'max', 'std'],
    'consommation_level_2': ['sum', 'mean', 'max', 'std'],
    'consommation_level_3': ['sum', 'mean', 'max', 'std'],
    'consommation_level_4': ['sum', 'mean', 'max', 'std'],
    'date': ['min', 'max'],     # Add earliest and latest invoice date
    'counter_statue': 'count',  # How many invoices per client
}).reset_index()

invoice_by_id.columns = ['_'.join(col).strip() if col[1] else col[0] for col in invoice_by_id.columns.values]

print(invoice_by_id.head())

invoice_by_id.to_csv("Data/invoice_by_id.csv", index=False)

# Preview the data
print(invoice_by_id.head())

# Rename columns after aggregation
#invoice_agg.columns = ['id', 
#                       'cons_level_1_sum', 'cons_level_1_mean', 'cons_level_1_max',
#                       'cons_level_2_sum', 'cons_level_2_mean', 'cons_level_2_max',
#                       'cons_level_3_sum', 'cons_level_3_mean', 'cons_level_3_max',
#                       'cons_level_4_sum', 'cons_level_4_mean', 'cons_level_4_max',
#                       'num_invoices']

# Merge client_df and invoice_agg on 'id'
#merged_df = pd.merge(client_df, invoice_agg, on='id')

# Preview merged data
#print(merged_df.head())


    id  consommation_level_1_sum  consommation_level_1_mean  \
0    0                     12334                 352.400000   
1    1                     20629                 557.540541   
2   10                     14375                 798.611111   
3   12                      5724                 357.750000   
4  100                        24                   1.200000   

   consommation_level_1_max  consommation_level_1_std  \
0                      1200                310.343472   
1                      1207                197.935960   
2                      2400                513.841374   
3                       925                232.908995   
4                        15                  3.607011   

   consommation_level_2_sum  consommation_level_2_mean  \
0                       370                  10.571429   
1                         0                   0.000000   
2                       682                  37.888889   
3                      1740                 10