In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [36]:
# Load the tab-separated dataset: the first column becomes the index and
# "," is interpreted as the decimal mark when parsing numbers.
df = pd.read_csv(
    "dataset/DQ-dataset.csv",
    index_col=0,
    sep="\t",
    decimal=",",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456644 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         456644 non-null  int64  
 1   BasketDate       456644 non-null  object 
 2   Sale             456644 non-null  float64
 3   CustomerID       456644 non-null  object 
 4   CustomerCountry  456644 non-null  object 
 5   ProdID           456644 non-null  object 
 6   ProdDescr        456644 non-null  object 
 7   Qta              456644 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 31.4+ MB


# Working on month

In [37]:
# Parse the basket timestamps once, then derive the per-row helper columns.
basket_dates = pd.DatetimeIndex(df["BasketDate"])

# Calendar month number only (no year).
# NOTE(review): if the data spans more than one year, rows from the same month
# of different years end up in the same bucket -- confirm this is intended.
month = basket_dates.month
df["Month"] = month

# Line total: unit price multiplied by the quantity on that row.
df["Total_Spent"] = df["Sale"] * df["Qta"]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456644 entries, 0 to 541908
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         456644 non-null  int64  
 1   BasketDate       456644 non-null  object 
 2   Sale             456644 non-null  float64
 3   CustomerID       456644 non-null  object 
 4   CustomerCountry  456644 non-null  object 
 5   ProdID           456644 non-null  object 
 6   ProdDescr        456644 non-null  object 
 7   Qta              456644 non-null  int64  
 8   Month            456644 non-null  int64  
 9   Total_Spent      456644 non-null  float64
dtypes: float64(2), int64(3), object(5)
memory usage: 38.3+ MB


## Average Qta per month

In [38]:
# Mean of Qta over the rows of every (CustomerID, Month) group.
mean_qta_by_month = df.groupby(by=["CustomerID", "Month"])["Qta"].mean()
print(mean_qta_by_month)

CustomerID  Month
10023N      8        15.760000
10202N      7         1.000000
10353N      4         1.708333
10370N      12        2.433099
10374N      10        1.959184
                       ...    
99606N      10        6.833333
99733N      8         1.000000
9983N       4         8.000000
99958N      1         1.750000
99983N      9         2.072727
Name: Qta, Length: 14003, dtype: float64


## Average Total_Spent per month

In [39]:
# Mean of Total_Spent over the rows of every (CustomerID, Month) group.
mean_spent_by_month = df.groupby(by=["CustomerID", "Month"])["Total_Spent"].mean()
print(mean_spent_by_month)

CustomerID  Month
10023N      8        62.100000
10202N      7        12.750000
10353N      4         7.724583
10370N      12        9.789754
10374N      10        6.314490
                       ...    
99606N      10       37.026667
99733N      8         3.106667
9983N       4        20.000000
99958N      1         9.345417
99983N      9         5.998273
Name: Total_Spent, Length: 14003, dtype: float64


### Test

In [40]:
# Spot check for a single customer: the same monthly means, restricted to the
# rows whose CustomerID equals the string "16875.0".
print("CustomerID == 16875")
sample_by_month = df[df["CustomerID"] == "16875.0"].groupby("Month")
print(sample_by_month["Qta"].mean())
print(sample_by_month["Total_Spent"].mean())

CustomerID == 16875
Month
1      6.363636
2     10.458333
7      7.461538
10    10.962264
Name: Qta, dtype: float64
Month
1     16.763636
2     18.178750
7     25.623846
10    18.062830
Name: Total_Spent, dtype: float64


In [41]:
# For every customer, compute the average price per unit in each month:
# sum(Total_Spent) / sum(Qta) over that customer's rows of the month.
#
# The frame is partitioned by customer once via groupby instead of re-filtering
# the whole DataFrame with a boolean mask (twice) for every customer, which
# scaled as rows x customers. sort=False keeps customers in order of first
# appearance, i.e. the same order as df.CustomerID.unique().
# NOTE(review): groupby skips rows whose CustomerID is missing; the loaded data
# shows no nulls in that column, so the resulting list is unchanged.
result = []
for _, purchases in df.groupby("CustomerID", sort=False):
    # Both sums are taken over the same Month groups, so the division aligns
    # month by month and yields one unnamed Series indexed by Month.
    monthly_totals = purchases.groupby("Month")[["Total_Spent", "Qta"]].sum()
    result.append(monthly_totals["Total_Spent"] / monthly_totals["Qta"])
print(result)

onth
10    1.188772
dtype: float64, Month
10    1.859771
dtype: float64, Month
10    2.524286
11    2.410714
12    2.513250
dtype: float64, Month
10    1.01067
dtype: float64, Month
10    1.563151
dtype: float64, Month
10    2.927647
dtype: float64, Month
10    1.193005
11    1.926438
dtype: float64, Month
10    1.704293
11    1.689665
12    2.228681
dtype: float64, Month
10    1.517297
11    1.536552
dtype: float64, Month
10    0.650013
dtype: float64, Month
10    1.968199
dtype: float64, Month
10    2.30
11    1.95
dtype: float64, Month
10    1.978571
dtype: float64, Month
10    0.936760
12    1.474054
dtype: float64, Month
10    2.096984
dtype: float64, Month
10    2.444091
dtype: float64, Month
10    1.785937
11    1.880000
dtype: float64, Month
10    2.337966
dtype: float64, Month
10    1.123187
dtype: float64, Month
10    1.918969
dtype: float64, Month
10    2.974878
dtype: float64, Month
10    3.457357
dtype: float64, Month
10    1.2555
dtype: float64, Month
10    2.475
dtype: f

# Working on week

In [42]:
# ISO week number of every basket timestamp.
# DatetimeIndex.week is deprecated since pandas 1.1 and removed in pandas 2.0;
# isocalendar().week is the supported replacement. Going through the Series
# .dt accessor keeps the frame's own index, so the assignment below aligns
# row by row. The cast keeps the column as plain int64 (isocalendar returns
# the nullable UInt32 dtype).
# NOTE(review): week numbers carry no year; if the data spans more than one
# year, equal week numbers from different years are pooled -- confirm intended.
week = pd.to_datetime(df["BasketDate"]).dt.isocalendar().week.astype("int64")
df["Week"] = week
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta,Month,Total_Spent,Week
0,536365,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12,15.3,48
1,536365,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,71053,WHITE METAL LANTERN,6,12,20.34,48
2,536365,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8,12,22.0,48
3,536365,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12,20.34,48
4,536365,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12,20.34,48


## Average Qta per week

In [43]:
# Mean of Qta over the rows of every (CustomerID, Week) group.
mean_qta_by_week = df.groupby(by=["CustomerID", "Week"])["Qta"].mean()
print(mean_qta_by_week)

CustomerID  Week
10023N      34      15.760000
10202N      29       1.000000
10353N      16       1.708333
10370N      49       2.433099
10374N      41       1.959184
                      ...    
99606N      43       6.833333
99733N      33       1.000000
9983N       13       8.000000
99958N      1        1.750000
99983N      39       2.072727
Name: Qta, Length: 16869, dtype: float64


## Average Total_Spent per week

In [44]:
# Mean of Total_Spent over the rows of every (CustomerID, Week) group.
mean_spent_by_week = df.groupby(by=["CustomerID", "Week"])["Total_Spent"].mean()
print(mean_spent_by_week)

CustomerID  Week
10023N      34      62.100000
10202N      29      12.750000
10353N      16       7.724583
10370N      49       9.789754
10374N      41       6.314490
                      ...    
99606N      43      37.026667
99733N      33       3.106667
9983N       13      20.000000
99958N      1        9.345417
99983N      39       5.998273
Name: Total_Spent, Length: 16869, dtype: float64


### Final Result

In [45]:
# Spot check for a single customer: the same weekly means, restricted to the
# rows whose CustomerID equals the string "16875.0".
print("CustomerID == 16875")
sample_by_week = df[df["CustomerID"] == "16875.0"].groupby("Week")
print(sample_by_week["Qta"].mean())
print(sample_by_week["Total_Spent"].mean())

CustomerID == 16875
Week
1      6.363636
7     10.458333
29     7.461538
40    10.962264
Name: Qta, dtype: float64
Week
1     16.763636
7     18.178750
29    25.623846
40    18.062830
Name: Total_Spent, dtype: float64


In [46]:
# For every customer, compute the average price per unit in each week:
# sum(Total_Spent) / sum(Qta) over that customer's rows of the week.
#
# The frame is partitioned by customer once via groupby instead of re-filtering
# the whole DataFrame with a boolean mask (twice) for every customer, which
# scaled as rows x customers. sort=False keeps customers in order of first
# appearance, i.e. the same order as df.CustomerID.unique().
# NOTE(review): groupby skips rows whose CustomerID is missing; the loaded data
# shows no nulls in that column, so the resulting list is unchanged.
result = []
for _, purchases in df.groupby("CustomerID", sort=False):
    # Both sums are taken over the same Week groups, so the division aligns
    # week by week and yields one unnamed Series indexed by Week.
    weekly_totals = purchases.groupby("Week")[["Total_Spent", "Qta"]].sum()
    result.append(weekly_totals["Total_Spent"] / weekly_totals["Qta"])
print(result)

eek
43    2.524286
46    1.943333
47    2.632105
48    2.656552
49    2.135455
dtype: float64, Week
43    1.01067
dtype: float64, Week
43    1.563151
dtype: float64, Week
43    2.927647
dtype: float64, Week
43    1.193005
46    1.493429
47    2.325263
dtype: float64, Week
43    1.704293
47    1.689665
49    2.228681
dtype: float64, Week
43    1.517297
47    1.536552
dtype: float64, Week
43    0.650013
dtype: float64, Week
43    1.968199
dtype: float64, Week
43    2.30
46    1.95
dtype: float64, Week
43    1.978571
dtype: float64, Week
43    0.936760
48    1.474054
dtype: float64, Week
43    2.096984
dtype: float64, Week
43    2.444091
dtype: float64, Week
43    1.785937
48    1.880000
dtype: float64, Week
43    2.337966
dtype: float64, Week
44    1.123187
dtype: float64, Week
44    1.918969
dtype: float64, Week
44    2.974878
dtype: float64, Week
44    3.457357
dtype: float64, Week
44    1.2555
dtype: float64, Week
44    2.475
dtype: float64, Week
44    3.739059
dtype: float64, Week
44