In [1]:
# EP, NG, AP

In [2]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('customer_supermarket.csv', sep='\t', index_col=0)

In [4]:
print(df.head())

  BasketID      BasketDate  Sale  CustomerID CustomerCountry  ProdID  \
0   536365  01/12/10 08:26  2,55     17850.0  United Kingdom  85123A   
1   536365  01/12/10 08:26  3,39     17850.0  United Kingdom   71053   
2   536365  01/12/10 08:26  2,75     17850.0  United Kingdom  84406B   
3   536365  01/12/10 08:26  3,39     17850.0  United Kingdom  84029G   
4   536365  01/12/10 08:26  3,39     17850.0  United Kingdom  84029E   

                             ProdDescr  Qta  
0   WHITE HANGING HEART T-LIGHT HOLDER    6  
1                  WHITE METAL LANTERN    6  
2       CREAM CUPID HEARTS COAT HANGER    8  
3  KNITTED UNION FLAG HOT WATER BOTTLE    6  
4       RED WOOLLY HOTTIE WHITE HEART.    6  


In [5]:
df.dtypes

BasketID            object
BasketDate          object
Sale                object
CustomerID         float64
CustomerCountry     object
ProdID              object
ProdDescr           object
Qta                  int64
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471910 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         471910 non-null  object 
 1   BasketDate       471910 non-null  object 
 2   Sale             471910 non-null  object 
 3   CustomerID       406830 non-null  float64
 4   CustomerCountry  471910 non-null  object 
 5   ProdID           471910 non-null  object 
 6   ProdDescr        471157 non-null  object 
 7   Qta              471910 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 21.6+ MB


In [7]:
df.describe()

Unnamed: 0,CustomerID,Qta
count,406830.0,471910.0
mean,15287.68416,10.716533
std,1713.603074,231.355136
min,12346.0,-80995.0
25%,13953.0,1.0
50%,15152.0,4.0
75%,16791.0,12.0
max,18287.0,80995.0


In [8]:
print(df['Sale'])
print("-------------")
df['Sale'].replace(to_replace=r'(\d+),(\d*)', value=r'\1.\2', regex=True, inplace=True)

0         2,55
1         3,39
2         2,75
3         3,39
4         3,39
          ... 
541905     2,1
541906    4,15
541907    4,15
541908    4,95
541909      18
Name: Sale, Length: 471910, dtype: object
-------------


In [9]:
df['Sale'] = df['Sale'].astype(float)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471910 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         471910 non-null  object 
 1   BasketDate       471910 non-null  object 
 2   Sale             471910 non-null  float64
 3   CustomerID       406830 non-null  float64
 4   CustomerCountry  471910 non-null  object 
 5   ProdID           471910 non-null  object 
 6   ProdDescr        471157 non-null  object 
 7   Qta              471910 non-null  int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 23.4+ MB


In [11]:
df.isnull().any()

BasketID           False
BasketDate         False
Sale               False
CustomerID          True
CustomerCountry    False
ProdID             False
ProdDescr           True
Qta                False
dtype: bool

In [12]:
print("Quantity values")
print("TOT : ", len(df['Qta']))
print(" < 0: ", len(df[df['Qta'] < 0]))
print(" > 100: ", len(df[df['Qta'] > 100]))
print(" > 1000: ", len(df[df['Qta'] > 1000]))

Quantity values
TOT :  471910
 < 0:  9752
 > 100:  4814
 > 1000:  110


In [13]:
print("Sale values")
print("TOT : ", len(df['Sale']))
print(" < 0: ", len(df[df['Sale'] < 0]))
print(" > 100: ", len(df[df['Sale'] > 100]))
print(" > 1000: ", len(df[df['Sale'] > 1000]))

Sale values
TOT :  471910
 < 0:  2
 > 100:  634
 > 1000:  84


In [14]:
df['CustomerCountry'].value_counts()

United Kingdom          426261
Germany                   9495
France                    8525
EIRE                      7824
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               1932
Portugal                  1495
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Unspecified                340
USA                        291
Israel                     269
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Hong Kong                  138
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon 

In [15]:
len(df[df['Qta'] < 0])

9752

In [16]:
len(df['BasketID'].unique())

24627

In [17]:
# -1 becuase there is the empty CustomerID
len(df['CustomerID'].unique()) - 1

4372

In [18]:
len(df['ProdID'].unique())

3953

In [19]:
bad_rows_indeces = []
tot = len(df)
for i in range(tot):
    print(f"{i}/{tot}")
    row = df.iloc[i]
    if pd.isnull(row['CustomerID']):
        if not pd.isnull(row['BasketID']):
            basketID = row['BasketID']
            for j in range(min(0, i-10), min(i+10, len(df))):
                if df.iloc[j]['BasketID'] == basketID:
                    row['CustomerID'] = df.iloc[j]['CustomerID']
                    break
        else:
            print(f"\nRow {i} is a bad row")
            bad_rows_indeces.append(i)
print("For the following rows it wasn't possible to find the CustomerID: ", bad_rows_indeces)

0/471910
1/471910
2/471910
3/471910
4/471910
5/471910
6/471910
7/471910
8/471910
9/471910
10/471910
11/471910
12/471910
13/471910
14/471910
15/471910
16/471910
17/471910
18/471910
19/471910
20/471910
21/471910
22/471910
23/471910
24/471910
25/471910
26/471910
27/471910
28/471910
29/471910
30/471910
31/471910
32/471910
33/471910
34/471910
35/471910
36/471910
37/471910
38/471910
39/471910
40/471910
41/471910
42/471910
43/471910
44/471910
45/471910
46/471910
47/471910
48/471910
49/471910
50/471910
51/471910
52/471910
53/471910
54/471910
55/471910
56/471910
57/471910
58/471910
59/471910
60/471910
61/471910
62/471910
63/471910
64/471910
65/471910
66/471910
67/471910
68/471910
69/471910
70/471910
71/471910
72/471910
73/471910
74/471910
75/471910
76/471910
77/471910
78/471910
79/471910
80/471910
81/471910
82/471910
83/471910
84/471910
85/471910
86/471910
87/471910
88/471910
89/471910
90/471910
91/471910
92/471910
93/471910
94/471910
95/471910
96/471910
97/471910
98/471910
99/471910
100/471910

795/471910
796/471910
797/471910
798/471910
799/471910
800/471910
801/471910
802/471910
803/471910
804/471910
805/471910
806/471910
807/471910
808/471910
809/471910
810/471910
811/471910
812/471910
813/471910
814/471910
815/471910
816/471910
817/471910
818/471910
819/471910
820/471910
821/471910
822/471910
823/471910
824/471910
825/471910
826/471910
827/471910
828/471910
829/471910
830/471910
831/471910
832/471910
833/471910
834/471910
835/471910
836/471910
837/471910
838/471910
839/471910
840/471910
841/471910
842/471910
843/471910
844/471910
845/471910
846/471910
847/471910
848/471910
849/471910
850/471910
851/471910
852/471910
853/471910
854/471910
855/471910
856/471910
857/471910
858/471910
859/471910
860/471910
861/471910
862/471910
863/471910
864/471910
865/471910
866/471910
867/471910
868/471910
869/471910
870/471910
871/471910
872/471910
873/471910
874/471910
875/471910
876/471910
877/471910
878/471910
879/471910
880/471910
881/471910
882/471910
883/471910
884/471910
885/471910

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['CustomerID'] = df.iloc[j]['CustomerID']


1443/471910
1444/471910
1445/471910
1446/471910
1447/471910
1448/471910
1449/471910
1450/471910
1451/471910
1452/471910
1453/471910
1454/471910
1455/471910
1456/471910
1457/471910
1458/471910
1459/471910
1460/471910
1461/471910
1462/471910
1463/471910
1464/471910
1465/471910
1466/471910
1467/471910
1468/471910
1469/471910
1470/471910
1471/471910
1472/471910
1473/471910
1474/471910
1475/471910
1476/471910
1477/471910
1478/471910
1479/471910
1480/471910
1481/471910
1482/471910
1483/471910
1484/471910
1485/471910
1486/471910
1487/471910
1488/471910
1489/471910
1490/471910
1491/471910
1492/471910
1493/471910
1494/471910
1495/471910
1496/471910
1497/471910
1498/471910
1499/471910
1500/471910
1501/471910
1502/471910
1503/471910
1504/471910
1505/471910
1506/471910
1507/471910
1508/471910
1509/471910
1510/471910
1511/471910
1512/471910
1513/471910
1514/471910
1515/471910
1516/471910
1517/471910
1518/471910
1519/471910
1520/471910
1521/471910
1522/471910
1523/471910
1524/471910
1525/471910
1526

2198/471910
2199/471910
2200/471910
2201/471910
2202/471910
2203/471910
2204/471910
2205/471910
2206/471910
2207/471910
2208/471910
2209/471910
2210/471910
2211/471910
2212/471910
2213/471910
2214/471910
2215/471910
2216/471910
2217/471910
2218/471910
2219/471910
2220/471910
2221/471910
2222/471910
2223/471910
2224/471910
2225/471910
2226/471910
2227/471910
2228/471910
2229/471910
2230/471910
2231/471910
2232/471910
2233/471910
2234/471910
2235/471910
2236/471910
2237/471910
2238/471910
2239/471910
2240/471910
2241/471910
2242/471910
2243/471910
2244/471910
2245/471910
2246/471910
2247/471910
2248/471910
2249/471910
2250/471910
2251/471910
2252/471910
2253/471910
2254/471910
2255/471910
2256/471910
2257/471910
2258/471910
2259/471910
2260/471910
2261/471910
2262/471910
2263/471910
2264/471910
2265/471910
2266/471910
2267/471910
2268/471910
2269/471910
2270/471910
2271/471910
2272/471910
2273/471910
2274/471910
2275/471910
2276/471910
2277/471910
2278/471910
2279/471910
2280/471910
2281

KeyboardInterrupt: 