In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style ='whitegrid')
pd.set_option('display.max_columns',None)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import warnings,datetime,calendar
warnings.filterwarnings('ignore')

In [53]:
# Load the raw transactions from OnlineRetail.csv (path is relative to the working directory).
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences in the text;
# confirm the file's real encoding (possibly ISO-8859-1) before relying on it.
data = pd.read_csv('OnlineRetail.csv', encoding= 'unicode_escape')

# 2) Reading Data

# 3) Data Cleaning

In [54]:
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [56]:
data[data['Quantity']<=0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,01-12-2010 09:41,27.50,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,01-12-2010 09:49,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,01-12-2010 10:24,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,01-12-2010 10:24,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,01-12-2010 10:24,0.29,17548.0,United Kingdom
...,...,...,...,...,...,...,...,...
540449,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,09-12-2011 09:57,0.83,14397.0,United Kingdom
541541,C581499,M,Manual,-1,09-12-2011 10:28,224.69,15498.0,United Kingdom
541715,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,09-12-2011 11:57,10.95,15311.0,United Kingdom
541716,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,09-12-2011 11:58,1.25,17315.0,United Kingdom


In [57]:
data[data['UnitPrice']<=0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,01-12-2010 11:52,0.0,,United Kingdom
1970,536545,21134,,1,01-12-2010 14:32,0.0,,United Kingdom
1971,536546,22145,,1,01-12-2010 14:33,0.0,,United Kingdom
1972,536547,37509,,1,01-12-2010 14:33,0.0,,United Kingdom
1987,536549,85226A,,1,01-12-2010 14:34,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
536981,581234,72817,,27,08-12-2011 10:33,0.0,,United Kingdom
538504,581406,46000M,POLYESTER FILLER PAD 45x45cm,240,08-12-2011 13:58,0.0,,United Kingdom
538505,581406,46000S,POLYESTER FILLER PAD 40x40cm,300,08-12-2011 13:58,0.0,,United Kingdom
538554,581408,85175,,20,08-12-2011 14:06,0.0,,United Kingdom


In [58]:
# Row labels that cannot be real sales: non-positive quantity, then non-positive unit price.
# A row failing both checks appears twice, so the pair shown is (total hits, distinct rows).
NonPositiveQuantity = data.loc[data['Quantity'] <= 0].index
NonPositivePrice = data.loc[data['UnitPrice'] <= 0].index
Wrongindices = [*NonPositiveQuantity, *NonPositivePrice]
len(Wrongindices), len(set(Wrongindices))


(13141, 11805)

In [59]:
Wrongindices = list(set(Wrongindices))

In [60]:
len(Wrongindices)

11805

In [61]:
# Discard the flagged rows and renumber the remaining ones from 0.
data = data.drop(index=Wrongindices).reset_index(drop=True)
data.shape

(530104, 8)

# Nulls

In [62]:
data.isna().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     132220
Country             0
dtype: int64

In [63]:
'No ID' in data['CustomerID'].tolist()

False

In [64]:
# Label missing customer IDs with the 'No ID' sentinel.
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# column selection: under pandas copy-on-write that in-place call updates a
# temporary object and leaves `data` unchanged.
data['CustomerID'] = data['CustomerID'].fillna('No ID')

In [65]:
# Member is 1 for rows carrying a real customer ID and 0 for the 'No ID' placeholder.
HasCustomerID = data['CustomerID'] != 'No ID'
data['Member'] = HasCustomerID.astype('int64')
data['Member'].value_counts()

1    397884
0    132220
Name: Member, dtype: int64

In [66]:
data.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
Member         0
dtype: int64

In [67]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
c = int(data.duplicated().sum())
print(c)

5226


In [68]:
# Keep the first copy of every duplicated row and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)
data.shape

(524878, 9)

# 4) Data Processing

In [69]:
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'Member'],
      dtype='object')

In [70]:
def ShowDetails(frame=None, samples=3) : 
    """Print a quick per-column profile of a DataFrame.

    For every column this prints the column name, the number of distinct
    values (``len(Series.unique())``) and ``samples`` values read from
    randomly chosen row positions.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to describe. Defaults to the notebook-level ``data`` frame.
    samples : int, default 3
        How many random values to print per column.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    if frame is None :
        frame = data
    row_count = frame.shape[0]
    for col in frame.columns : 
        print(f' for feature {col}')
        print(f'Number of Unique Values {len(frame[col].unique())}')
        # An empty frame has nothing to sample (randint(0) would raise).
        if row_count :
            for _ in range(samples) :
                # .iloc reads by position, so sampling also works when the index is not 0..n-1.
                print(f' Random Value {frame[col].iloc[np.random.randint(row_count)]}')
        print('-------------------------------------------------------')

In [71]:
ShowDetails()

 for feature InvoiceNo
Number of Unique Values 19960
 Random Value 552285
 Random Value 579558
 Random Value 561102
-------------------------------------------------------
 for feature StockCode
Number of Unique Values 3922
 Random Value 23174
 Random Value 85099C
 Random Value 22628
-------------------------------------------------------
 for feature Description
Number of Unique Values 4026
 Random Value SET/3 RED GINGHAM ROSE STORAGE BOX
 Random Value PARTY INVITES JAZZ HEARTS
 Random Value CHILLI LIGHTS
-------------------------------------------------------
 for feature Quantity
Number of Unique Values 375
 Random Value 3
 Random Value 10
 Random Value 12
-------------------------------------------------------
 for feature InvoiceDate
Number of Unique Values 18499
 Random Value 04-04-2011 11:28
 Random Value 05-01-2011 10:35
 Random Value 28-03-2011 09:16
-------------------------------------------------------
 for feature UnitPrice
Number of Unique Values 1291
 Random Value 0.42
 

In [72]:
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'],format = '%d-%m-%Y %H:%M')
data['InvoiceDate']

0        2010-12-01 08:26:00
1        2010-12-01 08:26:00
2        2010-12-01 08:26:00
3        2010-12-01 08:26:00
4        2010-12-01 08:26:00
                 ...        
524873   2011-12-09 12:50:00
524874   2011-12-09 12:50:00
524875   2011-12-09 12:50:00
524876   2011-12-09 12:50:00
524877   2011-12-09 12:50:00
Name: InvoiceDate, Length: 524878, dtype: datetime64[ns]

In [73]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524878 entries, 0 to 524877
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    524878 non-null  object        
 1   StockCode    524878 non-null  object        
 2   Description  524878 non-null  object        
 3   Quantity     524878 non-null  int64         
 4   InvoiceDate  524878 non-null  datetime64[ns]
 5   UnitPrice    524878 non-null  float64       
 6   CustomerID   524878 non-null  object        
 7   Country      524878 non-null  object        
 8   Member       524878 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 36.0+ MB


In [74]:
data['Quantity'].min(),data['Quantity'].max()

(1, 80995)

In [75]:
data['InvoiceDate'].min(),data['InvoiceDate'].max()

(Timestamp('2010-12-01 08:26:00'), Timestamp('2011-12-09 12:50:00'))

In [76]:
# Split the invoice timestamp into calendar parts, then tag each row as weekday or weekend.
InvoiceParts = data['InvoiceDate'].dt
for PartColumn in ('Year', 'Month', 'Day', 'Hour') :
    data[PartColumn] = getattr(InvoiceParts, PartColumn.lower())
data['WeekDay'] = InvoiceParts.day_name()
IsWeekend = data['WeekDay'].str.lower().isin(['saturday','sunday'])
data['WeekDayCase'] = IsWeekend.map({True: 'WeekEnd', False: 'WeekDay'})
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524873,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,1,2011,12,9,12,Friday,WeekDay
524874,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,1,2011,12,9,12,Friday,WeekDay
524875,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay
524876,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay


In [77]:
data['WeekDayCase'].value_counts()

WeekDay    462539
WeekEnd     62339
Name: WeekDayCase, dtype: int64

In [78]:
data['WeekDay'].value_counts()

Thursday     100213
Tuesday       98726
Monday        92466
Wednesday     91467
Friday        79667
Sunday        62339
Name: WeekDay, dtype: int64

In [79]:
def DayPart(H) :
    """Name the part of the day that hour ``H`` falls in.

    Below 6 -> 'Early Morning', [6, 12) -> 'Morning', [12, 15) -> 'Noon',
    [15, 19) -> 'Evening', anything else -> 'Night'.
    """
    # (exclusive upper hour, label) pairs, checked in ascending order.
    Bands = ((6, 'Early Morning'), (12, 'Morning'), (15, 'Noon'), (19, 'Evening'))
    for UpperHour, Label in Bands :
        if H < UpperHour :
            return Label
    return 'Night'

In [80]:
# Label every row with the part of the day its invoice hour falls in.
data['DayPart'] = [DayPart(int(HourValue)) for HourValue in data['Hour']]

In [81]:
data['DayPart'].value_counts()

Noon       211036
Evening    163759
Morning    145878
Night        4205
Name: DayPart, dtype: int64

In [82]:
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase,DayPart
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524873,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon
524874,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon
524875,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon
524876,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon


In [83]:
# Length of each description in characters and in whitespace-separated words.
DescriptionText = data['Description'].str
data['Desc Char Length'] = DescriptionText.len()
data['Desc Word Length'] = DescriptionText.split().str.len()
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase,DayPart,Desc Char Length,Desc Word Length
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,34,5
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,19,3
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,35,6
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5


In [84]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 524878 entries, 0 to 524877
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   InvoiceNo         524878 non-null  object        
 1   StockCode         524878 non-null  object        
 2   Description       524878 non-null  object        
 3   Quantity          524878 non-null  int64         
 4   InvoiceDate       524878 non-null  datetime64[ns]
 5   UnitPrice         524878 non-null  float64       
 6   CustomerID        524878 non-null  object        
 7   Country           524878 non-null  object        
 8   Member            524878 non-null  int64         
 9   Year              524878 non-null  int64         
 10  Month             524878 non-null  int64         
 11  Day               524878 non-null  int64         
 12  Hour              524878 non-null  int64         
 13  WeekDay           524878 non-null  object        
 14  Week

In [85]:
data[data['Desc Char Length']==35]['Description'].unique()

array(['KNITTED UNION FLAG HOT WATER BOTTLE',
       'ROUND SNACK BOXES SET OF4 WOODLAND ',
       'CHARLIE & LOLA WASTEPAPER BIN FLORA',
       'CLEAR DRAWER KNOB ACRYLIC EDWARDIAN',
       'COLOUR GLASS T-LIGHT HOLDER HANGING',
       'CLASSIC METAL BIRDCAGE PLANT HOLDER',
       'FRIDGE MAGNETS LES ENFANTS ASSORTED',
       'SET/20 RED RETROSPOT PAPER NAPKINS ',
       'DISCO BALL ROTATOR BATTERY OPERATED',
       'BLUE NEW BAROQUE CANDLESTICK CANDLE',
       'GREEN DRAWER KNOB ACRYLIC EDWARDIAN',
       'SET/5 RED RETROSPOT LID GLASS BOWLS',
       'ROTATING SILVER ANGELS T-LIGHT HLDR',
       'SQUARECUSHION COVER PINK UNION FLAG',
       'SET 12 LAVENDER  BOTANICAL T-LIGHTS',
       'MAGIC DRAWING SLATE CIRCUS PARADE  ',
       'PAINTED METAL STAR WITH HOLLY BELLS',
       'PAINTED METAL HEART WITH HOLLY BELL',
       'FOLDING UMBRELLA PINKWHITE POLKADOT',
       'FOLDING UMBRELLA WHITE/RED POLKADOT',
       'FOLDING UMBRELLA RED/WHITE POLKADOT',
       'SET OF 20 VINTAGE CHRISTMA

In [86]:
data[data['Desc Word Length']==1]['Description'].unique()

array(['POSTAGE', 'CARRIAGE', 'Manual', 'SOMBRERO ', 'SAMPLES'],
      dtype=object)

In [87]:
# One lower-cased, single-space-separated string holding every word of every description.
DescriptionTokens = []
for DescriptionText in data['Description'] :
    DescriptionTokens.extend(DescriptionText.lower().split())
AllDescription = ' '.join(DescriptionTokens)
len(AllDescription)

14392144

In [88]:
# Count how often each word occurs; keys stay in first-seen order.
MostRepeatedWords = {}

for Word in AllDescription.split() : 
    MostRepeatedWords[Word] = MostRepeatedWords.get(Word, 0) + 1
MostRepeatedWords

{'white': 20198,
 'hanging': 16452,
 'heart': 37841,
 't-light': 14063,
 'holder': 15710,
 'metal': 20396,
 'lantern': 2270,
 'cream': 7760,
 'cupid': 333,
 'hearts': 4910,
 'coat': 1570,
 'hanger': 2145,
 'knitted': 786,
 'union': 5344,
 'flag': 1745,
 'hot': 10293,
 'water': 9771,
 'bottle': 11463,
 'red': 41742,
 'woolly': 438,
 'hottie': 438,
 'heart.': 438,
 'set': 53069,
 '7': 623,
 'babushka': 3155,
 'nesting': 498,
 'boxes': 3145,
 'glass': 11527,
 'star': 5948,
 'frosted': 154,
 'hand': 4796,
 'warmer': 4479,
 'jack': 3723,
 'polka': 194,
 'dot': 107,
 'assorted': 7359,
 'colour': 6040,
 'bird': 5743,
 'ornament': 1476,
 "poppy's": 1422,
 'playhouse': 1422,
 'bedroom': 425,
 'kitchen': 3455,
 'feltcraft': 7041,
 'princess': 1164,
 'charlotte': 6213,
 'doll': 2183,
 'ivory': 7943,
 'mug': 7917,
 'cosy': 1276,
 'box': 23554,
 'of': 52015,
 '6': 12865,
 'teaspoons': 210,
 'vintage': 32964,
 'jigsaw': 1623,
 'blocks': 502,
 'alphabet': 3830,
 'home': 9275,
 'building': 1686,
 'blo

In [89]:
# Re-order the counts from most to least frequent (ties keep their previous relative order).
MostRepeatedWords = dict(sorted(MostRepeatedWords.items(), key=lambda Pair: Pair[1], reverse=True))
MostRepeatedWords

{'set': 53069,
 'of': 52015,
 'bag': 50823,
 'red': 41742,
 'heart': 37841,
 'retrospot': 34124,
 'vintage': 32964,
 'design': 29200,
 'pink': 28869,
 'christmas': 24587,
 'box': 23554,
 'jumbo': 20724,
 'cake': 20681,
 'metal': 20396,
 'white': 20198,
 'blue': 18804,
 'lunch': 18007,
 '3': 17893,
 'sign': 16677,
 'hanging': 16452,
 'holder': 15710,
 'pack': 15319,
 'paper': 14135,
 't-light': 14063,
 'small': 13945,
 'card': 13058,
 '6': 12865,
 'decoration': 12716,
 'wooden': 12435,
 'polkadot': 12228,
 'cases': 11721,
 'glass': 11527,
 'tea': 11519,
 'bottle': 11463,
 '12': 11191,
 'in': 10959,
 'and': 10606,
 'spaceboy': 10570,
 'hot': 10293,
 'water': 9771,
 'with': 9519,
 'large': 9428,
 'pantry': 9403,
 'home': 9275,
 'tin': 9191,
 'rose': 9163,
 'paisley': 9149,
 '4': 8925,
 'green': 8865,
 'ceramic': 8757,
 'regency': 8613,
 'doormat': 8255,
 'dolly': 8221,
 'mini': 7970,
 'ivory': 7943,
 'mug': 7917,
 'bunting': 7881,
 'love': 7793,
 'cream': 7760,
 'girl': 7601,
 'party': 74

In [90]:
ImportantWords = ['bag','red','heart','retrospot','vintage','design','pink','christmas','box','kitchen'] 

In [91]:
GuessedWords  = ['car','electric','kitchen','electronic']

# Report the count of each guessed word that actually occurs in the descriptions.
for Word in GuessedWords : 
    Count = MostRepeatedWords.get(Word)
    if Count is not None :
        print(f'for Word {Word} repeated {Count}')

for Word car repeated 184
for Word kitchen repeated 3455
for Word electronic repeated 2


In [92]:
# One 0/1 column per important word.
# The check is a plain substring test on the lower-cased description, not a
# whole-word match, so 'heart' also flags a description containing 'HEARTS'.
LoweredDescriptions = data['Description'].str.lower()
for Word in ImportantWords : 
    data[f'Repeated Word : {Word}'] = LoweredDescriptions.str.contains(Word, regex=False).astype('int64')

In [93]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase,DayPart,Desc Char Length,Desc Word Length,Repeated Word : bag,Repeated Word : red,Repeated Word : heart,Repeated Word : retrospot,Repeated Word : vintage,Repeated Word : design,Repeated Word : pink,Repeated Word : christmas,Repeated Word : box,Repeated Word : kitchen
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,34,5,0,0,1,0,0,0,0,0,0,0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,19,3,0,0,0,0,0,0,0,0,0,0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,0,1,0,0,0,0,0,0,0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,35,6,0,0,0,0,0,0,0,0,0,0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,1,1,0,0,0,0,0,0,0


In [94]:
data['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Israel', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [95]:
# Relabel the source dataset's 'Israel' entries as 'Palestine'; the continent and
# capital dictionaries further down are keyed on the new label.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: under pandas copy-on-write that in-place call modifies a
# temporary object and leaves `data` untouched.
data['Country'] = data['Country'].replace('Israel','Palestine')
data['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Palestine', 'Greece', 'Hong Kong',
       'Singapore', 'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [96]:
# Continent label -> list of country names exactly as they appear in data['Country'].
# The keys are the values later written into the 'Continent' column.
# NOTE(review): 'Americans' looks like a typo for 'Americas'; renaming it changes the
# stored labels, so confirm nothing else filters on the current spelling first.
# 'Other' collects the names not placed in one of the three named groups.
ContinentsDict = {'Europe':['United Kingdom','France','Netherlands','Germany','Norway'
                            ,'EIRE','Switzerland','Spain', 'Poland', 'Portugal','Italy'
                            ,'Belgium','Lithuania','Iceland','Channel Islands','Denmark'
                            ,'Cyprus','Sweden','Finland','Austria','Greece','Czech Republic'
                            ,'European Community','Malta'],
              'Asia':['Japan','Bahrain','Palestine','Hong Kong','Singapore','Lebanon',
                      'United Arab Emirates','Saudi Arabia'],
              'Americans':['Canada',  'Brazil', 'USA'],
              'Other':['Australia','Unspecified','RSA']}
ContinentsDict

{'Europe': ['United Kingdom',
  'France',
  'Netherlands',
  'Germany',
  'Norway',
  'EIRE',
  'Switzerland',
  'Spain',
  'Poland',
  'Portugal',
  'Italy',
  'Belgium',
  'Lithuania',
  'Iceland',
  'Channel Islands',
  'Denmark',
  'Cyprus',
  'Sweden',
  'Finland',
  'Austria',
  'Greece',
  'Czech Republic',
  'European Community',
  'Malta'],
 'Asia': ['Japan',
  'Bahrain',
  'Palestine',
  'Hong Kong',
  'Singapore',
  'Lebanon',
  'United Arab Emirates',
  'Saudi Arabia'],
 'Americans': ['Canada', 'Brazil', 'USA'],
 'Other': ['Australia', 'Unspecified', 'RSA']}

In [97]:
def GetContinent(Country, Continents=None) : 
    """Return the continent label whose country list contains ``Country``.

    Parameters
    ----------
    Country : str
        Country name to look up.
    Continents : dict, optional
        Mapping of continent label -> list of country names. Defaults to the
        notebook-level ``ContinentsDict``.

    Returns
    -------
    The first label (in dictionary order) whose list contains ``Country``,
    or ``None`` when no list contains it.
    """
    # Reading a module-level name needs no `global` declaration.
    if Continents is None :
        Continents = ContinentsDict
    for Key, Countries in Continents.items() : 
        if Country in Countries : 
            return Key
    return None
    
# Look up the continent label for every row's country.
data['Continent'] = data['Country'].map(GetContinent)

In [98]:
# Sanity check: every country should show exactly the continent it is listed under.
for Continent, Countries in ContinentsDict.items() : 
    print(f'for Continent {Continent} , Countries are : {Countries}')
    for Country in Countries : 
        RowsForCountry = data['Country'] == Country
        print(data.loc[RowsForCountry, 'Continent'].unique())
    print('=========================================')

for Continent Europe , Countries are : ['United Kingdom', 'France', 'Netherlands', 'Germany', 'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal', 'Italy', 'Belgium', 'Lithuania', 'Iceland', 'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland', 'Austria', 'Greece', 'Czech Republic', 'European Community', 'Malta']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
['Europe']
for Continent Asia , Countries are : ['Japan', 'Bahrain', 'Palestine', 'Hong Kong', 'Singapore', 'Lebanon', 'United Arab Emirates', 'Saudi Arabia']
['Asia']
['Asia']
['Asia']
['Asia']
['Asia']
['Asia']
['Asia']
['Asia']
for Continent Americans , Countries are : ['Canada', 'Brazil', 'USA']
['Americans']
['Americans']
['Americans']
for Continent Other , Countries are : ['Australia', 'Unspecified', 'RSA

In [99]:
# pip install geopy


In [100]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="catuserbot")
def GeoLocate(country):
    """Geocode ``country`` (any free-text place query) with the module-level ``geolocator``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(np.nan, np.nan)`` when the
        lookup raises or yields no result. A two-element tuple is returned in
        every case so callers can always unpack ``lat, lon = GeoLocate(...)``.
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Best-effort lookup: a failed request means "no coordinates".
        # Exception (not a bare except) so a keyboard interrupt still stops the cell.
        return (np.nan, np.nan)
    if loc is None:
        return (np.nan, np.nan)
    return (loc.latitude, loc.longitude)

In [101]:
data['Country'].unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Finland',
       'Austria', 'Bahrain', 'Palestine', 'Greece', 'Hong Kong',
       'Singapore', 'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

In [102]:
#  pip install countryinfo

In [103]:
from countryinfo import CountryInfo

In [104]:
# Capital city for every distinct country name in the data.
CapitalsDict = {}

for Country in data['Country'].unique() : 
    try : 
        CapitalsDict[Country] = CountryInfo(Country).capital()
    except Exception :
        # Names whose lookup fails are printed so they can be filled in by hand.
        # Exception (not a bare except) so a keyboard interrupt still stops the cell.
        print(Country)
CapitalsDict    

EIRE
Channel Islands
Palestine
Unspecified
European Community


{'United Kingdom': 'London',
 'France': 'Paris',
 'Australia': 'Canberra',
 'Netherlands': 'Amsterdam',
 'Germany': 'Berlin',
 'Norway': 'Oslo',
 'Switzerland': 'Bern',
 'Spain': 'Madrid',
 'Poland': 'Warsaw',
 'Portugal': 'Lisbon',
 'Italy': 'Rome',
 'Belgium': 'Brussels',
 'Lithuania': 'Vilnius',
 'Japan': 'Tokyo',
 'Iceland': 'Reykjavik',
 'Denmark': 'Copenhagen',
 'Cyprus': 'Nicosia',
 'Sweden': 'Stockholm',
 'Finland': 'Helsinki',
 'Austria': 'Vienna',
 'Bahrain': 'Manama',
 'Greece': 'Athens',
 'Hong Kong': 'City of Victoria',
 'Singapore': 'Singapore',
 'Lebanon': 'Beirut',
 'United Arab Emirates': 'Abu Dhabi',
 'Saudi Arabia': 'Riyadh',
 'Czech Republic': 'Prague',
 'Canada': 'Ottawa',
 'Brazil': 'Brasília',
 'USA': 'Washington D.C.',
 'Malta': 'Valletta',
 'RSA': 'Pretoria'}

In [105]:
# Hand-entered capitals for three of the names the automatic lookup could not supply.
# NOTE(review): Guernsey's capital is usually written "St Peter Port"; confirm that
# 'Peter Port' geocodes to the intended place.
CapitalsDict.update({
    'EIRE': 'Dublin',
    'Channel Islands': 'Peter Port',
    'Palestine': 'Quds',
})
CapitalsDict

{'United Kingdom': 'London',
 'France': 'Paris',
 'Australia': 'Canberra',
 'Netherlands': 'Amsterdam',
 'Germany': 'Berlin',
 'Norway': 'Oslo',
 'Switzerland': 'Bern',
 'Spain': 'Madrid',
 'Poland': 'Warsaw',
 'Portugal': 'Lisbon',
 'Italy': 'Rome',
 'Belgium': 'Brussels',
 'Lithuania': 'Vilnius',
 'Japan': 'Tokyo',
 'Iceland': 'Reykjavik',
 'Denmark': 'Copenhagen',
 'Cyprus': 'Nicosia',
 'Sweden': 'Stockholm',
 'Finland': 'Helsinki',
 'Austria': 'Vienna',
 'Bahrain': 'Manama',
 'Greece': 'Athens',
 'Hong Kong': 'City of Victoria',
 'Singapore': 'Singapore',
 'Lebanon': 'Beirut',
 'United Arab Emirates': 'Abu Dhabi',
 'Saudi Arabia': 'Riyadh',
 'Czech Republic': 'Prague',
 'Canada': 'Ottawa',
 'Brazil': 'Brasília',
 'USA': 'Washington D.C.',
 'Malta': 'Valletta',
 'RSA': 'Pretoria',
 'EIRE': 'Dublin',
 'Channel Islands': 'Peter Port',
 'Palestine': 'Quds'}

In [106]:
# Country name -> [latitude, longitude] of its capital city.
LocationDict = {}
for Country, Capital in CapitalsDict.items() : 
    Coordinates = GeoLocate(Capital)
    if isinstance(Coordinates, tuple) :
        Latitude, Longitude = Coordinates
    else :
        # GeoLocate may hand back a single NaN on failure; unpacking that would raise.
        Latitude = Longitude = np.nan
    LocationDict[Country] = [Latitude, Longitude]
LocationDict  

{'United Kingdom': [51.5073359, -0.12765],
 'France': [48.8534951, 2.3483915],
 'Australia': [-35.2975906, 149.1012676],
 'Netherlands': [52.3730796, 4.8924534],
 'Germany': [52.5170365, 13.3888599],
 'Norway': [59.9133301, 10.7389701],
 'Switzerland': [46.9484742, 7.4521749],
 'Spain': [40.4167047, -3.7035825],
 'Poland': [52.2319581, 21.0067249],
 'Portugal': [38.7077507, -9.1365919],
 'Italy': [41.8933203, 12.4829321],
 'Belgium': [50.8465573, 4.351697],
 'Lithuania': [54.6870458, 25.2829111],
 'Japan': [35.6812665, 139.757653],
 'Iceland': [64.145981, -21.9422367],
 'Denmark': [55.6867243, 12.5700724],
 'Cyprus': [35.1748976, 33.3638568],
 'Sweden': [59.3251172, 18.0710935],
 'Finland': [60.1674881, 24.9427473],
 'Austria': [48.2083537, 16.3725042],
 'Bahrain': [26.2235041, 50.5822436],
 'Greece': [37.9839412, 23.7283052],
 'Hong Kong': [22.27658545, 114.16930117418485],
 'Singapore': [1.357107, 103.8194992],
 'Lebanon': [33.88922645, 35.50255852895232],
 'United Arab Emirates': [2

In [107]:
len(data['Country'].unique()),len(list(LocationDict.keys()))

(38, 36)

In [108]:
[i for i in data['Country'].unique() if not i in LocationDict.keys()]

['Unspecified', 'European Community']

In [109]:
# Hand-entered coordinates for the two names that have no capital entry.
# [0, 0] is a placeholder for rows with no known country.
# NOTE(review): the 'European Community' pair is almost identical to the
# 'United Kingdom' entry; confirm that this is the intended location.
LocationDict.update({
    'Unspecified': [0,0],
    'European Community': [51.5073219, -0.1276474],
})

LocationDict

{'United Kingdom': [51.5073359, -0.12765],
 'France': [48.8534951, 2.3483915],
 'Australia': [-35.2975906, 149.1012676],
 'Netherlands': [52.3730796, 4.8924534],
 'Germany': [52.5170365, 13.3888599],
 'Norway': [59.9133301, 10.7389701],
 'Switzerland': [46.9484742, 7.4521749],
 'Spain': [40.4167047, -3.7035825],
 'Poland': [52.2319581, 21.0067249],
 'Portugal': [38.7077507, -9.1365919],
 'Italy': [41.8933203, 12.4829321],
 'Belgium': [50.8465573, 4.351697],
 'Lithuania': [54.6870458, 25.2829111],
 'Japan': [35.6812665, 139.757653],
 'Iceland': [64.145981, -21.9422367],
 'Denmark': [55.6867243, 12.5700724],
 'Cyprus': [35.1748976, 33.3638568],
 'Sweden': [59.3251172, 18.0710935],
 'Finland': [60.1674881, 24.9427473],
 'Austria': [48.2083537, 16.3725042],
 'Bahrain': [26.2235041, 50.5822436],
 'Greece': [37.9839412, 23.7283052],
 'Hong Kong': [22.27658545, 114.16930117418485],
 'Singapore': [1.357107, 103.8194992],
 'Lebanon': [33.88922645, 35.50255852895232],
 'United Arab Emirates': [2

In [110]:
# Attach the capital's latitude and longitude to every row via its country name.
for CoordinateColumn, Position in (('Latt', 0), ('Long', 1)) :
    data[CoordinateColumn] = data['Country'].map(
        lambda CountryName, Position=Position: LocationDict[CountryName][Position])
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase,DayPart,Desc Char Length,Desc Word Length,Repeated Word : bag,Repeated Word : red,Repeated Word : heart,Repeated Word : retrospot,Repeated Word : vintage,Repeated Word : design,Repeated Word : pink,Repeated Word : christmas,Repeated Word : box,Repeated Word : kitchen,Continent,Latt,Long
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,34,5,0,0,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,19,3,0,0,0,0,0,0,0,0,0,0,Europe,51.507336,-0.127650
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,0,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,35,6,0,0,0,0,0,0,0,0,0,0,Europe,51.507336,-0.127650
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,1,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524873,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,27,5,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391
524874,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,28,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391
524875,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,29,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391
524876,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,31,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391


In [111]:
# Line total = quantity x unit price, computed column-wise in one vectorised step
# instead of a Python-level function call per row.
data['TotalPrice'] = data['Quantity'] * data['UnitPrice']
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase,DayPart,Desc Char Length,Desc Word Length,Repeated Word : bag,Repeated Word : red,Repeated Word : heart,Repeated Word : retrospot,Repeated Word : vintage,Repeated Word : design,Repeated Word : pink,Repeated Word : christmas,Repeated Word : box,Repeated Word : kitchen,Continent,Latt,Long,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,34,5,0,0,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,15.30
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,19,3,0,0,0,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,0,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,22.00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,35,6,0,0,0,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,1,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,20.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524873,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,27,5,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,10.20
524874,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,28,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,12.60
524875,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,29,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,16.60
524876,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,31,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,16.60


In [114]:
def QuantityLevel(x):
    """Bucket an order quantity into a coarse size label.

    The value is first converted with ``int()``, then classified:
    below 10 -> 'Low', 10 up to 99 -> 'Medium', 100 and above -> 'Huge'.
    """
    units = int(x)
    # Check the largest bucket first so each test needs a single bound.
    if units >= 100:
        return 'Huge'
    if units >= 10:
        return 'Medium'
    return 'Low'
# Label every row with its quantity bucket, then show how many rows fall in each.
data['Quantity Level'] = data['Quantity'].apply(QuantityLevel)
data['Quantity Level'].value_counts()

Low       370741
Medium    147485
Huge        6652
Name: Quantity Level, dtype: int64

In [116]:
def UnitPriceLevel(x):
    """Bucket a unit price into a coarse price label.

    The value is first truncated with ``int()``, then classified:
    below 1 -> 'Cheap', 1 up to (but excluding) 10 -> 'Medium',
    10 and above -> 'Expensive'.
    """
    price = int(x)
    # Check the largest bucket first so each test needs a single bound.
    if price >= 10:
        return 'Expensive'
    if price >= 1:
        return 'Medium'
    return 'Cheap'
# Label every row with its unit-price bucket, then show how many rows fall in each.
data['UnitPrice Level'] = data['UnitPrice'].apply(UnitPriceLevel)
data['UnitPrice Level'].value_counts()

Medium       391399
Cheap        109584
Expensive     23895
Name: UnitPrice Level, dtype: int64

In [117]:
def TotalPriceLevel(x):
    """Bucket a line total into a coarse price label.

    The value is first truncated with ``int()``, then classified:
    below 5 -> 'Cheap', 5 up to (but excluding) 20 -> 'Medium',
    20 and above -> 'Expensive'.
    """
    total = int(x)
    return 'Cheap' if total < 5 else ('Medium' if total < 20 else 'Expensive')
# Label every row with its total-price bucket, then show how many rows fall in each.
data['TotalPriceLevel'] = data['TotalPrice'].apply(TotalPriceLevel)
data['TotalPriceLevel'].value_counts()

Medium       253145
Cheap        170809
Expensive    100924
Name: TotalPriceLevel, dtype: int64

In [118]:
# List the distinct month numbers present, in order of first appearance.
pd.unique(data['Month'])

array([12,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

In [119]:
# Season name -> list of month numbers belonging to it.
SeasonsDict = {'Winter': [12, 1, 2], 'Spring': [3, 4, 5], 'Summer': [6, 7, 8], 'Fall': [9, 10, 11]}

# Invert the lookup once (month number -> season name) so the column can be
# built with a single vectorised map instead of scanning every season per row.
MonthToSeason = {
    month: season
    for season, months in SeasonsDict.items()
    for month in months
}

# astype(int) keeps the original int() coercion of the month values.
data['Season'] = data['Month'].astype(int).map(MonthToSeason)

# Series.map yields NaN for keys missing from the dict; fail loudly rather
# than carrying unlabeled rows into later cells.
assert data['Season'].notna().all(), 'Month values outside 1-12 cannot be mapped to a Season'

data['Season'].value_counts()

Fall      189441
Winter    126766
Summer    108375
Spring    100296
Name: Season, dtype: int64

In [120]:
# Sanity check: rows labelled 'Fall' should only contain months 9, 10 and 11.
data.loc[data['Season'] == 'Fall', 'Month'].unique()

array([ 9, 10, 11], dtype=int64)

In [121]:
# Sanity check: rows labelled 'Summer' should only contain months 6, 7 and 8.
data.loc[data['Season'] == 'Summer', 'Month'].unique()

array([6, 7, 8], dtype=int64)

In [122]:
# Display the frame to inspect the newly added level and Season columns.
# NOTE(review): consider data.head() here to keep the saved output small.
data

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Member,Year,Month,Day,Hour,WeekDay,WeekDayCase,DayPart,Desc Char Length,Desc Word Length,Repeated Word : bag,Repeated Word : red,Repeated Word : heart,Repeated Word : retrospot,Repeated Word : vintage,Repeated Word : design,Repeated Word : pink,Repeated Word : christmas,Repeated Word : box,Repeated Word : kitchen,Continent,Latt,Long,TotalPrice,Quantity Level,UnitPrice Level,TotalPriceLevel,Season
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,34,5,0,0,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,15.30,Low,Medium,Medium,Winter
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,19,3,0,0,0,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,20.34,Low,Medium,Expensive,Winter
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,0,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,22.00,Low,Medium,Expensive,Winter
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,35,6,0,0,0,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,20.34,Low,Medium,Expensive,Winter
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,1,2010,12,1,8,Wednesday,WeekDay,Morning,30,5,0,1,1,0,0,0,0,0,0,0,Europe,51.507336,-0.127650,20.34,Low,Medium,Expensive,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524873,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,27,5,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,10.20,Medium,Cheap,Medium,Winter
524874,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,28,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,12.60,Low,Medium,Medium,Winter
524875,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,29,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,16.60,Low,Medium,Medium,Winter
524876,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,1,2011,12,9,12,Friday,WeekDay,Noon,31,4,0,0,0,0,0,0,0,0,0,0,Europe,48.853495,2.348391,16.60,Low,Medium,Medium,Winter


# 5) Data Visualization