In [40]:
#! pip install xgboost
#!pip install TextBlob
#!pip install nltk
#!pip install tensorflow[and-cuda]
#!pip install transformers

Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.20.3-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [38]:
from datetime import datetime, timedelta,date
import pandas as pd
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
import os

#do not show warnings
import warnings
warnings.filterwarnings("ignore")

#import machine learning related libraries
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [24]:
import nltk
import re

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

from textblob import TextBlob
from textblob.wordnet import Synset

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /home/jupyter/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [4]:
home_dir = '/home/jupyter/data_science_challenges/business_data_science'

data_dir = os.path.join(home_dir, 'data/external/OnlineRetail.csv')

df = pd.read_csv(data_dir, encoding="unicode_escape").drop_duplicates().dropna().reset_index(drop=True)

# Changing data types
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# time series variables
df['date'] = pd.to_datetime(df.InvoiceDate).dt.date
df['year'] = pd.to_datetime(df.InvoiceDate).dt.year
df['month'] = pd.to_datetime(df.InvoiceDate).dt.month
df['day'] = pd.to_datetime(df.InvoiceDate).dt.day
df['yearmo'] = pd.to_datetime(df.InvoiceDate).dt.strftime('%Y%m')
# new variable

df['Revenue'] = df['Quantity'] * df['UnitPrice']

df = df.loc[df.Country == 'United Kingdom']

In [5]:
def pp(func):
    print('PeePee')
    func()
    
@pp
def pp2():
    print('PeePee2')

PeePee
PeePee2


In [6]:
def get_recency_features(df):
    
    df['Recency'] =  (pd.to_datetime(df.InvoiceDate).max() - df.InvoiceDate).dt.days 
    r_df = df.groupby(['CustomerID']).agg({'InvoiceDate': 'min'}).reset_index()\
    .merge(df[['CustomerID', 'InvoiceDate','Recency']], on=['CustomerID','InvoiceDate'], how='inner')\
    .drop_duplicates().reset_index(drop=True).drop(['InvoiceDate'], axis=1).rename(columns={'Recency':'RecencyToDate'})
    
    return r_df

def get_frequency_features(df):
    
    f_df = df[['InvoiceDate','CustomerID']].drop_duplicates().sort_values(by=['CustomerID','InvoiceDate'], ascending=True).reset_index(drop=True)

    #Creating difference between invoice dates per person
    f_df['Offset'] = f_df.groupby(['CustomerID'])['InvoiceDate'].diff().dt.seconds

    # Creating Count per user
    f_df = f_df.merge(\
        f_df.groupby(['CustomerID']).agg(Count = ('InvoiceDate', 'count')).reset_index()\
               ,on=['CustomerID'], how='inner')

    # Finding folks who only had one interaction and giving them an Offset that is differenced from the latest day in the dataset
    f_df.loc[f_df.Count == 1, 'Offset'] = (f_df.InvoiceDate.max() - f_df.InvoiceDate).dt.seconds

    # Dropping the first occurance of a invoicedate.  Not needed to make agg.
    f_df = f_df.loc[~(f_df.Offset.isna())]

    f_df = f_df.groupby(['CustomerID']).agg(FrequencyToDate = ('Offset','median')).reset_index()
    
    return f_df
    

def get_monetary_features(df):
    
    m_df = df[['CustomerID','Revenue']].groupby('CustomerID').agg(RevenueToDate = ('Revenue', 'sum')).reset_index()
    
    return m_df


def get_rfm_features(df):
    
    new_df = df.loc[~df.InvoiceNo.str.contains('C')].copy()
    
    r_df = get_recency_features(new_df)
    f_df = get_frequency_features(new_df)
    m_df = get_monetary_features(new_df)
    
    final_df = r_df.merge(f_df, on='CustomerID', how='inner').merge(m_df, on='CustomerID', how='inner')
    
    return final_df
    

In [7]:
work_df = df.merge(get_rfm_features(df), on='CustomerID', how='inner')

In [8]:
work_df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,date,year,month,day,yearmo,Revenue,RecencyToDate,FrequencyToDate,RevenueToDate
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,15.30,373,60.0,5391.21
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,20.34,373,60.0,5391.21
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,22.00,373,60.0,5391.21
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,20.34,373,60.0,5391.21
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,20.34,373,60.0,5391.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356642,581416,22809,SET OF 6 T-LIGHTS SANTA,1,2011-12-08 14:58:00,2.95,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,2.95,0,78660.0,227.39
356643,581416,22807,SET OF 6 T-LIGHTS TOADSTOOLS,2,2011-12-08 14:58:00,1.25,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,2.50,0,78660.0,227.39
356644,581416,72349B,SET/6 PURPLE BUTTERFLY T-LIGHTS,1,2011-12-08 14:58:00,2.10,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,2.10,0,78660.0,227.39
356645,581416,22809,SET OF 6 T-LIGHTS SANTA,2,2011-12-08 14:58:00,2.95,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,5.90,0,78660.0,227.39


In [9]:
def interval_purchase(df):

    t_df =df[['CustomerID','InvoiceNo','InvoiceDate']]\
    .drop_duplicates()\
    .sort_values(by=['CustomerID','InvoiceDate'], ascending=True)\
    .groupby(['CustomerID']).agg({'InvoiceDate': 'diff'}).apply(lambda x: x.dt.days).fillna(0)\
    .reset_index().rename(columns={'index':'CustomerID', 'InvoiceDate':'IntervalPurchase'}).fillna(0)
    
    df = df.merge(t_df, on='CustomerID', how='left')
    
    return df
    

In [10]:
interval_purchase(work_df)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,date,year,month,day,yearmo,Revenue,RecencyToDate,FrequencyToDate,RevenueToDate,IntervalPurchase
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,15.30,373,60.0,5391.21,
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,20.34,373,60.0,5391.21,
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,22.00,373,60.0,5391.21,
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,20.34,373,60.0,5391.21,
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01,2010,12,1,201012,20.34,373,60.0,5391.21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356642,581416,22809,SET OF 6 T-LIGHTS SANTA,1,2011-12-08 14:58:00,2.95,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,2.95,0,78660.0,227.39,
356643,581416,22807,SET OF 6 T-LIGHTS TOADSTOOLS,2,2011-12-08 14:58:00,1.25,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,2.50,0,78660.0,227.39,
356644,581416,72349B,SET/6 PURPLE BUTTERFLY T-LIGHTS,1,2011-12-08 14:58:00,2.10,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,2.10,0,78660.0,227.39,
356645,581416,22809,SET OF 6 T-LIGHTS SANTA,2,2011-12-08 14:58:00,2.95,14569.0,United Kingdom,2011-12-08,2011,12,8,201112,5.90,0,78660.0,227.39,


In [11]:
work_df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country', 'date', 'year', 'month', 'day',
       'yearmo', 'Revenue', 'RecencyToDate', 'FrequencyToDate',
       'RevenueToDate'],
      dtype='object')

In [184]:
 w = work_df.Description.unique()

In [187]:
w

array(['WHITE HANGING HEART T-LIGHT HOLDER', 'WHITE METAL LANTERN',
       'CREAM CUPID HEARTS COAT HANGER', ..., 'LETTER "U" BLING KEY RING',
       'LETTER "W" BLING KEY RING', 'LETTER "Z" BLING KEY RING'],
      dtype=object)

In [188]:
w = [TextBlob(i) for i in w]
w = [i.ngrams(n=3) for i in w]
l = []
for i in w:
    l.extend(i)


In [190]:
pd.DataFrame(pd.DataFrame(l).value_counts()).head(50)
    

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
0,1,2,Unnamed: 3_level_1
SET,OF,4,31
SET,OF,6,27
BLING,KEY,RING,25
HOT,WATER,BOTTLE,24
PACK,OF,12,23
SET,OF,3,23
HOME,SWEET,HOME,21
I,LOVE,LONDON,16
TRAVEL,CARD,WALLET,13
50,'S,CHRISTMAS,12


[WordList(['WHITE', 'HANGING', 'HEART']),
 WordList(['HANGING', 'HEART', 'T-LIGHT']),
 WordList(['HEART', 'T-LIGHT', 'HOLDER']),
 WordList(['WHITE', 'METAL', 'LANTERN']),
 WordList(['CREAM', 'CUPID', 'HEARTS']),
 WordList(['CUPID', 'HEARTS', 'COAT']),
 WordList(['HEARTS', 'COAT', 'HANGER']),
 WordList(['KNITTED', 'UNION', 'FLAG']),
 WordList(['UNION', 'FLAG', 'HOT']),
 WordList(['FLAG', 'HOT', 'WATER']),
 WordList(['HOT', 'WATER', 'BOTTLE']),
 WordList(['RED', 'WOOLLY', 'HOTTIE']),
 WordList(['WOOLLY', 'HOTTIE', 'WHITE']),
 WordList(['HOTTIE', 'WHITE', 'HEART']),
 WordList(['SET', '7', 'BABUSHKA']),
 WordList(['7', 'BABUSHKA', 'NESTING']),
 WordList(['BABUSHKA', 'NESTING', 'BOXES']),
 WordList(['GLASS', 'STAR', 'FROSTED']),
 WordList(['STAR', 'FROSTED', 'T-LIGHT']),
 WordList(['FROSTED', 'T-LIGHT', 'HOLDER']),
 WordList(['HAND', 'WARMER', 'UNION']),
 WordList(['WARMER', 'UNION', 'JACK']),
 WordList(['HAND', 'WARMER', 'RED']),
 WordList(['WARMER', 'RED', 'POLKA']),
 WordList(['RED', 'PO

In [137]:
blob = TextBlob(w)

blob.ngrams(n=3)

[WordList(['WHITE', 'HANGING', 'HEART']),
 WordList(['HANGING', 'HEART', 'T-LIGHT']),
 WordList(['HEART', 'T-LIGHT', 'HOLDER']),
 WordList(['T-LIGHT', 'HOLDER', 'WHITE']),
 WordList(['HOLDER', 'WHITE', 'METAL']),
 WordList(['WHITE', 'METAL', 'LANTERN']),
 WordList(['METAL', 'LANTERN', 'CREAM']),
 WordList(['LANTERN', 'CREAM', 'CUPID']),
 WordList(['CREAM', 'CUPID', 'HEARTS']),
 WordList(['CUPID', 'HEARTS', 'COAT']),
 WordList(['HEARTS', 'COAT', 'HANGER']),
 WordList(['COAT', 'HANGER', 'KNITTED']),
 WordList(['HANGER', 'KNITTED', 'UNION']),
 WordList(['KNITTED', 'UNION', 'FLAG']),
 WordList(['UNION', 'FLAG', 'HOT']),
 WordList(['FLAG', 'HOT', 'WATER']),
 WordList(['HOT', 'WATER', 'BOTTLE']),
 WordList(['WATER', 'BOTTLE', 'RED']),
 WordList(['BOTTLE', 'RED', 'WOOLLY']),
 WordList(['RED', 'WOOLLY', 'HOTTIE']),
 WordList(['WOOLLY', 'HOTTIE', 'WHITE']),
 WordList(['HOTTIE', 'WHITE', 'HEART']),
 WordList(['WHITE', 'HEART', 'SET']),
 WordList(['HEART', 'SET', '7']),
 WordList(['SET', '7', 'BA

'WHITE HANGING HEART T-LIGHT HOLDER WHITE METAL LANTERN CREAM CUPID HEARTS COAT HANGER KNITTED UNION FLAG HOT WATER BOTTLE RED WOOLLY HOTTIE WHITE HEART SET 7 BABUSHKA NESTING BOXES GLASS STAR FROSTED T-LIGHT HOLDER HAND WARMER UNION JACK HAND WARMER RED POLKA DOT EDWARDIAN PARASOL RED RETRO COFFEE MUGS ASSORTED SAVE THE PLANET MUG VINTAGE BILLBOARD DRINK ME MUG VINTAGE BILLBOARD LOVE/HATE MUG WOOD 2 DRAWER CABINET WHITE FINISH WOOD S/3 CABINET ANT WHITE FINISH WOODEN PICTURE FRAME WHITE FINISH WOODEN FRAME ANTIQUE WHITE  EDWARDIAN PARASOL BLACK IVORY EMBROIDERED QUILT  JUMBO SHOPPER VINTAGE RED PAISLEY HAND WARMER RED RETROSPOT GIN AND TONIC MUG COLOUR GLASS STAR T-LIGHT HOLDER YOU\'RE CONFUSING ME METAL SIGN  ASSORTED COLOUR BIRD ORNAMENT POPPY\'S PLAYHOUSE BEDROOM  POPPY\'S PLAYHOUSE KITCHEN FELTCRAFT PRINCESS CHARLOTTE DOLL IVORY KNITTED MUG COSY  BOX OF 6 ASSORTED COLOUR TEASPOONS BOX OF VINTAGE JIGSAW BLOCKS  BOX OF VINTAGE ALPHABET BLOCKS HOME BUILDING BLOCK WORD LOVE BUILDING B

In [112]:
"_".join(w for word in test for w in work_df.Description.unique().split())

NameError: name 'test' is not defined

In [None]:
work_df.merge(\
work_df[['CustomerID','InvoiceNo','InvoiceDate']]\
.drop_duplicates()\
.sort_values(by=['CustomerID','InvoiceNo','InvoiceDate'], ascending=True)\
.groupby(['CustomerID','InvoiceNo'])['InvoiceDate'].diff().dt.days.fillna(0)\
.to_frame().reset_index().rename(columns={'index':'CustomerID'})\
, on=['CustomerID','InvoiceNo'], how='left')

In [None]:
train_df = df.loc[df.InvoiceDate < df.InvoiceDate.min() + timedelta(days=30 * 11)]

In [None]:
test_df = df[~df.index.isin(train_df.index)]

In [None]:
test_df.InvoiceDate.min(), test_df.InvoiceDate.max()