## Solution for Lab 3

Data Mining course <br>
Student: Danis Alukaev <br>
Email: d.alukaev@innopolis.university <br> 
Group: B19-DS-01 <br>

### 0. Prerequisites

In [1]:
import pandas as pd
import numpy as np
import itertools

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import balanced_accuracy_score

import warnings
warnings.filterwarnings("ignore")

### 1. Data Preparation

In [27]:
def load_data(sample=None):
    df = pd.read_csv("./data/completed_orders.csv")
    if sample:
        size = int(len(df) * sample)
        df = df.sample(n=size)
    return df

In [28]:
def get_map():
    data = pd.read_csv("./data/completed_orders.csv")
    df = data.copy()[["StockCode", "Description"]]
    df.drop_duplicates(inplace=True)
    df = df.set_index("StockCode")
    return df

In [32]:
def get_all_combinations(df, sample=None):
    all_stockcodes, all_customers = set(df["StockCode"].to_numpy()), set(df["CustomerID"].to_numpy())
    all_combinations = np.array(list(itertools.product(all_customers, all_stockcodes)))
    all_combinations = pd.DataFrame(data=all_combinations, columns=["CustomerID", "StockCode"])
    if sample:
        size = int(len(all_combinations) * sample)
        all_combinations = all_combinations.sample(n=size)
    return all_combinations

In [8]:
def split_data(df, all_combinations):
    train, test = train_test_split(df, test_size=0.2, shuffle=False)
    train["Reorder"], test["Reorder"] = 1, 1
    train = pd.merge(train, all_combinations, how="outer", on=["CustomerID", "StockCode"]).fillna(0)
    test = pd.merge(test, all_combinations, how="outer", on=["CustomerID", "StockCode"]).fillna(0)
    train.drop_duplicates(inplace=True)
    test.drop_duplicates(inplace=True)
    train = train.sample(frac=1.0)
    test = test.sample(frac=1.0)
    return train, test

In [38]:
df = load_data(sample=0.05)
data = df.copy()[["CustomerID", "StockCode"]]
all_combinations = get_all_combinations(data, sample=0.012)
train, test = split_data(data, all_combinations)

### 2. PCA

In [7]:
def encode_data(df):
    X, y = df[["CustomerID", "StockCode"]], df["Reorder"]
    all_stockcodes, all_customers = set(df["StockCode"].to_numpy()), set(df["CustomerID"].to_numpy())
    
    encoder_customer = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder_stockcodes = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    encoder_customer.fit(np.array(list(all_customers))[..., np.newaxis])
    encoder_stockcodes.fit(np.array(list(all_stockcodes))[..., np.newaxis])
    
    customer_encoded = pd.DataFrame(encoder_customer.transform(X["CustomerID"][..., np.newaxis]))
    customer_encoded.rename(columns={i: f"c{i}" for i in range(len(encoder_customer.categories_[0]))}, inplace=True)
    
    stockcodes_encoded = pd.DataFrame(encoder_stockcodes.transform(X["StockCode"][..., np.newaxis]))
    stockcodes_encoded.rename(columns={i: f"s{i}" for i in range(len(encoder_stockcodes.categories_[0]))}, inplace=True)
    
    X = pd.concat([customer_encoded, stockcodes_encoded], axis=1)
    
    del stockcodes_encoded
    del customer_encoded
    
    return X, y

In [8]:
X_train, y_train = encode_data(train)

In [9]:
X_test, y_test = encode_data(test)

In [10]:
pca = PCA(n_components=12)

X_train = pca.fit_transform(X_train)
X_test = pca.fit_transform(X_test)

In [11]:
pca.explained_variance_

array([0.00810234, 0.00781196, 0.00685646, 0.00655999, 0.00644618,
       0.00574809, 0.00564132, 0.00559729, 0.00540488, 0.00521254,
       0.00508378, 0.00501808])

### 3. Building Model

In [45]:
from sklearn.svm import SVC

model = SVC(kernel='rbf')
model.fit(X_train,
          y_train)
y_pred = model.predict(X_test)

In [14]:
model = CatBoostClassifier(iterations=10000,
                           task_type="GPU",
                           learning_rate=0.03,
                           depth=6,
                           devices='0',
                           eval_metric='AUC')

In [15]:
model.fit(X_train,
          y_train,
          verbose=True)

0:	learn: 0.7874831	total: 31.1ms	remaining: 5m 10s
1:	learn: 0.7960962	total: 60.6ms	remaining: 5m 3s
2:	learn: 0.7987550	total: 90.1ms	remaining: 5m
3:	learn: 0.8033640	total: 120ms	remaining: 4m 59s
4:	learn: 0.8050309	total: 149ms	remaining: 4m 58s
5:	learn: 0.8057235	total: 177ms	remaining: 4m 54s
6:	learn: 0.8062866	total: 205ms	remaining: 4m 52s
7:	learn: 0.8071011	total: 233ms	remaining: 4m 51s
8:	learn: 0.8071733	total: 261ms	remaining: 4m 49s
9:	learn: 0.8082732	total: 288ms	remaining: 4m 47s
10:	learn: 0.8085523	total: 316ms	remaining: 4m 47s
11:	learn: 0.8087152	total: 344ms	remaining: 4m 45s
12:	learn: 0.8088478	total: 371ms	remaining: 4m 44s
13:	learn: 0.8090649	total: 398ms	remaining: 4m 43s
14:	learn: 0.8097957	total: 426ms	remaining: 4m 43s
15:	learn: 0.8100431	total: 453ms	remaining: 4m 42s
16:	learn: 0.8104689	total: 480ms	remaining: 4m 42s
17:	learn: 0.8104995	total: 508ms	remaining: 4m 41s
18:	learn: 0.8104457	total: 535ms	remaining: 4m 40s
19:	learn: 0.8106951	tot

<catboost.core.CatBoostClassifier at 0x7fe058a3b470>

In [16]:
y_pred = model.predict(X_test)

In [46]:
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
result = f"Balanced accuracy: {balanced_accuracy:.2f}"
print(result)

Balanced accuracy: 0.50


### 4. Inference

In [43]:
def predict_cart(customer_id):
    all_stockcodes, all_customers = set(df["StockCode"].to_numpy()), set(df["CustomerID"].to_numpy())
    X = np.array(list(itertools.product([customer_id], all_stockcodes)))
    X = pd.DataFrame(data=X, columns=["CustomerID", "StockCode"])
    
    encoder_customer = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder_stockcodes = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    encoder_customer.fit(np.array(list(all_customers))[..., np.newaxis])
    encoder_stockcodes.fit(np.array(list(all_stockcodes))[..., np.newaxis])
    
    customer_encoded = pd.DataFrame(encoder_customer.transform(X["CustomerID"][..., np.newaxis]))
    customer_encoded.rename(columns={i: f"c{i}" for i in range(len(encoder_customer.categories_[0]))}, inplace=True)
    
    stockcodes_encoded = pd.DataFrame(encoder_stockcodes.transform(X["StockCode"][..., np.newaxis]))
    stockcodes_encoded.rename(columns={i: f"s{i}" for i in range(len(encoder_stockcodes.categories_[0]))}, inplace=True)
    
    X = pd.concat([customer_encoded, stockcodes_encoded], axis=1)
    
    cart_ = model.predict(X)
    
    mapping = get_map()
    
    indices = np.nonzero(cart_)[0]
    cart = []
    for idx in indices:
        index = encoder_stockcodes.categories_[0][idx]
        product = mapping.iloc[mapping.index.get_loc(index)]
        if len(product) > 1:
            product = product.iloc[0]

        cart.append(product.Description)
    
    return cart 

In [44]:
predict_cart(17799)

['inflatable political globe ',
 'doggy rubber',
 'colouring pencils brown tube',
 'asstd design racing car pen',
 'sandalwood fan',
 'red paper parasol',
 'edwardian parasol black',
 'fairy cake design umbrella',
 'small folding scissor(pointed edge)',
 'large chinese style scissor',
 'mini highlighter pens',
 'pop art push down rubber ',
 'teatime push down rubber',
 'wrap pink fairy cakes ',
 'wrap english rose ',
 'pink strawberry handbag ',
 'cartoon  pencil sharpeners',
 'rattle snake eggs',
 'kitty pencil erasers',
 'maxwell 2 tone blue 60 page photo a',
 'swirly circular rubbers in bag',
 's/3 pot pouri cushions blue colours',
 'origami sandlewood incense/cand set',
 'namaste swagat incense',
 'porcelain budah incense holder',
 'dragons blood incense',
 'vanilla incense in tin',
 "flower fairy,5 summer b'draw liners",
 'blue glass gems in bag',
 'blue stones on wire for candle',
 'essential balm 3.5g tin in envelope',
 'white tall porcelain t-light holder',
 'queen of the skies