Author: Ahmed Sobhi

Creation_date: 3th Aug 2023

Objective: Data Modeling Experimental using W&B Tools

# Importing required libararies and packages

In [36]:
import warnings
warnings.filterwarnings('ignore')

import os

import pandas as pd
import numpy as np

# Used for visulization
import matplotlib.pyplot as plt

# For iteration visulization purpose
from tqdm import tqdm

# Import defined tools script
import sys

# adding tools script into system path
# sys.path.insert(0, '')
from tools import featureengineering, datapreprocessing, pipelinetransformers

# Used for datasplitting, modeling
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# To save the model locally
import joblib

# Import wandb
import wandb

# Data loading

In [37]:
# Loading raw dataset
df_raw = pd.read_csv('data/data.csv')

df_raw.head()

Unnamed: 0,country,article,sales,regular_price,current_price,ratio,retailweek,promo1,promo2,customer_id,...,style,sizes,gender,rgb_r_main_col,rgb_g_main_col,rgb_b_main_col,rgb_r_sec_col,rgb_g_sec_col,rgb_b_sec_col,label
0,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,...,slim,"xxs,xs,s,m,l,xl,xxl",women,205,104,57,255,187,255,0
1,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,...,regular,"xxs,xs,s,m,l,xl,xxl",women,188,238,104,255,187,255,0
2,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,...,regular,"xxs,xs,s,m,l,xl,xxl",women,205,173,0,255,187,255,0
3,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,...,regular,"xxs,xs,s,m,l,xl,xxl",kids,205,140,149,164,211,238,0
4,Germany,YN8639,28,5.95,3.95,0.663866,2016-03-27,0,0,1003.0,...,regular,"xxs,xs,s,m,l,xl,xxl",women,138,43,226,164,211,238,0


In [38]:
df_raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sales,9999.0,57.820882,89.77373,1.0,10.0,26.0,62.5,789.0
regular_price,9999.0,52.311736,33.547643,3.95,26.45,43.95,76.95,197.95
current_price,9999.0,28.201225,21.803522,1.95,12.45,22.95,36.95,140.95
ratio,9999.0,0.545694,0.191688,0.298246,0.354839,0.532554,0.694823,1.0
promo1,9999.0,0.057006,0.231865,0.0,0.0,0.0,0.0,1.0
promo2,9999.0,0.006001,0.077235,0.0,0.0,0.0,0.0,1.0
customer_id,9999.0,2720.894489,1941.263157,3.0,1011.5,1989.0,4618.5,5988.0
cost,9999.0,6.516782,3.915039,1.29,2.29,5.2,9.6,13.29
rgb_r_main_col,9999.0,161.40224,39.793297,79.0,138.0,181.0,205.0,205.0
rgb_g_main_col,9999.0,139.60026,63.647856,26.0,104.0,148.0,181.0,238.0


# Train Test split

In [39]:
target_col = 'label'
stratify_col = 'country'

# Select features only
df_features = df_raw.drop(target_col, axis=1)

df_y = df_raw[target_col]

x_temp, x_test, y_temp, y_test = train_test_split(
    df_features,
    df_y,
    test_size=0.2,
    random_state=42,
    stratify=df_features[stratify_col]
)

df_temp = pd.concat([x_temp, y_temp], axis=1)

# Apply data preprocessing
df_temp = datapreprocessing.data_preprocess(df_temp)

# Feature engineering
df_temp = featureengineering.feature_engineereing(df_temp)

x_temp = df_temp.drop(target_col, axis=1)
y_temp = df_temp[target_col]

x_train, x_val, y_train, y_val = train_test_split(
    x_temp,
    y_temp,
    test_size=0.2,
    random_state=42,
    stratify=x_temp[stratify_col]
)

In [40]:
# Diplay shapes
print('Train set shape:', x_train.shape, y_train.shape)
print('Validation set shape:', x_val.shape, y_val.shape)
print('Test set shape:', x_test.shape, y_test.shape)

Train set shape: (5447, 21) (5447,)
Validation set shape: (1362, 21) (1362,)
Test set shape: (2000, 23) (2000,)


# Preprocessing and Features Engineering

# Pipeline Steps

In [41]:
# Extract numerical|Categorical features
numeric_features = x_train.select_dtypes('number').columns.tolist()

categorical_features = x_train.select_dtypes('object').columns.tolist()

imputer_numeric = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='most_frequent')

numeric_transformer = Pipeline(
    [
        ('impute', imputer_numeric),
        ('scaler', StandardScaler())
    ]
)

categorical_transformer = Pipeline(
    [
        ('impute', imputer_categorical),
        ('ohc', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    [
        ('numericals', numeric_transformer, numeric_features),
        ('Categorical', categorical_transformer, categorical_features)
    ],
    remainder='drop',
    n_jobs=-1
)

In [42]:
# # # Testing
# preprocessor.fit(x_train)

# preprocessor.transform(x_train)

# Model Pipeline

In [43]:
def train_model(process_pipeline, clf, config):
    "Train a model with given configuration"

    global x_train, y_train, x_val, y_val

    # Start wandb run
    wandb.init(project='sports_store', config=config)

    # Define model
    model = clf(**config)

    # Add to pipeline
    model_pipeline = Pipeline(
        steps=[
            ('col_trans', process_pipeline),
            ('model', model)
        ]
    )

    # Fit the model to training dataset
    model_pipeline.fit(x_train, y_train)
    
    # calculate the accuracy
    pred_train = model_pipeline.predict(x_train)

    pred_val = model_pipeline.predict(x_val)

    acc_train = accuracy_score(y_train, pred_train)
    
    acc_val = accuracy_score(y_val, pred_val)

    conf_matrix_train = confusion_matrix(y_train, pred_train)

    conf_matrix_val = confusion_matrix(y_val, pred_val)
    
    # Log metrices over time to visulaization performance
    wandb.log({
        'train_acc':acc_train,
        'val_acc':acc_val,
        'train_confusion_matrix':conf_matrix_train,
        'val_confusion_matrix':conf_matrix_val
    })

    # Finish wandb
    wandb.finish()

Try to rerun the following cell, with changing the configuration parameters.

In [45]:
initial_lr = 0.05

config = {
    'objective':'binary:logistic',
    'colsample_bytree':0.5,
    'learning_rate':initial_lr,
    'max_depth':10,
    'min_child_weight':1,
    'n_estimators':1000,
    'subsample':0.99,
    'verbosity':1,
    'n_jobs':-1
    }

train_model(preprocessor, XGBClassifier, config)




VBox(children=(Label(value='0.002 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.212092…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,1.0
val_acc,0.83113


![image](https://github.com/AhmedYousriSobhi/aCupOfTea/assets/66730765/bf290d38-4937-4f8c-9c04-929987ba70d8)
