In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from joblib import parallel_backend
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

In [3]:
df = pd.read_csv('/content/styles.csv',on_bad_lines='skip')

In [5]:
label_encoders = {}
for column in ['gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage','year','productDisplayName']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

df = df.dropna()# drop null val
print(label_encoders)

{'gender': LabelEncoder(), 'masterCategory': LabelEncoder(), 'subCategory': LabelEncoder(), 'articleType': LabelEncoder(), 'baseColour': LabelEncoder(), 'season': LabelEncoder(), 'usage': LabelEncoder(), 'year': LabelEncoder(), 'productDisplayName': LabelEncoder()}


In [8]:
X = df[['gender', 'masterCategory', 'subCategory', 'season', 'year', 'productDisplayName','baseColour']]
y_articleType = df['articleType']
y_usage = df['usage']

In [9]:
min_samples = 2
articleType_counts = y_articleType.value_counts()
usage_counts = y_usage.value_counts()

filtered_articleType = articleType_counts[articleType_counts >= min_samples].index
filtered_usage = usage_counts[usage_counts >= min_samples].index

filtered_df = df[df['articleType'].isin(filtered_articleType) &
                 df['usage'].isin(filtered_usage)]

In [10]:
X = filtered_df[['gender', 'masterCategory', 'subCategory', 'season', 'year', 'productDisplayName','baseColour']]
y_articleType = filtered_df['articleType']
y_usage = filtered_df['usage']

In [11]:
le_articleType = LabelEncoder()
y_articleType_encoded = le_articleType.fit_transform(y_articleType)

In [12]:


X_train, X_test, y_train_articleType, y_test_articleType = train_test_split(X, y_articleType_encoded, test_size=0.2, random_state=42, stratify=y_articleType_encoded)
_, _, y_train_usage, y_test_usage = train_test_split(X, y_usage, test_size=0.2, random_state=42, stratify=y_usage)

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'gamma': [0]
}

In [15]:
rf_articleType = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    verbose=2
)

In [18]:
xgb_articleType = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=10,  #
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [19]:
with parallel_backend('threading'):
    rf_articleType.fit(X_train_scaled, y_train_articleType)
    xgb_articleType.fit(X_train_scaled, y_train_articleType, eval_set=[(X_test_scaled, y_test_articleType)], early_stopping_rounds=10, verbose=False)


best_rf_articleType = rf_articleType.best_estimator_
best_xgb_articleType = xgb_articleType.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits




[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  18.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  18.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  21.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  21.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  21.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  36.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  29.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  28.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  29.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estim



Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.5min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.5min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 3.0min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.9min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.9min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.9min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.6min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.9min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.6min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.5min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.5min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.5min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 3.1min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 3.0min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 2.8min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 3.1min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.3min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8; total time= 2.9min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.3min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.3min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time= 1.3min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.7min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.6min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.6min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.6min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time= 2.6min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time= 1.4min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time= 2.8min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time= 2.8min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time= 2.8min




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time= 2.8min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.8; total time= 1.3min




In [23]:
y_pred_rf_articleType = best_rf_articleType.predict(X_test_scaled)
y_pred_xgb_articleType = best_xgb_articleType.predict(X_test_scaled)
accuracy_rf_articleType = accuracy_score(y_test_articleType, y_pred_rf_articleType)
accuracy_xgb_articleType = accuracy_score(y_test_articleType, y_pred_xgb_articleType)
print(f'Accuracy for RandomForestClassifier (articleType): {accuracy_rf_articleType:.2f}')
print(f'Accuracy for XGBoostClassifier (articleType): {accuracy_xgb_articleType:.2f}')


Accuracy for RandomForestClassifier (articleType): 0.82
Accuracy for XGBoostClassifier (articleType): 0.79
