In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from joblib import parallel_backend
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv('/content/styles.csv',on_bad_lines='skip')

In [None]:
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [3]:

label_encoders = {}
for column in ['gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'usage','year','productDisplayName']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

df = df.dropna()# drop null val
print(label_encoders)


{'gender': LabelEncoder(), 'masterCategory': LabelEncoder(), 'subCategory': LabelEncoder(), 'articleType': LabelEncoder(), 'baseColour': LabelEncoder(), 'season': LabelEncoder(), 'usage': LabelEncoder(), 'year': LabelEncoder(), 'productDisplayName': LabelEncoder()}


In [4]:
X = df[['gender', 'masterCategory', 'subCategory', 'season', 'year', 'productDisplayName','baseColour']]
y_articleType = df['articleType']
y_usage = df['usage']

In [6]:
min_samples = 2
articleType_counts = y_articleType.value_counts()
usage_counts = y_usage.value_counts()

filtered_articleType = articleType_counts[articleType_counts >= min_samples].index
filtered_usage = usage_counts[usage_counts >= min_samples].index

filtered_df = df[df['articleType'].isin(filtered_articleType) &
                 df['usage'].isin(filtered_usage)]

In [7]:
X = filtered_df[['gender', 'masterCategory', 'subCategory', 'season', 'year','baseColour']]
y_articleType = filtered_df['articleType']
y_usage = filtered_df['usage']

In [8]:
X_train, X_test, y_train_articleType, y_test_articleType = train_test_split(X, y_articleType, test_size=0.2, random_state=42, stratify=y_articleType)
_, _, y_train_usage, y_test_usage = train_test_split(X, y_usage, test_size=0.2, random_state=42, stratify=y_usage)

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [11]:
rf_articleType = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf,
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    verbose=2
)

In [12]:
cv = StratifiedKFold(n_splits=5)
with parallel_backend('threading'):

    grid_search_articleType = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=2)
    grid_search_articleType.fit(X_train, y_train_articleType)
    best_rf_articleType = grid_search_articleType.best_estimator_


    grid_search_usage = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=2)
    grid_search_usage.fit(X_train, y_train_usage)
    best_rf_usage = grid_search_usage.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits




[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estim

In [13]:
y_pred_articleType = best_rf_articleType.predict(X_test)
y_pred_usage = best_rf_usage.predict(X_test)

# Evaluate
accuracy_articleType = accuracy_score(y_test_articleType, y_pred_articleType)
accuracy_usage = accuracy_score(y_test_usage, y_pred_usage)

print(f'Accuracy for articleType: {accuracy_articleType:.2f}')
print(f'Accuracy for usage: {accuracy_usage:.2f}')

Accuracy for articleType: 0.69
Accuracy for usage: 0.77


In [14]:
def decode_predictions(predictions, column):
    return label_encoders[column].inverse_transform(predictions)

In [49]:
# Example data to predict
example_data = {
    'gender': 'Men',
    'masterCategory': 'Apparel',
    'subCategory': 'Bottomwear',
    'season': 'Summer',
    'year': 2016,
    'baseColour': 'Pink'
}

# Create a DataFrame from the example data
example_df = pd.DataFrame([example_data])

# Encode example data using label encoders
for column in example_df.columns:
    if column in label_encoders:
        le = label_encoders[column]
        example_df[column] = le.transform(example_df[column])

# Ensure the same order of columns as used in training
encoded_example = example_df[['gender', 'masterCategory', 'subCategory', 'season', 'year','baseColour']]

# Scale the numerical feature
encoded_example_scaled = scaler.transform(encoded_example)


In [50]:
# Predictions
predicted_articleType = best_rf_articleType.predict(encoded_example_scaled)
predicted_usage = best_rf_usage.predict(encoded_example_scaled)

# Decode predictions
def decode_predictions(predictions, column_name):
    """
    Decode the numerical predictions back to original labels using the label encoders.

    :param predictions: List or array of numerical predictions.
    :param column_name: The name of the column to decode.
    :return: List of decoded labels.
    """
    if column_name in label_encoders:
        le = label_encoders[column_name]
        return le.inverse_transform(predictions)
    else:
        raise ValueError(f"Label encoder for {column_name} not found.")

decoded_articleType = decode_predictions(predicted_articleType, 'articleType')

decoded_usage = decode_predictions(predicted_usage, 'usage')

# Print predictions
print(f"Predicted Article Type: {decoded_articleType[0]}")
print(f"Predicted Usage: {decoded_usage[0]}")


Predicted Article Type: Shorts
Predicted Usage: Casual
