In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('cleaned_jeans_data.csv')

In [3]:
df.head()

Unnamed: 0,brand,distress,waist_rise,length,fit,number_of_pockets,stretch,price,rating,number_of_ratings
0,Marks & Spencer,Clean Look,Mid-Rise,Regular,Slim Fit,5,Non Stretchable,1119,4.8,13
1,Levis,Clean Look,Mid-Rise,Regular,Slim Fit,5,Stretchable,1499,4.4,259
2,Urbano Fashion,Clean Look,Mid-Rise,Regular,Slim Fit,5,Stretchable,601,3.9,17100
3,Levis,Clean Look,Mid-Rise,Regular,Tapered Fit,5,Stretchable,1829,4.1,96
4,Jack & Jones,Clean Look,Mid-Rise,Regular,Bootcut,5,Stretchable,2124,4.4,148


In [5]:
df['brand'].unique()

array(['Marks & Spencer', 'Levis', 'Urbano Fashion', 'Jack & Jones',
       'The Indian Garage Co', 'HIGHLANDER', 'Roadster', 'Bene Kleed',
       'Mast & Harbour', 'Hubberholme', 'HERE&NOW', 'Being Human',
       'WROGN', 'Ducati', 'Bewakoof', 'Urbano Plus', 'RARE RABBIT',
       'Kook N Keech', 'Powerlook', 'AD By Arvind',
       'VAN HEUSEN DENIM LABS', 'Blackberrys', 'Nautica',
       'Flying Machine', 'Moda Rapido', 'KETCH', 'Sztori',
       'Forca by Lifestyle', 'Dennis Lingo', 'Styli', 'French Connection',
       'SF JEANS by Pantaloons', 'V-Mart',
       'R.Code by The Roadster Life Co.', 'High Star', 'FREAKINS',
       'Pepe Jeans', 'SELECTED', 'Turtle', 'Red Tape',
       'BEAT LONDON by PEPE JEANS', 'Campus Sutra', 'PARIS HAMILTON',
       'Thomas Scott', 'Mufti', 'MASCLN SASSAFRAS',
       'Louis Philippe Jeans', 'SPYKAR', 'Code 61', 'CINOCCI',
       'Indian Terrain', 'max', 'Style Quotient', 'FCUK', 'RAGZO',
       'Wrangler', 'Aeropostale', 'Mr Bowerbird', 'Jb Just BLACK

In [4]:
# Label Encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

le1 = LabelEncoder()
le2 = LabelEncoder()

df['fit'] = le1.fit_transform(df['fit'])
df['brand'] = le2.fit_transform(df['brand'])

In [5]:
X = df.drop('price', axis = 1)
y = df['price']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# One-Hot Encoding

oh_cat_features = ['distress', 'waist_rise', 'number_of_pockets', 'stretch', 'length']
le_cat_features = ['brand', 'fit']

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

oh_transformer = OneHotEncoder(drop = 'first')


preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", OneHotEncoder(drop='first'), oh_cat_features),
        ("LabelEncoderFit", 'passthrough', ['fit']),
        ("LabelEncoderBrand", 'passthrough', ['brand'])
    ], remainder='passthrough'
)

In [8]:
X_train = preprocessor.fit_transform(X_train)

In [9]:
X_test = preprocessor.transform(X_test)

## Random Forest Model

In [10]:
from sklearn.ensemble import RandomForestRegressor

randomForest = RandomForestRegressor()
randomForest.fit(X_train, y_train)

In [11]:
y_pred = randomForest.predict(X_test)

In [12]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("r2", r2_score(y_test, y_pred))
print("mean_absolute_error", mean_absolute_error(y_test, y_pred))
print("mean_squared_error", mean_squared_error(y_test, y_pred))

r2 0.7004789982964468
mean_absolute_error 214.28323451523002
mean_squared_error 233249.9056577142


## Hypertunned Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

randomForest = RandomForestRegressor(n_estimators=100, min_samples_split=2, max_features=None, max_depth=100, criterion='poisson')
randomForest.fit(X_train, y_train)

In [14]:
y_pred_hypertunned = randomForest.predict(X_test)

In [15]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("r2", r2_score(y_test, y_pred_hypertunned))
print("mean_absolute_error", mean_absolute_error(y_test, y_pred_hypertunned))
print("mean_squared_error", mean_squared_error(y_test, y_pred_hypertunned))


r2 0.7300437489898585
mean_absolute_error 210.35207719140755
mean_squared_error 210226.56081441237


In [18]:
# Exporting the model
import pickle

pickle.dump(randomForest, open('randomForest.pkl', 'wb'))
pickle.dump(le1,  open('LabelEncoderFit.pkl', 'wb'))
pickle.dump(le2,  open('LabelEncoderBrand.pkl', 'wb'))
pickle.dump(preprocessor,  open('preprocessor.pkl', 'wb'))


## Adaboost

In [19]:
from sklearn.ensemble import AdaBoostRegressor

adaBoost = AdaBoostRegressor()

In [20]:
adaBoost.fit(X_train, y_train)

In [21]:
y_pred_adaboost = adaBoost.predict(X_test)

In [22]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("r2", r2_score(y_test, y_pred_adaboost))
print("mean_absolute_error", mean_absolute_error(y_test, y_pred_adaboost))
print("mean_squared_error", mean_squared_error(y_test, y_pred_adaboost))


r2 0.23281371712962773
mean_absolute_error 606.1247279719837
mean_squared_error 597441.0044158314


## Gradient Boosting

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

gradient = GradientBoostingRegressor()

In [24]:
gradient.fit(X_train, y_train)

In [25]:
y_pred_gradient = gradient.predict(X_test)

In [26]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("r2", r2_score(y_test, y_pred_gradient))
print("mean_absolute_error", mean_absolute_error(y_test, y_pred_gradient))
print("mean_squared_error", mean_squared_error(y_test, y_pred_gradient))


r2 0.5277706979473425
mean_absolute_error 315.0807317975113
mean_squared_error 367745.29841352324


## Hyperparameter Tunning

In [35]:
gradient_params = {
            "max_depth": [100, 110, 90, 115, 95],
            "loss": ['squared_error', 'absolute_error', 'huber', 'quantile'],
            "learning_rate" : [0.01, 0.1, 0.5, 0.9],
            "subsample" : [0.4, 0.7, 0.8 ,0.9, 1],
            "min_samples_split": [2, 3, 4, 5],
            "n_estimators": [100, 110, 90, 95],
            "criterion" : ["squared_error", "friedman_mse"],
            "min_samples_leaf" : [10, 50, 100, 150, 200],
            "min_weight_fraction_leaf" : [0.01, 0.1, 0.2, 0.3, 0.5]
         }

In [45]:
from sklearn.model_selection import GridSearchCV

randomizedCV = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=gradient_params, cv=3, verbose=3, n_jobs=-1)

In [None]:
randomizedCV.fit(X_train, y_train)

Fitting 3 folds for each of 320000 candidates, totalling 960000 fits


In [38]:
y_pred_randomizedCV = randomizedCV.predict(X_test)

In [39]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print("r2", r2_score(y_test, y_pred_randomizedCV))
print("mean_absolute_error", mean_absolute_error(y_test, y_pred_randomizedCV))
print("mean_squared_error", mean_squared_error(y_test, y_pred_randomizedCV))


r2 0.5412239537453591
mean_absolute_error 324.3871333604868
mean_squared_error 357268.66863521404


In [40]:
randomizedCV.best_params_

{'subsample': 0.4,
 'n_estimators': 110,
 'min_weight_fraction_leaf': 0.01,
 'min_samples_split': 4,
 'min_samples_leaf': 100,
 'max_depth': 95,
 'loss': 'squared_error',
 'learning_rate': 0.9,
 'criterion': 'squared_error'}