# Import libraries

In [33]:
import pandas as pd
import yaml

# Variable config

## Read yaml file 

In [34]:
# Load the project settings from ../config.yaml (path relative to the working
# directory). safe_load only builds plain Python objects from the YAML.
with open('../config.yaml', 'r') as file:
    config = yaml.safe_load(file)

In [35]:
# Location of the input CSV, read from the DATA_PATH key of the config
# (e.g. '../data/dress_rental_prices news.csv').
data_path = config['DATA_PATH']

# Read data

In [36]:
# Load the raw rental listings from the CSV at data_path into a DataFrame.
df = pd.read_csv(data_path)

In [37]:
df.head()

Unnamed: 0,numbers,ID,Name,Brand,Colour,Catagories,Price
0,0,74416,Runway stripe dress,Stella McCartney,beige,dresses,111
1,1,73815,Reformation Kourtney Dress,Reformation,beige,dresses,50
2,2,73801,Ivory Viola bridal dress,Ghost,beige,dresses,75
3,3,73718,Pasu Dress - Rhino Tusk,Coucoo,beige,dresses,37
4,4,73605,Ellen,RIXO,beige,dresses,47


# Explore data

## data type

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29997 entries, 0 to 29996
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   numbers     29997 non-null  int64 
 1   ID          29997 non-null  int64 
 2   Name        29995 non-null  object
 3   Brand       29997 non-null  object
 4   Colour      29997 non-null  object
 5   Catagories  29997 non-null  object
 6   Price       29997 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 1.6+ MB


## describe

In [39]:
df.describe(percentiles=[.1, .90, .95, .99])

Unnamed: 0,numbers,ID,Price
count,29997.0,29997.0,29997.0
mean,14998.0,42548.3364,61.6688
std,8659.532349,19867.282746,38.138688
min,0.0,122.0,11.0
10%,2999.6,14777.6,32.0
50%,14998.0,42902.0,51.0
90%,26996.4,68848.4,102.0
95%,28496.2,71589.2,139.0
99%,29696.04,73993.12,195.0
max,29996.0,75024.0,793.0


## ydata-profiling

In [40]:
## See exlore_data.ipynb for the ydata-profiling report

## nan value

In [41]:
df.isna().sum()

numbers       0
ID            0
Name          2
Brand         0
Colour        0
Catagories    0
Price         0
dtype: int64

# Transform data

## drop nan value

In [42]:
# Keep only the rows that have a Name (the recorded run shows 2 rows without one).
# .copy() detaches the result from the original frame so the column assignments
# in later cells operate on an independent DataFrame instead of a slice
# (avoids SettingWithCopyWarning).
df = df[df['Name'].notna()].copy()

## drop numbers column

In [43]:
# 'numbers' appears to just repeat the row index, so it is removed.
# Reassigning (instead of inplace=True) and errors='ignore' keep this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='numbers', errors='ignore')

## change data type

In [44]:
# Treat ID as an opaque label rather than a number, and store Brand and Colour
# as categoricals; a single astype call with a column -> dtype mapping.
df = df.astype({'ID': 'object', 'Brand': 'category', 'Colour': 'category'})

## split categories and replace nan with 'no-sub-categories'

In [45]:
# Break the comma-separated 'Catagories' string into one column per position;
# positions a row does not reach are filled with the 'no-sub-categories' marker.
sub_categories = (
    df['Catagories']
    .str.split(',', expand=True)
    .fillna('no-sub-categories')
)
sub_categories.columns = [
    f'sub-Catagories-{position}'
    for position in range(1, sub_categories.shape[1] + 1)
]
# Frame-level astype converts every column to categorical, each independently.
sub_categories = sub_categories.astype('category')
# Swap the raw 'Catagories' column for the expanded sub-category columns.
df = pd.concat([df.drop(columns=['Catagories']), sub_categories], axis=1)

## keep only rows with Price below 400

In [46]:
# Exclusive upper bound on Price; rows at or above it are removed
# (presumably to trim outliers -- the recorded 99th percentile is 195, max 793).
MAX_PRICE = 400
# .copy() detaches the filtered rows from the previous frame so the feature
# columns added in later cells are written to an independent DataFrame
# (avoids SettingWithCopyWarning).
df = df[df['Price'] < MAX_PRICE].copy()

# Feature engineering

## Calculate average price by brand and add as a new column

In [47]:
# Mean rental price per brand, broadcast back onto every row of that brand.
# 'Brand' is categorical, so observed=True restricts grouping to brands that
# actually occur and avoids pandas' FutureWarning about the changing default;
# the transformed values are the same either way.
# NOTE(review): the mean is computed from Price (the prediction target) over all
# rows, before the train/validation/test split -- confirm this leakage is acceptable.
df['Avg_Price_By_Brand'] = df.groupby('Brand', observed=True)['Price'].transform('mean')

  df['Avg_Price_By_Brand'] = df.groupby('Brand')['Price'].transform('mean')


## Calculate average price by (brand, colour) and add as a new column

In [48]:
# Mean rental price per (brand, colour) pair, broadcast back onto every row.
# Both keys are categorical: observed=True groups only the pairs that occur
# instead of the full Brand x Colour product, and avoids pandas' FutureWarning
# about the changing default. The transformed values are unchanged.
# NOTE(review): like the per-brand mean, this is derived from the target over
# all rows before splitting -- confirm this leakage is acceptable.
df['Avg_Price_By_Brand_Colour'] = (
    df.groupby(['Brand', 'Colour'], observed=True)['Price'].transform('mean')
)

  df['Avg_Price_By_Brand_Colour'] = df.groupby(['Brand', 'Colour'])['Price'].transform('mean')


## Create a new column to count the number of sub-categories that are not 'no-sub-categories'

In [49]:
# True wherever a sub-category slot holds a real value rather than the filler.
has_sub_category = sub_categories.ne('no-sub-categories')
# Count the real sub-categories per row; assignment aligns on the row index.
df['Category_Count'] = has_sub_category.sum(axis=1)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29974 entries, 0 to 29996
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   ID                         29974 non-null  object  
 1   Name                       29974 non-null  object  
 2   Brand                      29974 non-null  category
 3   Colour                     29974 non-null  category
 4   Price                      29974 non-null  int64   
 5   sub-Catagories-1           29974 non-null  category
 6   sub-Catagories-2           29974 non-null  category
 7   sub-Catagories-3           29974 non-null  category
 8   sub-Catagories-4           29974 non-null  category
 9   sub-Catagories-5           29974 non-null  category
 10  sub-Catagories-6           29974 non-null  category
 11  sub-Catagories-7           29974 non-null  category
 12  sub-Catagories-8           29974 non-null  category
 13  sub-Catagories-9           29974 non

# Split data into train, validation, and test sets, with feature extraction

In [51]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction import DictVectorizer

In [52]:
def split_dataframe(data, target_column, train_size=0.7, validation_size=0.2, test_size=0.1, stratify=None,
                    random_state=42):
    """
    Vectorize a DataFrame and split it into train, validation, and test sets.

    The target column and the identifier columns 'ID' and 'Name' (when present)
    are removed, the remaining columns are encoded with a dense DictVectorizer
    (string/categorical values become one-hot columns), and the resulting matrix
    is split in two steps: first the test set is carved off, then the remainder
    is divided into train and validation.

    Parameters:
    data (DataFrame): The input DataFrame to be split.
    target_column (str): The name of the target column.
    train_size (float): Accepted for API compatibility but not used; the
        training set receives whatever remains after the validation and test
        sets are removed.
    validation_size (float): Proportion of the whole dataset to place in the
        validation split (0 to 1). Use 0 to skip the validation split.
    test_size (float): Proportion of the whole dataset to place in the test split (0 to 1).
    stratify (str or None): Column to be used for stratification. Default is None.
        The column stays in the feature matrix unless it is the target, 'ID' or 'Name'.
    random_state (int): Seed passed to both splits. Default is 42.

    Returns:
    x_train (ndarray): Training set features.
    y_train (ndarray): Training set target.
    x_validation (ndarray or None): Validation set features (None when validation_size == 0).
    y_validation (ndarray or None): Validation set target (None when validation_size == 0).
    x_test (ndarray): Test set features.
    y_test (ndarray): Test set target.

    Example:
    >>> import pandas as pd
    >>> df = pd.DataFrame({'ID': range(20),
    ...                    'Feature1': [i / 10 for i in range(20)],
    ...                    'Feature2': [1 + i / 10 for i in range(20)],
    ...                    'Target': [i % 2 for i in range(20)]})
    >>> x_train, y_train, x_validation, y_validation, x_test, y_test = split_dataframe(
    ...     df, target_column='Target', stratify='Target')
    >>> print(x_train.shape, y_train.shape)
    >>> print(x_validation.shape, y_validation.shape)
    >>> print(x_test.shape, y_test.shape)
    """

    # Labels used for stratification, or None for a plain random split.
    stratify_labels = data[stratify].values if stratify is not None else None

    # The target must exist (KeyError otherwise); the identifier columns are
    # optional, so frames without 'ID'/'Name' are accepted as well.
    feature_frame = data.drop(columns=[target_column]).drop(columns=['ID', 'Name'], errors='ignore')

    # Encode the rows as a dense numeric matrix.
    vec = DictVectorizer(sparse=False)
    data_features = vec.fit_transform(feature_frame.to_dict(orient="records"))
    target_values = data[target_column].values

    # First split: carve off the test set.
    if stratify_labels is None:
        train_validation_features, x_test, train_validation_target, y_test = train_test_split(
            data_features, target_values, test_size=test_size, random_state=random_state
        )
        train_validation_labels = None
    else:
        # The labels are split together with the data so the second split can be
        # stratified on labels that match the remaining rows. Reusing the
        # full-length labels there would raise a length-mismatch ValueError.
        (train_validation_features, x_test,
         train_validation_target, y_test,
         train_validation_labels, _) = train_test_split(
            data_features, target_values, stratify_labels,
            test_size=test_size, random_state=random_state, stratify=stratify_labels
        )

    if validation_size == 0:
        # No validation split requested: everything that is not test is train.
        return train_validation_features, train_validation_target, None, None, x_test, y_test

    # validation_size is a fraction of the whole dataset; rescale it to a
    # fraction of what is left once the test set has been removed.
    adjusted_validation_size = validation_size / (1 - test_size)

    # Second split: divide the remainder into train and validation.
    x_train, x_validation, y_train, y_validation = train_test_split(
        train_validation_features, train_validation_target,
        test_size=adjusted_validation_size, random_state=random_state, stratify=train_validation_labels
    )

    return x_train, y_train, x_validation, y_validation, x_test, y_test

In [53]:
# Build the model matrices with 'Price' as the target, using the function's
# default split proportions and no stratification.
(
    x_train, y_train,
    x_validation, y_validation,
    x_test, y_test,
) = split_dataframe(df, target_column='Price')

# Model

In [54]:
import mlflow

In [55]:
# Record runs in a local SQLite database (mlflow.db, given as a relative path)
# and group them under the "fashion-rental-prediction" experiment.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("fashion-rental-prediction")

<Experiment: artifact_location='file:///c:/Users/Rattapon.San/Fashion-rental-prediction-with-mlops/notebook/mlruns/1', creation_time=1721804795289, experiment_id='1', last_update_time=1721804795289, lifecycle_stage='active', name='fashion-rental-prediction', tags={}>

In [56]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [57]:
from sklearn.feature_extraction import DictVectorizer

In [58]:
from sklearn.linear_model import Lasso, BayesianRidge, SGDRegressor

In [59]:
from tqdm import tqdm

In [60]:
# Candidate linear regressors for the price model.
lasso = Lasso(alpha=0.1)
bayesian = BayesianRidge()
# SGD shuffles the training data between epochs; pin the seed (the same value
# the data split uses) so repeated runs give the same model.
sgd = SGDRegressor(random_state=42)

In [61]:
# # enable autologging
# for model in tqdm((lasso, bayesian, sgd), desc = 'Training with sklearn model'):
#     mlflow.sklearn.autolog()
#     with mlflow.start_run() as run:
#         model.fit(x_train, y_train)


In [65]:
from hpsklearn import HyperoptEstimator, any_regressor
from hyperopt import tpe


In [68]:
# Hyperparameter search over any regressor offered by hyperopt-sklearn, with
# trial_timeout=120 passed as the limit for a single trial.
# NOTE(review): the recorded fit in this notebook ran one trial for ~130 s and
# ended in AllTrialsFailed -- presumably the trial exceeded this timeout; confirm,
# then consider a larger timeout or a smaller feature matrix.
estim = HyperoptEstimator(regressor=any_regressor(name='test'), trial_timeout=120)

In [None]:
## TODO: redo

In [69]:
# Run the hyperparameter search on the training split.
estim.fit(x_train, y_train)

100%|██████████| 1/1 [02:10<00:00, 130.57s/trial, best loss=?]


AllTrialsFailed: 

In [None]:
# Show the best model found by the search.
# NOTE(review): the recorded fit ended in AllTrialsFailed, so there may be no
# model to report -- confirm after re-running the search.
print(estim.best_model())