Author: Ahmed Sobhi

Creation_date: 16th June 2023

Objective: Data Modeling.

## Importing Required Libraries and Packages

In [218]:
import warnings
warnings.filterwarnings('ignore')

import os

import pandas as pd
import numpy as np
# Used for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For iteration visualization purposes
from tqdm import tqdm

from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# To save the model locally
import joblib

## Required Functions

In [200]:
class SkewnessTransformer(BaseEstimator, TransformerMixin):
    """
        Scikit-learn style transformer that reduces the skewness of numeric features.

        During ``fit`` every numeric column whose absolute skewness exceeds
        ``skew_limit`` is inspected, and the transformation that brings its
        skewness closest to zero is recorded for it. ``transform`` then applies
        the recorded transformation to each of those columns and leaves every
        other column untouched.

        Base classes:
            - BaseEstimator provides get_params() / set_params().
            - TransformerMixin provides fit_transform().

        Parameters:
            - skew_limit (default: 0.8):
                - Absolute skewness above which a feature is transformed.

        Attributes set by fit():
            - method_dict:
                - Maps a method name ('log', 'sqrt', 'boxcox', 'yeojohnson', 'cube')
                  to the list of feature names that use it.
            - lambda_dict:
                - Maps 'boxcox' / 'yeojohnson' to a ``{feature name: lambda}`` dict,
                  i.e. one fitted lambda per feature.

        Notes:
            - Input may be a pandas DataFrame or a NumPy array. An array is handled
              through a temporary DataFrame whose column labels are the integer
              column positions, and ``transform`` returns an array in that case.
            - ``transform`` does not validate new data against the domain of the
              chosen method (e.g. a negative value in a 'sqrt' column yields NaN).
    """

    def __init__(self, skew_limit=0.8):
        self.skew_limit = skew_limit
        self.method_dict = {}
        self.lambda_dict = {}

    def fit(self, X, y=None):
        """
            Choose a transformation per skewed feature and fit the power-transform lambdas.

            Input:
              X: pd.DataFrame or np.ndarray, training features.
              y: ignored, present for scikit-learn API compatibility.

            Output:
              self
        """
        X = self._to_frame(X)
        self.method_dict = self.extracrt_recommeneded_features(X)

        # Start from a clean state so a re-fit never reuses stale lambdas.
        self.lambda_dict = {}

        lambda_estimators = {'boxcox': stats.boxcox, 'yeojohnson': stats.yeojohnson}

        for method, features in self.method_dict.items():
            if method not in lambda_estimators:
                continue

            # One lambda per feature: columns have different distributions, so a
            # single lambda estimated on their pooled values fits none of them.
            fitted = {}
            for index in self._get_feature_indices(X, features):
                values = X.iloc[:, index].dropna().to_numpy(dtype=float)
                _, fitted[X.columns[index]] = lambda_estimators[method](values)
            self.lambda_dict[method] = fitted

        return self

    def transform(self, X):
        """
            Apply the transformations selected in fit().

            Input:
              X: pd.DataFrame or np.ndarray with the columns seen in fit().
                 Selected features that are absent from X are skipped.

            Output:
              A transformed copy of X: DataFrame for DataFrame input,
              np.ndarray for np.ndarray input.
        """
        return_array = isinstance(X, np.ndarray)
        X_transformed = self._to_frame(X).copy()

        # Transformations that need no fitted parameter.
        elementwise = {'log': np.log1p, 'sqrt': np.sqrt, 'cube': np.cbrt}

        for method, features in self.method_dict.items():
            for index in self._get_feature_indices(X_transformed, features):
                name = X_transformed.columns[index]
                values = X_transformed.iloc[:, index].to_numpy(dtype=float)

                if method in elementwise:
                    result = elementwise[method](values)
                elif method == 'boxcox':
                    result = stats.boxcox(values, lmbda=self.lambda_dict[method][name])
                elif method == 'yeojohnson':
                    result = stats.yeojohnson(values, lmbda=self.lambda_dict[method][name])
                else:
                    # Unknown method name: leave the column as it is.
                    continue

                # Replace the whole column so integer columns can become float.
                X_transformed[name] = result

        return X_transformed.to_numpy() if return_array else X_transformed

    def extracrt_recommeneded_features(self, X):
        """
            Group the highly skewed features of X by their recommended transformation.

            Input:
              X: pd.DataFrame or np.ndarray of features.

            Output:
              Dict mapping method name to the list of feature names that should use it.
              Only numeric features with |skewness| > skew_limit are included.
        """
        X = self._to_frame(X)

        # Compute the skewness once; non-numeric columns are ignored.
        skewness = X.skew(numeric_only=True)
        skew_col_lst = skewness[skewness.abs() > self.skew_limit].index.tolist()

        # Method is the key, and the values are the corresponding features.
        method_dict = {}
        for feature in skew_col_lst:
            method = self.recommend_skewness_reduction_method(X[feature])
            method_dict.setdefault(method, []).append(feature)

        return method_dict

    def recommend_skewness_reduction_method(self, feature: pd.Series) -> str:
        """
            Returns a recommended skewness fix method for input feature.

            Input:
              feature: pd.Series, the feature values we want a transformation for.
                       Missing values are ignored.

            Output:
              String, one of 'log', 'sqrt', 'boxcox', 'yeojohnson', 'cube': the
              candidate whose transformed data has the smallest absolute skewness.

            Raises:
              ValueError: if the feature has no non-missing values.
        """
        values = pd.Series(feature).dropna().to_numpy(dtype=float)
        if values.size == 0:
            raise ValueError("Cannot recommend a transformation for a feature without values.")

        # Create a dict to store the skewness obtained by every candidate method
        skewness_dict = {}

        # log1p is only defined for values greater than -1
        if (values > -1).all():
            skewness_dict['log'] = stats.skew(np.log1p(values))

        # Square root is only defined for non-negative values
        if (values >= 0).all():
            skewness_dict['sqrt'] = stats.skew(np.sqrt(values))

        if (values <= 0).any():
            # Box-Cox needs strictly positive data, so fall back to Yeo-Johnson
            transformed_yeojohnson, _ = stats.yeojohnson(values)
            skewness_dict['yeojohnson'] = stats.skew(transformed_yeojohnson)
        else:
            # The key must match the name used by fit() and transform()
            transformed_boxcox, _ = stats.boxcox(values)
            skewness_dict['boxcox'] = stats.skew(transformed_boxcox)

        # Cube root is defined for every real value
        skewness_dict['cube'] = stats.skew(np.cbrt(values))

        # A NaN/inf skewness cannot be ranked by min(), so discard such candidates
        finite_dict = {
            method: skewness
            for method, skewness in skewness_dict.items()
            if np.isfinite(skewness)
        }
        if not finite_dict:
            return 'cube'

        # Extract the method whose remaining skewness is the smallest
        return min(finite_dict, key=lambda method: abs(finite_dict[method]))

    @staticmethod
    def _to_frame(X):
        # Helper: give DataFrame and ndarray input the same DataFrame interface.
        # An ndarray gets integer column labels equal to the column positions.
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, np.ndarray):
            return pd.DataFrame(X)
        raise ValueError("Unsupported input type. Expected Pandas DataFrame or NumPy array.")

    def _get_feature_indices(self, X, feature_names):
        # Helper method to retrieve the positional indices of the specified features.
        # Names that are not present in X are skipped.
        if isinstance(X, pd.DataFrame):
            return [X.columns.get_loc(feature_name) for feature_name in feature_names if feature_name in X.columns]
        elif isinstance(X, np.ndarray):
            # An array has no names: features are identified by integer column position.
            n_columns = X.shape[1] if X.ndim > 1 else 1
            return [
                int(feature_name)
                for feature_name in feature_names
                if isinstance(feature_name, (int, np.integer)) and 0 <= feature_name < n_columns
            ]
        else:
            raise ValueError("Unsupported input type. Expected Pandas DataFrame or NumPy array.")

In [201]:
# Read the processed orders dataset, then convert `priced_at` to datetime
df = pd.read_csv("../data/intermid/DS-task-data-processed.csv")
df = df.assign(priced_at=pd.to_datetime(df['priced_at']))

# Read the extra_features dataset
df_extrafeatures = pd.read_csv('../data/intermid/features_ohe.csv')

# Preview the first row
df.head(1)

Unnamed: 0,id,make,model,model_year,kilometers,transmission_type,price,priced_at,mileage_category,extra_features_count,model_age,price_moving_avg,competitor_price_diff,popularity,price_change_pct,dt_year,dt_month,dt_day,dt_quarter
0,17786,Nissan,Juke,2008,200000.0,Automatic,115000,2022-02-11,200k+,26,14,115000,-500.0,0.31,bfill,2022,2,11,1


In [202]:
# Summary statistics, transposed so that every described column is one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,18560.0,10025.87,5797.828096,0.0,5017.75,10021.5,15013.25,20097.0
model_year,18560.0,2016.373,4.330355,1999.0,2014.0,2017.0,2020.0,2023.0
kilometers,18560.0,94310.14,59968.258248,0.0,41675.0,90000.0,139999.0,285000.0
price,18560.0,274881.2,128976.204012,1000.0,181000.0,248000.0,338000.0,1384000.0
extra_features_count,18560.0,12.45323,7.795204,1.0,6.0,9.0,18.0,39.0
model_age,18560.0,5.825431,4.277615,-1.0,3.0,5.0,8.0,24.0
price_moving_avg,18560.0,274529.9,124790.002278,67800.0,183000.0,247400.0,335600.0,1240800.0
competitor_price_diff,18560.0,1.204296e-12,67340.7765,-425791.044776,-42011.827957,-9312.807882,39232.583065,728687.2
popularity,18560.0,3.698048,1.949262,0.0,1.1,5.0,5.0,5.0
dt_year,18560.0,2022.198,0.398712,2022.0,2022.0,2022.0,2022.0,2023.0


In [203]:
# Left-join the extra_features columns onto the orders by `id`
# (the list/count columns are dropped from the right side before joining)
df_set = pd.merge(
    df,
    df_extrafeatures.drop(columns=['extra_features_lst', 'extra_features_count']),
    on='id',
    how='left',
)

# Remove unnecessary columns
df_set = df_set.drop(columns=['id', 'make', 'kilometers'])

df_set.head()

Unnamed: 0,model,model_year,transmission_type,price,priced_at,mileage_category,extra_features_count,model_age,price_moving_avg,competitor_price_diff,...,sensors,sideairbag,startengine,steptronic,sunroof,tintedglass,touchactivateddoorlock,touchscreen,tractioncontrol,usb
0,Juke,2008,Automatic,115000,2022-02-11,200k+,26,14,115000,-500.0,...,1,0,0,0,1,0,0,1,0,1
1,Juke,2008,Automatic,116000,2022-04-21,200k+,26,14,115000,500.0,...,1,0,0,0,1,0,0,1,0,1
2,Juke,2009,Automatic,239000,2022-06-01,0-50k,26,13,115500,0.0,...,1,0,0,0,1,0,0,1,0,1
3,Juke,2010,Automatic,195000,2022-02-07,100k-150k,26,12,156666,0.0,...,1,0,0,0,1,0,0,1,0,1
4,Juke,2011,Automatic,248000,2022-06-29,0-50k,7,11,166250,0.0,...,0,0,0,0,0,0,0,0,0,0


## Train validation split

In [204]:
target_col = 'price'

# Target series
df_y = df_set[target_col]

# Everything except the target is a feature
df_features = df_set.drop(columns=target_col)

# Hold out 20% for validation, stratified on the `model` column
x_train, x_val, y_train, y_val = train_test_split(
    df_features, df_y,
    stratify=df_features['model'],
    test_size=0.2,
    random_state=42,
)

## Pipeline Steps

In [101]:
# # Extract numerical|Categorical features
# df_uniques = (
#     pd.DataFrame(
#         [[i, len(df_extrafeatures[i].unique())] for i in df_extrafeatures.columns],
#           columns=['Variable', 'Unique Values']).set_index('Variable')
# )
# ohe_ecnoded_features = list(df_uniques[df_uniques['Unique Values'] == 2].index)

# numeric_features = list(set(df_features.select_dtypes('number').columns.tolist()) - set(ohe_ecnoded_features))

# categorical_features = df_features.select_dtypes('object').columns.tolist()

# df_features[numeric_features].replace([np.inf, -np.inf], np.nan, inplace=True)

# imputer_numeric = SimpleImputer(strategy='mean')
# imputer_categorical = SimpleImputer(strategy='most_frequent')

# # Create the pipeline for skewness_transformer
# skewness_transformer = Pipeline([
#     ('skewness_transformer', SkewnessTransformer()),
# ])

# numeric_transformer = Pipeline(
#     [
#         #('impute', imputer_numeric),
#         ('skewnees_remove', skewness_transformer),
#         ('scaler', StandardScaler())
#     ]
# )

# ohe_encoded_transformer = Pipeline(
#     [
#         ('impute', imputer_numeric)
#     ]
# )

# categorical_transformer = Pipeline(
#     [
#         ('impute', imputer_categorical),
#         ('ohc', OneHotEncoder())
#     ]
# )

# preprocessor = ColumnTransformer(
#     [
#         ('numericals', numeric_transformer, numeric_features),
#         ('ohe_numericals', ohe_encoded_transformer, ohe_ecnoded_features),
#         ('Categorical', categorical_transformer, categorical_features)
#     ],
#     remainder='drop',
#     n_jobs=-1
# )

In [205]:
# Split the feature columns by dtype: object columns vs. numeric columns
categorical_features = list(df_features.select_dtypes(include='object').columns)
numeric_features = list(df_features.select_dtypes(include='number').columns)

# Missing values: mean for numeric columns, most frequent value for categorical ones
imputer_numeric = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='most_frequent')

# Pipeline wrapping the skewness transformer
# (defined here, but not referenced by the preprocessor below)
skewness_transformer = Pipeline(
    steps=[('skewness_transformer', SkewnessTransformer())]
)

# Numeric columns: impute, then standardize
numeric_transformer = Pipeline(
    steps=[
        ('impute', imputer_numeric),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: impute, then one-hot encode, ignoring unseen categories
categorical_transformer = Pipeline(
    steps=[
        ('impute', imputer_categorical),
        ('ohc', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its transformer; any other column is dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('numericals', numeric_transformer, numeric_features),
        ('Categorical', categorical_transformer, categorical_features),
    ],
    remainder='drop',
    n_jobs=-1,
)

In [206]:
# # Testing
# Smoke test: fit the preprocessor on all features, then transform the same data
preprocessor.fit(df_features).transform(df_features)

<18560x11891 sparse matrix of type '<class 'numpy.float64'>'
	with 1336320 stored elements in Compressed Sparse Row format>

## Model 

In [207]:
initial_lr = 0.05

# Gradient-boosted regressor trained with a squared-error objective
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=initial_lr,
    max_depth=6,
    min_child_weight=1,
    subsample=0.99,
    colsample_bytree=0.5,
    n_jobs=-1,
    verbosity=1,
)

# Full pipeline: column preprocessing followed by the regressor
model_pipeline = Pipeline(
    [
        ('col_trans', preprocessor),
        ('model', model),
    ]
)

In [208]:
# Fit the preprocessing steps and the regressor on the training split
model_pipeline.fit(X=x_train, y=y_train)

In [209]:
# Predict on both splits and round the predictions to whole numbers
pred_train = np.round(model_pipeline.predict(x_train))

pred_val = np.round(model_pipeline.predict(x_val))

In [210]:
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases,
# while this form gives the same value on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))

rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))

print(f"RMSE:\n\tTrain Set = {rmse_train}\n\tVal Set = {rmse_val}")

RMSE:
	Train Set = 2162.016706088487
	Val Set = 8828.114476420338


In [219]:
# Save the model
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check.
os.makedirs('../src/model/', exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + regressor) in one file
joblib.dump(model_pipeline, '../src/model/xgboost_model.pkl')

['../src/model/xgboost_model.pkl']

In [220]:
# Load the saved model pipeline back from disk
model_load = joblib.load('../src/model/xgboost_model.pkl')

## Feature Importance

In [221]:
# Number of most important features to draw. Drawing one bar per encoded column
# is impractical when the one-hot encoding produces thousands of columns.
top_n = 30

# Get features importances from the fitted regressor
features_importances = model_load.named_steps['model'].feature_importances_

# Get the ColumnTransformer step
column_transformer = model_load.named_steps['col_trans']

# Get the numerical feature names (columns routed to the first transformer)
numerical_features = column_transformer.transformers_[0][2]

# Get the one-hot encoded feature names
one_hot_encoder = column_transformer.named_transformers_['Categorical'].named_steps['ohc']
categorical_features = one_hot_encoder.get_feature_names_out(column_transformer.transformers_[1][2])

# Feature names in the order the ColumnTransformer emits them:
# numerical columns first, then the one-hot encoded columns
feature_names = np.concatenate((numerical_features, categorical_features))
importances = features_importances

# zip() would silently truncate and pair names with the wrong importances
if len(feature_names) != len(importances):
    raise ValueError(
        f"Got {len(feature_names)} feature names for {len(importances)} importances."
    )

# Sort ascending by importance; the full sorted tuples stay available afterwards
importances, feature_names = zip(*sorted(zip(importances, feature_names)))

# Plot only the `top_n` largest importances (the tail of the ascending order),
# so that the most important feature ends up at the top of the chart
plt.figure(figsize=(15, 15))
plt.barh(feature_names[-top_n:], importances[-top_n:])
plt.ylabel('Features')
plt.xlabel('Importance')
plt.title(f'Top {top_n} Feature Importances')
plt.xticks(rotation=90)
# plt.savefig('../report/plots/feature_importance_v0.png', bbox_inches='tight')
plt.show()

KeyboardInterrupt: 

In [None]:
# Tabulate every feature with its importance, most important first
df_features_import = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)
df_features_import

In [None]:
# Export the importance table without the index column
df_features_import.to_csv('../data/output/feature_importance.csv', index=False)