In [1]:
#loading the required packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import plotnine as p9 
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import FunctionTransformer 
from sklearn.preprocessing import StandardScaler
from scipy.stats import mstats
from sklearn.model_selection import GridSearchCV


In [2]:
#Fetching the data
raw_data = pd.read_csv('./data/tesco-dataset/train.csv')
print('The shape of the dataset :' + str(raw_data.shape))
raw_data.head()

The shape of the dataset :(320, 16)


Unnamed: 0,location_id,crime_rate,proportion_flats,proportion_nonretail,new_store,commercial_property,household_size,proportion_newbuilds,public_transport_dist,transport_availability,property_value,school_proximity,competitor_density,household_affluency,normalised_sales,county
0,464,17.600541,0.0,18.1,no,,2.926,29.0,2.9084,All transport options,666,20.2,368.74,4.5325,-0.399933,c_40
1,504,0.603556,20.0,3.97,no,14.85,4.52,10.6,2.1398,Average transport options,264,13.0,388.37,1.815,2.216308,c_80
2,295,0.60681,0.0,6.2,no,7.7,2.981,31.9,3.6715,Many transport options,307,17.4,378.35,2.9125,0.16692,c_53
3,187,0.012385,55.0,2.25,no,1.95,3.453,68.1,7.3073,No transport options,300,15.3,394.72,2.0575,-0.083804,c_65
4,193,0.016182,100.0,1.32,no,3.05,3.816,59.5,8.3248,Average transport options,256,15.1,392.9,0.9875,0.962693,c_97


In [3]:
# train_test split
X=raw_data.drop("normalised_sales", axis=1)
y=raw_data[["normalised_sales"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Ordinal transformation
order=[['No transport options','Few transport options','Average transport options','Many transport options','All transport options'        
        ]]

In [5]:

# custom transformer

class CountySalesEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, high_threshold=0.66, low_threshold=0.33):
        
        self.high_threshold = high_threshold
        self.low_threshold = low_threshold

    def fit(self, X, y=None):
        
        # Calculate and store the average sales for each county
        if y is None:
            raise ValueError("y cannot be None. Please provide the sales data.")
        
        # Ensure X and y have the same length
        if len(X) != len(y):
            raise ValueError("The length of X and y must be the same.")
       
        
        self.county_sales_averages = y.squeeze().groupby(X['county']).mean()
        return self

    def transform(self, X, y=None):
        # Check if fit has been called
        if not hasattr(self, "county_sales_averages"):
            raise AttributeError("fit has not been called. Please call fit before transform.")
        
        
        X['average_sales'] = X['county'].map(self.county_sales_averages)

        
        categories = pd.cut(X['average_sales'],
                            bins=[-float('inf'), self.low_threshold, self.high_threshold, float('inf')],
                            labels=[1.0, 2.0, 3.0])
        X.drop("average_sales", axis=1,inplace=True)

        return pd.DataFrame(categories, index=X.index)
    
    
    def get_feature_names_out(self, input_features=None):
        return np.array(['county'])

In [6]:
def winsorize_dataframe(df, limits=[0.05, 0.05]):
    """
    Apply Winsorization to each column of a pandas DataFrame or Series.
    :param df: Input DataFrame or Series.
    :param limits: Tuple of lower and upper percentiles to clip the data.
    :return: Winsorized DataFrame or Series.
    """
    for col in df.columns:
        df[col] = winsorize(df[col], limits=[0.05, 0.05])
    return(df)


In [7]:


numerical_columns=X_train.select_dtypes(include=['int64', 'float64']).columns.to_list()
numerical_columns.remove('location_id')

categorical_columns= ["new_store", "transport_availability", "county"]

# Handling missing values

num_pipeline = make_pipeline(
SimpleImputer(strategy="median")
)

ordinal_pipeline= make_pipeline(
    OrdinalEncoder(categories=order)
                                )
onehot_pipeline= make_pipeline(
    OneHotEncoder(drop='if_binary')
                                )
log_pipeline = make_pipeline(  
                             FunctionTransformer(np.log, feature_names_out="one-to-one"), 

                             )
winsorize_pipeline = make_pipeline(
    FunctionTransformer(winsorize_dataframe, kw_args={'limits': [0.05, 0.05]},feature_names_out="one-to-one"),
)

county_encoder=CountySalesEncoder()
county_pipeline= make_pipeline(    
    CountySalesEncoder()
)



preprocessing = make_column_transformer(
(num_pipeline, numerical_columns),
(ordinal_pipeline,["transport_availability"]),
(log_pipeline, ['proportion_nonretail','crime_rate','property_value','school_proximity','competitor_density']),
(winsorize_pipeline,['proportion_nonretail','crime_rate','property_value','school_proximity','competitor_density']),
(onehot_pipeline,['new_store']),
(county_pipeline,['county']),
remainder='drop'
)

In [8]:
X_prep=preprocessing.fit_transform(X,y)
preprocessing.get_feature_names_out()
X_prep=pd.DataFrame(X_prep,columns=preprocessing.get_feature_names_out())

In [9]:
forest_reg = make_pipeline(RandomForestRegressor(random_state=42))
y_np = y.to_numpy().reshape(-1)
forest_rmses = -cross_val_score(forest_reg, X=X_prep, y=y_np, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()

count    10.000000
mean      0.353331
std       0.116610
min       0.248787
25%       0.280293
50%       0.312058
75%       0.375536
max       0.628059
dtype: float64

In [78]:
y.describe()

Unnamed: 0,normalised_sales
count,320.0
mean,-0.016967
std,0.978561
min,-1.936974
25%,-0.58525
50%,-0.143759
75%,0.243227
max,2.968477


## Grid search CV on complete training data

In [82]:
param_grid = [
{ 'randomforestregressor__max_features': [4, 6, 8, 10],
 'randomforestregressor__n_estimators':[1500,2000],
   'randomforestregressor__max_depth':[30,35,40]
 }
]
y = y.to_numpy().reshape(-1)

grid_search = GridSearchCV(forest_reg, param_grid, cv=3,scoring='neg_root_mean_squared_error') 
grid_search.fit(X_prep, y)

In [83]:
best_params = grid_search.best_params_
display("Best parameters:", best_params)



'Best parameters:'

{'randomforestregressor__max_depth': 30,
 'randomforestregressor__max_features': 8,
 'randomforestregressor__n_estimators': 1500}

In [84]:
best_model = grid_search.best_estimator_

## Running the best model

In [85]:

forest_rmses = -cross_val_score(best_model, X=X_train_prep, y=y_train, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()

count    10.000000
mean      0.364265
std       0.058755
min       0.290522
25%       0.318481
50%       0.364473
75%       0.392582
max       0.484009
dtype: float64

## Running the best model on test data

In [90]:
#Fetching the data
X_test = pd.read_csv('./data/tesco-dataset/test.csv')
print('The shape of the dataset :' + str(raw_data.shape))
X_test.head()

The shape of the dataset :(320, 16)


Unnamed: 0,location_id,crime_rate,proportion_flats,proportion_nonretail,new_store,commercial_property,household_size,proportion_newbuilds,public_transport_dist,transport_availability,property_value,school_proximity,competitor_density,household_affluency,county
0,105,0.039968,34.0,6.09,no,4.15,3.59,59.6,5.4917,Many transport options,329,16.1,395.75,2.375,c_42
1,400,0.587758,20.0,3.97,no,14.85,5.398,8.5,2.2885,Average transport options,264,13.0,386.86,1.4775,c_140
2,338,1.116926,0.0,8.14,no,9.4,2.813,0.0,4.0952,Few transport options,307,,394.54,4.97,c_55
3,227,1.517409,0.0,19.58,no,12.75,3.066,0.0,1.7573,Average transport options,403,14.7,353.89,1.6075,c_62
4,114,83.093533,0.0,18.1,no,16.45,2.957,0.0,1.8026,All transport options,666,20.2,16.45,5.155,c_22


In [101]:
final=preprocessing.transform(X_test)
preprocessing.get_feature_names_out()
final=pd.DataFrame(final,columns=preprocessing.get_feature_names_out())

In [110]:
X_test['predicted_sales']=best_model.predict(final)

In [111]:
(X_test
 .groupby("county")
 .aggregate({"predicted_sales":"mean"})
 .sort_values(by="predicted_sales", ascending=False)
 )

Unnamed: 0_level_0,predicted_sales
county,Unnamed: 1_level_1
c_122,2.286133
c_107,1.775777
c_140,1.469364
c_69,0.754891
c_58,0.255007
c_42,0.179136
c_62,0.129355
c_63,-0.100079
c_56,-0.171847
c_55,-0.729833
