In [151]:
#loading the required packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import plotnine as p9 
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer,make_column_selector


In [152]:
#Fetching the data
raw_data = pd.read_csv('./data/tesco-dataset/train.csv')
print('The shape of the dataset :' + str(raw_data.shape))
raw_data.head()

The shape of the dataset :(320, 16)


Unnamed: 0,location_id,crime_rate,proportion_flats,proportion_nonretail,new_store,commercial_property,household_size,proportion_newbuilds,public_transport_dist,transport_availability,property_value,school_proximity,competitor_density,household_affluency,normalised_sales,county
0,464,17.600541,0.0,18.1,no,,2.926,29.0,2.9084,All transport options,666,20.2,368.74,4.5325,-0.399933,c_40
1,504,0.603556,20.0,3.97,no,14.85,4.52,10.6,2.1398,Average transport options,264,13.0,388.37,1.815,2.216308,c_80
2,295,0.60681,0.0,6.2,no,7.7,2.981,31.9,3.6715,Many transport options,307,17.4,378.35,2.9125,0.16692,c_53
3,187,0.012385,55.0,2.25,no,1.95,3.453,68.1,7.3073,No transport options,300,15.3,394.72,2.0575,-0.083804,c_65
4,193,0.016182,100.0,1.32,no,3.05,3.816,59.5,8.3248,Average transport options,256,15.1,392.9,0.9875,0.962693,c_97


# Plan for modelling (MVP):
* Usuals - > Train_test split, target variable split, 
1. Missing values for school_proximity and commercial_property - Start with median imputation and check the model accuracy
2. Categorical encoding
    1. Binary encoding for new_store feature
    2. Ordinal encoding for transport_availability feature
    3. Group counties to high, medium, low sales
3. Outlier handling if requred - not for MVP
4. Feature selection
5. Model development 

In [153]:
# train_test split
X=raw_data.drop("normalised_sales", axis=1)
y=raw_data[["normalised_sales"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [154]:
# Ordinal transformation
order=[['No transport options','Few transport options','Average transport options','Many transport options','All transport options'        
        ]]

In [172]:

# custom transformer

class CountySalesEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, high_threshold=0.66, low_threshold=0.33):
        # Thresholds to define 'high', 'medium', 'low' sales categories
        self.high_threshold = high_threshold
        self.low_threshold = low_threshold

    def fit(self, X, y=None):
        # Assuming 'X' is a DataFrame with a 'county' column and 'y' contains sales data
        # Calculate and store the average sales for each county
        if y is None:
            raise ValueError("y cannot be None. Please provide the sales data.")
        
        # Ensure X and y have the same length
        if len(X) != len(y):
            raise ValueError("The length of X and y must be the same.")
       
        # Calculate the average sales per county
        self.county_sales_averages = y.squeeze().groupby(X['county']).mean()
        return self

    def transform(self, X, y=None):
        # Check if fit has been called
        if not hasattr(self, "county_sales_averages"):
            raise AttributeError("fit has not been called. Please call fit before transform.")
        
        # Map each county to its average sales
        X['average_sales'] = X['county'].map(self.county_sales_averages)

        # Categorize based on the thresholds
        categories = pd.cut(X['average_sales'],
                            bins=[-float('inf'), self.low_threshold, self.high_threshold, float('inf')],
                            labels=['low', 'medium', 'high'])
        X.drop("average_sales", axis=1,inplace=True)

        return pd.DataFrame(categories, index=X.index)
    
    
    def get_feature_names_out(self, input_features=None):
        return np.array(['county'])

X_train['average_sales'] = X_train['county'].map(county_sales_averages)

In [167]:
county_encoder=CountySalesEncoder()
county_encoder.fit_transform(X_train,y_train)

Unnamed: 0,average_sales
132,low
317,low
234,low
312,low
232,high
...,...
188,low
71,low
106,medium
270,low


In [174]:


numerical_columns=X_train.select_dtypes(include=['int64', 'float64']).columns.to_list()
categorical_columns= ["new_store", "transport_availability", "county"]

# Handling missing values

num_pipeline = make_pipeline(
SimpleImputer(strategy="median")
)

ordinal_pipeline= make_pipeline(
    OrdinalEncoder(categories=order)
                                )
onehot_pipeline= make_pipeline(
    OneHotEncoder(drop='if_binary')
                                )

county_encoder=CountySalesEncoder()
county_pipeline= make_pipeline(    
    CountySalesEncoder()
)



preprocessing = make_column_transformer(
(num_pipeline, numerical_columns),
(ordinal_pipeline,["transport_availability"]),
(onehot_pipeline,["new_store"]),
(county_pipeline,["county"]),
remainder='passthrough'
)

In [175]:
X_train

Unnamed: 0,location_id,crime_rate,proportion_flats,proportion_nonretail,new_store,commercial_property,household_size,proportion_newbuilds,public_transport_dist,transport_availability,property_value,school_proximity,competitor_density,household_affluency,county
132,211,0.367736,0.0,21.89,no,13.70,3.431,1.2,1.8125,Few transport options,437,,396.90,3.8475,c_62
317,473,0.907062,0.0,8.14,no,,2.456,63.4,3.7965,Few transport options,307,21.0,288.99,2.9225,c_36
234,40,1.414523,0.0,8.14,no,9.40,2.570,1.9,3.7979,Few transport options,307,21.0,376.57,5.2550,c_41
312,361,10.552946,0.0,18.10,no,16.45,3.380,4.4,1.9682,All transport options,666,20.2,60.72,6.0200,c_50
232,238,0.431818,0.0,6.20,no,7.70,5.040,13.5,3.2157,Many transport options,307,17.4,387.38,0.7825,c_68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,89,0.183184,20.0,6.96,no,5.70,3.240,83.7,4.4290,No transport options,223,,396.90,1.6475,c_54
71,506,0.168732,25.0,5.13,no,5.15,2.741,33.8,7.2254,Many transport options,284,19.7,395.11,3.2875,c_51
106,463,0.144154,30.0,4.93,no,3.90,3.393,92.2,7.0355,Many transport options,300,16.6,374.71,1.2975,c_84
270,444,51.693093,0.0,18.10,no,,1.519,0.0,1.6582,All transport options,666,20.2,88.27,9.2450,c_25


In [176]:
X_train_prep=preprocessing.fit_transform(X_train,y_train)

In [177]:
preprocessing.get_feature_names_out()

array(['pipeline-1__location_id', 'pipeline-1__crime_rate',
       'pipeline-1__proportion_flats', 'pipeline-1__proportion_nonretail',
       'pipeline-1__commercial_property', 'pipeline-1__household_size',
       'pipeline-1__proportion_newbuilds',
       'pipeline-1__public_transport_dist', 'pipeline-1__property_value',
       'pipeline-1__school_proximity', 'pipeline-1__competitor_density',
       'pipeline-1__household_affluency',
       'pipeline-2__transport_availability', 'pipeline-3__new_store_yes',
       'pipeline-4__county'], dtype=object)

In [178]:
X_train_prep=pd.DataFrame(X_train_prep,columns=preprocessing.get_feature_names_out())

In [179]:
X_train_prep

Unnamed: 0,pipeline-1__location_id,pipeline-1__crime_rate,pipeline-1__proportion_flats,pipeline-1__proportion_nonretail,pipeline-1__commercial_property,pipeline-1__household_size,pipeline-1__proportion_newbuilds,pipeline-1__public_transport_dist,pipeline-1__property_value,pipeline-1__school_proximity,pipeline-1__competitor_density,pipeline-1__household_affluency,pipeline-2__transport_availability,pipeline-3__new_store_yes,pipeline-4__county
0,211.0,0.367736,0.0,21.89,13.7,3.431,1.2,1.8125,437.0,19.0,396.9,3.8475,1.0,0.0,low
1,473.0,0.907062,0.0,8.14,9.4,2.456,63.4,3.7965,307.0,21.0,288.99,2.9225,1.0,0.0,low
2,40.0,1.414523,0.0,8.14,9.4,2.57,1.9,3.7979,307.0,21.0,376.57,5.255,1.0,0.0,low
3,361.0,10.552946,0.0,18.1,16.45,3.38,4.4,1.9682,666.0,20.2,60.72,6.02,4.0,0.0,low
4,238.0,0.431818,0.0,6.2,7.7,5.04,13.5,3.2157,307.0,17.4,387.38,0.7825,3.0,0.0,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,89.0,0.183184,20.0,6.96,5.7,3.24,83.7,4.429,223.0,19.0,396.9,1.6475,0.0,0.0,low
252,506.0,0.168732,25.0,5.13,5.15,2.741,33.8,7.2254,284.0,19.7,395.11,3.2875,3.0,0.0,low
253,463.0,0.144154,30.0,4.93,3.9,3.393,92.2,7.0355,300.0,16.6,374.71,1.2975,3.0,0.0,medium
254,444.0,51.693093,0.0,18.1,9.4,1.519,0.0,1.6582,666.0,20.2,88.27,9.245,4.0,0.0,low


In [137]:
len(X_train_prep.columns)

16

In [138]:
X_train_prep.columns

Index(['pipeline-1__location_id', 'pipeline-1__crime_rate',
       'pipeline-1__proportion_flats', 'pipeline-1__proportion_nonretail',
       'pipeline-1__commercial_property', 'pipeline-1__household_size',
       'pipeline-1__proportion_newbuilds', 'pipeline-1__public_transport_dist',
       'pipeline-1__property_value', 'pipeline-1__school_proximity',
       'pipeline-1__competitor_density', 'pipeline-1__household_affluency',
       'pipeline-1__average_sales', 'pipeline-2__transport_availability',
       'pipeline-3__new_store_yes', 'remainder__county'],
      dtype='object')