# Preliminaries

In [446]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [447]:
# Set Matplotlib defaults.
# Matplotlib 3.6 renamed its bundled seaborn styles with a "seaborn-v0_8-"
# prefix and later releases dropped the old names, so use whichever spelling
# this installation ships (falling back to the default style).
_whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

Import the data

In [448]:
# Directory that holds the CSV files read below.
path = "../Datasets/"
# `path` already ends with a separator; trim it so the joined file paths do
# not contain a doubled "//".
_data_dir = path.rstrip("/")
Train = pd.read_csv(f"{_data_dir}/Historical-transaction-data.csv")
StoreInfo = pd.read_csv(f"{_data_dir}/Store-info.csv")
Submission_testing = pd.read_csv(f"{_data_dir}/Testing-data.csv")

Join the transaction history with the store information to form the training dataset

In [449]:
# Left join: every transaction row is kept and gets the StoreInfo columns of its shop_id.
Train = Train.merge(StoreInfo, on='shop_id', how='left')

In [450]:
# Discard the shop_profile column that comes with the testing file, then
# left-join the Train rows that share each listed shop_id.
Submission_testing = (
    Submission_testing
    .drop(columns="shop_profile")
    .merge(Train, on='shop_id', how='left')
)

In [451]:
# The merge with Train brought Train's shop_profile column along; remove it again.
Submission_testing.drop("shop_profile", axis=1, inplace=True)

In [452]:
# Train.head()

In [453]:
# Submission_testing.head()

In [454]:
print("New length: ", Train.shape[0])

New length:  473974


Remove Duplicates

In [455]:
i_length = Train.shape[0]

In [456]:
# Remove fully identical rows from Train, in place.
Train.drop_duplicates(inplace=True)

In [457]:

print("Duplicates dropped from Train: ", i_length - Train.shape[0] )
print("New length: ", Train.shape[0])

Duplicates dropped from Train:  31211
New length:  442763


# Analysis

Missing data

In [458]:
# plt.figure(figsize=(10,8))
# cols = Train.columns
# colours = ['#000099', '#ffff00'] # specify colours: yellow - missing. blue - not missing
# sns.heatmap(Train[cols].isna(), cmap=sns.color_palette(colours))

In [459]:
# # No. of columns missing for the rows
# missing_by_row = Train.isna().sum(axis="columns")
# missing_by_row.hist(bins=100)

In [460]:
pct_missing = Train.isna().mean()
pct_missing[:15]

item_description    0.079198
transaction_date    0.000000
invoice_id          0.013030
customer_id         0.000000
shop_id             0.000000
item_price          0.000000
quantity_sold       0.000000
shop_area_sq_ft     0.000000
shop_profile        0.183565
dtype: float64

In [461]:
# Remove rows with null values for item_description or shop_profile.
# shop_profile becomes the target (Train_y) further down, so rows without it
# cannot be used for fitting.
Train = Train.dropna(subset=['item_description', 'shop_profile'], axis=0)

In [462]:
# Train.info()

In [463]:
print("New length: ", Train.shape[0])

New length:  332581


Fill Missing Values

In [464]:
# Train.info()

Encoding

In [465]:
# Cast the nominal (unordered) features to the pandas categorical dtype.

# Nominal features present in Train
features_nom = ["item_description", "customer_id", "shop_id", "shop_profile"]

for feature in features_nom:
    Train[feature] = Train[feature].astype("category")
    # shop_profile was dropped from Submission_testing, so only the other
    # features are cast there.
    if feature != "shop_profile":
        Submission_testing[feature] = Submission_testing[feature].astype("category")

# Parse the transaction dates (Train only).
Train["transaction_date"] = pd.to_datetime(Train["transaction_date"])

In [466]:
# Split the target off the feature frame: pop() returns the shop_profile
# column and removes it from Train in one step.
Train_y = Train.pop("shop_profile")

In [467]:
# Train.info()

In [468]:
# Drop the invoice and customer identifier columns from both frames, in place.
for _frame in (Train, Submission_testing):
    _frame.drop(columns=["invoice_id", "customer_id"], inplace=True)

In [469]:
# # creating mask
# mask = np.triu(np.ones_like(Train.corr()))
 
# # plotting a triangle correlation heatmap
# dataplot = sns.heatmap(Train.corr(), cmap="YlGnBu", annot=True, mask=mask)
 
# # displaying heatmap
# plt.show()

In [470]:
# Check for outliers

Train.kurt(numeric_only=True)

item_price         382.227475
quantity_sold      420.507197
shop_area_sq_ft      2.908254
dtype: float64

In [471]:
Train["quantity_sold"].describe()

count    332581.000000
mean          1.874626
std           1.539118
min          -1.000000
25%           1.000000
50%           2.000000
75%           2.000000
max         101.000000
Name: quantity_sold, dtype: float64

In [472]:
# TODO : further investigate the outliers

In [473]:
Train["item_price"].describe()

count    332581.000000
mean        214.825306
std         167.916289
min          35.000000
25%         110.000000
50%         220.000000
75%         220.000000
max       17400.000000
Name: item_price, dtype: float64

In [474]:
# Derive calendar features from the transaction date, then drop the date itself.
_train_dates = pd.to_datetime(Train['transaction_date'], format='%Y-%m-%d')
Train['year'] = _train_dates.dt.year
Train['month'] = _train_dates.dt.month
Train['weekday'] = _train_dates.dt.weekday  # Monday=0 ... Sunday=6

Train.drop(columns='transaction_date', inplace=True)

Train.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,year,month,weekday
0,ORANGE BARLEY 1.5L,SHOP008,220,2,678,2021,12,5
1,GINGER BEER 1.5L,SHOP112,220,2,668,2021,10,6
2,TONIC PET 500ML,SHOP008,160,2,678,2021,12,0
3,CREAM SODA 1L,SHOP008,150,2,678,2021,12,0
4,STRAWBERRY MILK 180ML,SHOP112,210,5,668,2021,10,5


In [475]:
# Same calendar features for the submission frame.
_submission_dates = pd.to_datetime(Submission_testing['transaction_date'], format='%Y-%m-%d')
Submission_testing['year'] = _submission_dates.dt.year
Submission_testing['month'] = _submission_dates.dt.month
Submission_testing['weekday'] = _submission_dates.dt.weekday  # Monday=0 ... Sunday=6

Submission_testing.drop(columns='transaction_date', inplace=True)

Submission_testing.head()

Unnamed: 0,shop_id,item_description,item_price,quantity_sold,shop_area_sq_ft,year,month,weekday
0,SHOP046,FIT O MIXED FRUIT 1L,270,1,545,2021,10,4
1,SHOP046,FIT O ORANGE 1L,290,1,545,2021,10,2
2,SHOP046,LEMONADE 1.5L,220,2,545,2021,10,3
3,SHOP046,FIT O MANGO 200ML,180,4,545,2021,11,2
4,SHOP046,FIT O MIXED FRUIT 200ML,60,1,545,2021,11,2


In [476]:
from sklearn.preprocessing import LabelEncoder
# Learn the shop_profile -> integer code mapping, then encode the target.
# `le` is kept so predictions can be decoded back to profile names later.
le = LabelEncoder().fit(Train_y)
Train_y = le.transform(Train_y)

In [477]:
# Strip the "SHOP" prefix, read the remainder as an integer, and store the
# result as a categorical column.
_train_shop_numbers = Train["shop_id"].str.replace("SHOP", "").astype(int)
Train["shop_id"] = _train_shop_numbers.astype("category")

In [478]:
# Strip the "SHOP" prefix and keep the numeric part as an integer id in both
# the submission frame and the store lookup table.
for _frame in (Submission_testing, StoreInfo):
    _frame["shop_id"] = _frame["shop_id"].str.replace("SHOP", "").astype(int)

In [479]:
## Splitting the dataset into the Training set and Test set
# 80/20 random split over the individual rows of Train; random_state fixes the shuffle.
# NOTE(review): rows are split without regard to shop_id, and shop_id stays in
# the feature frame, so rows of one shop can land on both sides of the split.
# That looks like label leakage — confirm whether a per-shop split
# (e.g. GroupShuffleSplit on shop_id) is what is actually wanted.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Train, Train_y, test_size = 0.2, random_state = 1)

In [480]:
X_train.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,year,month,weekday
13201,SODA 1L,86,110,2,661,2021,10,5
293065,LEMONADE 500ML,37,200,2,717,2021,12,5
63548,LEMONADE 1.5L,66,220,3,470,2021,12,0
466484,GINGER BEER 1.5L,60,220,1,676,2021,10,2
316348,SODA 1L,112,220,3,668,2021,11,0


In [481]:
# Wrap the encoded label arrays in single-column DataFrames.
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [482]:
# X_train.info()

In [483]:
# Keep only Train's columns, in Train's order, so both frames share one layout.
Submission_testing = Submission_testing[Train.columns]

In [484]:
# X_train.head()

In [485]:
# unique values in each column
X_train.nunique()

item_description     37
shop_id             100
item_price          172
quantity_sold        59
shop_area_sq_ft      53
year                  1
month                 3
weekday               7
dtype: int64

In [486]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266064 entries, 13201 to 182378
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   item_description  266064 non-null  category
 1   shop_id           266064 non-null  category
 2   item_price        266064 non-null  int64   
 3   quantity_sold     266064 non-null  int64   
 4   shop_area_sq_ft   266064 non-null  int64   
 5   year              266064 non-null  int64   
 6   month             266064 non-null  int64   
 7   weekday           266064 non-null  int64   
dtypes: category(2), int64(6)
memory usage: 14.7 MB


In [487]:
categorical_columns = ["year", "month", "weekday"]

# Cast the calendar columns of Train to the categorical dtype in a single call.
# Only Train is changed; the frames already split off from it are not touched.
Train = Train.astype({column: "category" for column in categorical_columns})

# Metrics

In [488]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


def score_classification(model, df1_x, df1_y, df2_x, df2_y):
    """Fit ``model`` on one split and print its scores on another.

    The ``item_description`` column is one-hot encoded with an encoder fitted
    on ``df1_x`` only (values not seen there encode as all zeros); every other
    column is handed to the model unchanged. The input frames are not modified.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    df1_x, df2_x : DataFrames containing an ``item_description`` column
        Features used for fitting and for evaluation respectively.
    df1_y, df2_y : array-like
        Labels for ``df1_x`` / ``df2_x``; flattened to 1-D before use.

    Returns
    -------
    None
        Prints the model repr, accuracy and macro-averaged precision, recall
        and F1.
    """
    print(model)

    # sparse_threshold=0 makes the transformer always return a dense array.
    # The previous ``.toarray()`` call only worked while the output happened
    # to be sparse and failed for columns with very few categories.
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])],
        remainder='passthrough',
        sparse_threshold=0,
    )

    def _with_one_hot(frame, encoded):
        # Swap item_description for its one-hot columns; string column names
        # are required because the new columns are labelled with integers.
        rest = frame.drop("item_description", axis=1).reset_index(drop=True)
        combined = pd.concat([rest, pd.DataFrame(encoded)], axis='columns')
        combined.columns = combined.columns.astype(str)
        return combined

    df1_x = _with_one_hot(df1_x, ct.fit_transform(df1_x[["item_description"]]))
    df2_x = _with_one_hot(df2_x, ct.transform(df2_x[["item_description"]]))

    # Estimators and metrics expect 1-D label vectors, not (n, 1) frames.
    y_fit = np.asarray(df1_y).ravel()
    y_true = np.asarray(df2_y).ravel()

    # Fit the model
    model.fit(df1_x, y_fit)

    # Make predictions
    y_pred = model.predict(df2_x)

    # Evaluate the model
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

In [489]:
Submission_testing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86633 entries, 0 to 86632
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   item_description  80319 non-null  category
 1   shop_id           86633 non-null  int32   
 2   item_price        86633 non-null  int64   
 3   quantity_sold     86633 non-null  int64   
 4   shop_area_sq_ft   86633 non-null  int64   
 5   year              86633 non-null  int64   
 6   month             86633 non-null  int64   
 7   weekday           86633 non-null  int64   
dtypes: category(1), int32(1), int64(6)
memory usage: 5.0 MB


## Result Output

In [490]:
def outputResult(model, df1_x, df1_y, test, le):
    """Fit ``model``, predict a profile per row of ``test`` and write one row per shop.

    ``item_description`` is one-hot encoded with an encoder fitted on
    ``df1_x`` only; all other columns are passed to the model unchanged.
    Row-level predictions are decoded with ``le`` and reduced to the most
    frequent profile per shop. The input frames are not modified.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    df1_x : DataFrame with an ``item_description`` column (training features)
    df1_y : array-like of encoded labels for ``df1_x``; flattened to 1-D
    test : DataFrame with the same columns as ``df1_x``, including ``shop_id``
    le : fitted label encoder used to decode the predictions

    Returns
    -------
    None
        Writes ``predictions<model>.csv`` (columns ``shop_id``,
        ``shop_profile``) to the working directory.
    """
    import re

    print(model)

    # sparse_threshold=0 makes the transformer always return a dense array.
    # The previous ``.toarray()`` call only worked while the output happened
    # to be sparse and failed for columns with very few categories.
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])],
        remainder='passthrough',
        sparse_threshold=0,
    )

    def _with_one_hot(frame, encoded):
        # Swap item_description for its one-hot columns; string column names
        # are required because the new columns are labelled with integers.
        rest = frame.drop("item_description", axis=1).reset_index(drop=True)
        combined = pd.concat([rest, pd.DataFrame(encoded)], axis='columns')
        combined.columns = combined.columns.astype(str)
        return combined

    df1_x = _with_one_hot(df1_x, ct.fit_transform(df1_x[["item_description"]]))
    df2_x = _with_one_hot(test, ct.transform(test[["item_description"]]))

    # Fit the model on the training data (1-D labels, not an (n, 1) frame)
    model.fit(df1_x, np.asarray(df1_y).ravel())

    # Predict per row and map the encoded labels back to the shop profiles
    y_pred = le.inverse_transform(model.predict(df2_x))

    # One row per prediction, with shop ids restored to the "SHOP###" form
    results_df = pd.DataFrame({'shop_id': test.shop_id, 'shop_profile': y_pred})
    results_df["shop_id"] = "SHOP" + results_df["shop_id"].astype(str).str.zfill(3)

    # Most frequent predicted profile per shop. The groupby already yields
    # exactly one row per shop_id, so no further de-duplication is needed.
    results_df = (
        results_df.groupby('shop_id', sort=False)['shop_profile']
        .agg(lambda x: x.value_counts().index[0])
        .reset_index()
    )

    # Build a filesystem-safe file name from the model repr: a default model
    # still gives e.g. "predictionsDecisionTreeClassifier.csv", while models
    # with parameters no longer leave "(", "," or spaces in the name.
    model_tag = re.sub(r"[^A-Za-z0-9_.=-]+", "_", str(model).strip("()")).strip("_")
    results_df.to_csv('predictions' + model_tag + '.csv', index=False)

# Modelling

In [491]:
# unique values in item_description column

print(X_train['item_description'].unique())

# no of null values in item_description column

print(X_train['item_description'].isnull().sum())

['SODA 1L', 'LEMONADE 500ML', 'LEMONADE 1.5L', 'GINGER BEER 1.5L', 'SODA 500ML', ..., 'GINGER BEER SUGAR FREE 500ML', 'LEMONADE 1L', 'NECTO 1L', 'CHOCOLATE MILK 180ML', 'LIME CRUSH JUICE 1L']
Length: 37
Categories (37, object): ['BOTTLED DRINKING WATER 1.5L', 'BOTTLED DRINKING WATER 500ML', 'CHOCOLATE MILK 180ML', 'CREAM SODA 1.5L', ..., 'SODA PET 1.5L', 'STRAWBERRY MILK 180ML', 'TONIC PET 500ML', 'TWISTEE APPLE 1L']
0


In [492]:
from sklearn.linear_model import LogisticRegression
import sys
# Logistic regression with default hyperparameters.
logr = LogisticRegression()
# np.set_printoptions(threshold=np.inf)
# score_classification(logr, X_train, y_train, X_test, y_test)
# np.set_printoptions(threshold=1000)

In [493]:
Submission_testing.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,year,month,weekday
0,FIT O MIXED FRUIT 1L,46,270,1,545,2021,10,4
1,FIT O ORANGE 1L,46,290,1,545,2021,10,2
2,LEMONADE 1.5L,46,220,2,545,2021,10,3
3,FIT O MANGO 200ML,46,180,4,545,2021,11,2
4,FIT O MIXED FRUIT 200ML,46,60,1,545,2021,11,2


In [494]:
# outputResult(logr, X_train, y_train, Submission_testing, le)

In [495]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters (no random_state is set).
dtc = DecisionTreeClassifier()

# Fit on the training split and print accuracy and macro precision / recall / F1
# for the held-out split.
score_classification(dtc, X_train, y_train, X_test, y_test)

DecisionTreeClassifier()
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [496]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters (no random_state is set).
rfc = RandomForestClassifier()

# score_classification(rfc, X_train, y_train, X_test, y_test)

Bug with KNeighborsClassifiers

In [497]:
# #Dont use this
# from sklearn.neighbors import KNeighborsClassifier

# knn = KNeighborsClassifier()

# score_classification(knn, X_train, y_train, X_test, y_test)

In [498]:
# outputResult(knn, X_train, y_train, Submission_testing, le)

In [499]:
# from sklearn.naive_bayes import GaussianNB

# nb = GaussianNB()

# score_classification(nb, X_train, y_train, X_test, y_test)

In [500]:
# import xgboost as xgb

# xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, max_depth=5)

# score_classification(xgb_model, X_train, y_train, X_test, y_test)


# Feature Engineering

Mutual Information

In [501]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

def make_mi_scores(X, y, discrete_target=None):
    """Return mutual-information scores between each column of ``X`` and ``y``.

    Object and categorical columns are integer-coded with ``factorize`` and,
    like every other integer column, are treated as discrete features.

    Parameters
    ----------
    X : DataFrame of features (not modified; a copy is encoded)
    y : array-like target; flattened to 1-D
    discrete_target : bool or None, default None
        ``True`` scores against a class label (``mutual_info_classif``),
        ``False`` against a continuous target (``mutual_info_regression``).
        ``None`` picks the regression estimator only when ``y`` has a float
        dtype; encoded class labels are integers, so they are scored as
        classes rather than as a continuous quantity.

    Returns
    -------
    Series named "Mutual Information Scores", indexed by column name and
    sorted in descending order.
    """
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # all discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]

    # The estimators expect a 1-D target, not an (n, 1) frame.
    y = np.asarray(y).ravel()
    if discrete_target is None:
        discrete_target = not pd.api.types.is_float_dtype(y)
    estimator = mutual_info_classif if discrete_target else mutual_info_regression

    mi_scores = estimator(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="Mutual Information Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current Matplotlib axes.

    ``scores`` is a Series indexed by feature name. Bars are drawn in
    ascending order from the bottom, so the highest score ends up on top.
    """
    scores = scores.sort_values(ascending=True)
    positions = np.arange(len(scores))
    plt.barh(positions, scores)
    plt.yticks(positions, list(scores.index))
    # Title spelling corrected (was "Mututal").
    plt.title("Mutual Information Scores")

In [502]:
# mi_scores = make_mi_scores(X_train, y_train)
# mi_scores

In [503]:
X_train['year'].unique()

array([2021], dtype=int64)

In [504]:
# year holds a single value (2021) in the training split, so remove it from
# all three frames, in place.
for _frame in (X_train, X_test, Submission_testing):
    _frame.drop(columns='year', inplace=True)

In [505]:
X_train.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,month,weekday
13201,SODA 1L,86,110,2,661,10,5
293065,LEMONADE 500ML,37,200,2,717,12,5
63548,LEMONADE 1.5L,66,220,3,470,12,0
466484,GINGER BEER 1.5L,60,220,1,676,10,2
316348,SODA 1L,112,220,3,668,11,0


Feature Scaling

In [506]:
X_train.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,month,weekday
13201,SODA 1L,86,110,2,661,10,5
293065,LEMONADE 500ML,37,200,2,717,12,5
63548,LEMONADE 1.5L,66,220,3,470,12,0
466484,GINGER BEER 1.5L,60,220,1,676,10,2
316348,SODA 1L,112,220,3,668,11,0


In [507]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

# Columns rescaled to [0, 1]; the StandardScaler entry is given no columns.
_scaled_cols = ["shop_area_sq_ft", "item_price", "quantity_sold", "month", "weekday"]

ct2 = ColumnTransformer([("sScaler", StandardScaler(), []),
                         ("MinMaxScaler", MinMaxScaler(), _scaled_cols)])

# The scaler is fitted on the training split only and re-used for the others.
# Each scaled frame carries the index of the frame it was computed from: the
# assignments below align on index, so a fresh 0..n-1 index would silently
# produce NaN for any frame that is not indexed that way.
scaled_df1_x = pd.DataFrame(ct2.fit_transform(X_train), columns=_scaled_cols, index=X_train.index)
scaled_df2_x = pd.DataFrame(ct2.transform(X_test), columns=_scaled_cols, index=X_test.index)
scaled_submission_df_x = pd.DataFrame(
    ct2.transform(Submission_testing), columns=_scaled_cols, index=Submission_testing.index
)

# Create copies of X_train and X_test with the scaled values
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_submission_scaled = Submission_testing.copy()


# Assign the scaled values to the new dataframes
X_train_scaled[scaled_df1_x.columns] = scaled_df1_x
X_test_scaled[scaled_df2_x.columns] = scaled_df2_x
X_submission_scaled[scaled_submission_df_x.columns] = scaled_submission_df_x

In [508]:
X_train.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,month,weekday
0,SODA 1L,86,110,2,661,10,5
1,LEMONADE 500ML,37,200,2,717,12,5
2,LEMONADE 1.5L,66,220,3,470,12,0
3,GINGER BEER 1.5L,60,220,1,676,10,2
4,SODA 1L,112,220,3,668,11,0


In [509]:
X_test_scaled.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,month,weekday
0,GINGER BEER 1L,10,0.006623,0.019608,0.409499,1.0,0.666667
1,SODA 500ML,127,0.002016,0.019608,0.706033,0.0,0.166667
2,GINGER BEER 1.5L,84,0.010654,0.019608,0.409499,0.0,0.166667
3,GINGER BEER 1.5L,17,0.010654,0.019608,0.409499,1.0,0.166667
4,SODA 1L,83,0.004319,0.029412,0.485237,1.0,0.166667


In [510]:
print("Scores after standard scaling: ")
# score_classification(rfc, X_train_scaled, y_train, X_test_scaled, y_test)

Scores after standard scaling: 


In [511]:
# score_classification(dtc, X_train_scaled, y_train, X_test_scaled, y_test)

In [512]:
from sklearn.decomposition import PCA

def apply_pca(X):
    """Run PCA on ``X`` after one-hot encoding its ``item_description`` column.

    Parameters
    ----------
    X : DataFrame containing an ``item_description`` column; every other
        column is passed to PCA as it is. ``X`` itself is not modified.

    Returns
    -------
    pca : the fitted ``PCA`` object
    X_pca : DataFrame of component scores, columns ``PC1``, ``PC2``, ...
    loadings : DataFrame of ``pca.components_`` transposed, with one row per
        input column and one column per component
    """
    pca = PCA()

    # sparse_threshold=0 makes the transformer always return a dense array.
    # The previous ``.toarray()`` call only worked while the output happened
    # to be sparse and failed for columns with very few categories.
    ct = ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])],
        remainder='passthrough',
        sparse_threshold=0,
    )

    # One hot encoding for the item_description column
    encoded = ct.fit_transform(X[["item_description"]])
    df1_x = pd.concat(
        [X.drop("item_description", axis=1).reset_index(drop=True), pd.DataFrame(encoded)],
        axis='columns',
    )
    # The one-hot columns are labelled with integers; use string names throughout.
    df1_x.columns = df1_x.columns.astype(str)

    X_pca = pca.fit_transform(df1_x)
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)

    # create loadings
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=df1_x.columns,
    )
    return pca, X_pca, loadings

def plot_variance(pca, width=8, dpi=100):
    """Plot explained and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted PCA object (``n_components_`` and
        ``explained_variance_ratio_`` are read)
    width : figure width passed to ``fig.set(figwidth=...)``
    dpi : figure resolution passed to ``fig.set(dpi=...)``

    Returns
    -------
    The array of two axes: per-component ratio (left), cumulative (right).
    """
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance per component
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))

    # Cumulative variance, with a leading (0, 0) point so the curve starts at the origin
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))

    # Honour the caller's size settings (previously hard-coded to 8 / 100).
    fig.set(figwidth=width, dpi=dpi)
    return axs

In [513]:
# pca, X_pca, loadings = apply_pca(X_train_scaled)
# print(loadings)

In [514]:
# pca, X_pca, loadings = apply_pca(X_train_scaled)
# print(loadings)

In [515]:
X_train_scaled.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft,month,weekday
0,SODA 1L,86,0.004319,0.029412,0.465982,0.0,0.833333
1,LEMONADE 500ML,37,0.009502,0.029412,0.537869,1.0,0.833333
2,LEMONADE 1.5L,66,0.010654,0.039216,0.220796,1.0,0.0
3,GINGER BEER 1.5L,60,0.010654,0.019608,0.485237,0.0,0.333333
4,SODA 1L,112,0.010654,0.039216,0.474968,0.5,0.0


In [516]:
def _sales_per_shop(frame):
    """Add a row-level total_sales column to ``frame`` (in place) and return a
    per-shop table with the summed total_sales and the most frequent shop area."""
    # NOTE(review): item_price and quantity_sold hold min-max-scaled values in
    # the *_scaled frames, so this is a product of scaled values — confirm intended.
    frame['total_sales'] = frame['item_price'] * frame['quantity_sold']

    # Sum of total_sales per shop_id
    per_shop = frame.groupby('shop_id').agg({'total_sales': 'sum'}).reset_index()

    # Mode of the shop area per shop_id, attached by row position
    modal_area = frame.groupby('shop_id')['shop_area_sq_ft'].agg(lambda x: x.value_counts().index[0])
    per_shop['shop_area_sq_ft'] = modal_area.reset_index()['shop_area_sq_ft']
    return per_shop


# Create the price-related feature and aggregate each frame by shop_id
X_train_cumSales = _sales_per_shop(X_train_scaled)
X_test_cumSales = _sales_per_shop(X_test_scaled)
X_submission_cumSales = _sales_per_shop(X_submission_scaled)

# # new feature: total_sales per sq ft of the shop
# X_train_cumSales['total_sales_per_sq_ft'] = X_train_cumSales['total_sales'] / X_train_cumSales['shop_area_sq_ft']
# X_test_cumSales['total_sales_per_sq_ft'] = X_test_cumSales['total_sales'] / X_test_cumSales['shop_area_sq_ft']
# X_submission_cumSales['total_sales_per_sq_ft'] = X_submission_cumSales['total_sales'] / X_submission_cumSales['shop_area_sq_ft']

# # new feature: addition of the total_sales and total_sales_per_sq_ft
# X_train_cumSales['total_sales_plus_total_sales_per_sq_ft'] = X_train_cumSales['total_sales'] + X_train_cumSales['total_sales_per_sq_ft']
# X_test_cumSales['total_sales_plus_total_sales_per_sq_ft'] = X_test_cumSales['total_sales'] + X_test_cumSales['total_sales_per_sq_ft']
# X_submission_cumSales['total_sales_plus_total_sales_per_sq_ft'] = X_submission_cumSales['total_sales'] + X_submission_cumSales['total_sales_per_sq_ft']

# # new feature : multiplication of the total_sales and total_sales_per_sq_ft
# X_train_cumSales['total_sales_times_total_sales_per_sq_ft'] = X_train_cumSales['total_sales'] * X_train_cumSales['total_sales_per_sq_ft']
# X_test_cumSales['total_sales_times_total_sales_per_sq_ft'] = X_test_cumSales['total_sales'] * X_test_cumSales['total_sales_per_sq_ft']
# X_submission_cumSales['total_sales_times_total_sales_per_sq_ft'] = X_submission_cumSales['total_sales'] * X_submission_cumSales['total_sales_per_sq_ft']


In [520]:
def _encoded_profiles(per_shop):
    """Look up each shop's shop_profile in StoreInfo and encode it with ``le``."""
    profiles = per_shop.merge(StoreInfo, on='shop_id', how='left')["shop_profile"]
    return le.transform(profiles)


# One encoded label per row of the per-shop tables
y_new_train = _encoded_profiles(X_train_cumSales)
y_new_test = _encoded_profiles(X_test_cumSales)

In [521]:
len(X_train_cumSales), len(y_new_train), len(X_test_cumSales), len(y_new_test)

(100, 100, 100, 100)

In [350]:
# remove the redundant columns
# item_description is deliberately kept: score_classification and outputResult
# one-hot encode that column and raise KeyError when it is missing.
_redundant_cols = ['item_price', 'quantity_sold', 'month', 'weekday']

# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# safe to run more than once.
X_train_scaled = X_train_scaled.drop(columns=_redundant_cols, errors="ignore")
X_test_scaled = X_test_scaled.drop(columns=_redundant_cols, errors="ignore")
X_submission_scaled = X_submission_scaled.drop(columns=_redundant_cols, errors="ignore")

In [351]:
X_train_cumSales.head()

Unnamed: 0,shop_id,total_sales,shop_area_sq_ft
0,1,0.574996,0.362003
1,3,0.939748,0.657253
2,4,0.812641,0.279846
3,5,0.957817,0.409499
4,6,0.872233,0.485237


In [348]:
len(y_train)

266064

In [590]:
X_train_scaled.describe()

Unnamed: 0,shop_area_sq_ft,year,month,weekday,total_sales,shop_sales_rank
count,266064.0,266064.0,266064.0,266064.0,266064.0,266064.0
mean,0.430169,2021.0,11.024182,3.341437,0.000379,1449.060155
std,0.159823,0.0,0.776629,2.012698,0.001923,940.751166
min,0.0,2021.0,10.0,0.0,0.0,1.0
25%,0.394095,2021.0,10.0,2.0,0.00011,694.5
50%,0.409499,2021.0,11.0,4.0,0.000209,1341.0
75%,0.485237,2021.0,12.0,5.0,0.000313,2091.5
max,1.0,2021.0,12.0,6.0,0.607843,4513.0


In [591]:
#check for missing values

X_train_scaled.isnull().sum()

item_description    0
shop_id             0
shop_area_sq_ft     0
year                0
month               0
weekday             0
total_sales         0
shop_sales_rank     0
dtype: int64

In [593]:
X_submission_scaled.isnull().sum()

item_description    6314
shop_id                0
shop_area_sq_ft        0
year                   0
month                  0
weekday                0
total_sales            0
shop_sales_rank        0
dtype: int64

In [594]:
# Drop, in place, every row that has a missing value in any column.
# NOTE(review): y_train / y_test are not filtered here — confirm the training
# and test frames really contain no missing rows, otherwise features and
# labels end up with different lengths.
for _frame in (X_train_scaled, X_test_scaled, X_submission_scaled):
    _frame.dropna(inplace=True)

In [None]:
# # Impute missing values for the shop_sales_per_sq_ft column

# from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(missing_values = np.nan, strategy="mean")
# imputer.fit(X_train_scaled[["shop_sales_per_sq_ft"]])

# X_train_scaled = pd.DataFrame(imputer.transform(X_train_scaled), columns=["shop_sales_per_sq_ft"]])
# X_test_scaled = pd.DataFrame(imputer.transform(X_test_scaled), columns=X_train_scaled.columns)
# X_submission_scaled = pd.DataFrame(imputer.transform(X_submission_scaled), columns=X_submission_scaled.columns)

In [626]:
score_classification(dtc, X_train_scaled, y_train, X_test_scaled, y_test)

DecisionTreeClassifier()
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [628]:
outputResult(dtc, X_train_scaled, y_train, X_submission_scaled, le)

DecisionTreeClassifier()


In [629]:
score_classification(rfc, X_train_scaled, y_train, X_test_scaled, y_test)

RandomForestClassifier()
Accuracy: 0.7716523595471835
Precision: 0.8076157953943078
Recall: 0.7880105700882468
F1 Score: 0.7681265650899315


In [632]:
outputResult(rfc, X_train_scaled, y_train, X_submission_scaled, le)

RandomForestClassifier()


In [633]:
# The only earlier definition of `knn` is commented out, so this cell raised
# NameError on a fresh kernel; create the classifier here.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

score_classification(knn, X_train_scaled, y_train, X_test_scaled, y_test)

KNeighborsClassifier()
Accuracy: 0.5093134086023122
Precision: 0.5081336157350509
Recall: 0.5097537277825253
F1 Score: 0.5085792450709201


In [634]:
outputResult(knn, X_train_scaled, y_train, X_submission_scaled, le)

KNeighborsClassifier()


In [1]:
Submission_testing.head()

NameError: name 'Submission_testing' is not defined