In [1098]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [1099]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

warnings.filterwarnings('ignore')

In [1100]:
path = "../Datasets/"
Train = pd.read_csv(path + '/Historical-transaction-data.csv')
StoreInfo = pd.read_csv(path + '/Store-info.csv')
Submission_testing = pd.read_csv(path + '/Testing-data.csv')

In [1101]:
Train = Train.merge(StoreInfo, on='shop_id', how='left')

In [1102]:
Submission_testing.drop("shop_profile", axis=1, inplace=True)
# Submission_testing = Submission_testing.merge(StoreInfo, on='shop_id', how='left')
# Submission_testing.drop("shop_profile", axis=1, inplace=True)
Submission_testing = Submission_testing.merge(Train, on='shop_id', how='left')

In [1103]:
Submission_testing.drop("shop_profile", axis=1, inplace=True)

In [1104]:
Train.drop_duplicates(inplace=True)

In [1106]:
Train.to_csv('CombinedData.csv', index=False)

In [1045]:
redundant_cols = ['item_description', 'transaction_date', 'invoice_id', 'customer_id']

Train.drop(redundant_cols, axis=1, inplace=True)
Submission_testing.drop(redundant_cols, axis=1, inplace=True)

In [1046]:
features_nom = ["shop_id", "shop_profile]

for feature in features_nom:
    Train[feature] = Train[feature].astype("category")
    if feature == "shop_profile":
        continue
    Submission_testing[feature] = Submission_testing[feature].astype("category")

In [1047]:
Train["shop_id"] = Train["shop_id"].str.replace("SHOP", "").astype(int).astype("category")

In [1048]:
Submission_testing["shop_id"] = Submission_testing["shop_id"].str.replace("SHOP", "").astype(int).astype("category")
StoreInfo["shop_id"] = StoreInfo["shop_id"].str.replace("SHOP", "").astype(int).astype("category")

In [1049]:
#Remove rows with null values for item_description or shop_profile
Train = Train.dropna(subset=['shop_profile'], axis=0)

In [1050]:
# Create price-related features
Train['total_sales']= Train['item_price'] * Train['quantity_sold']
Submission_testing['total_sales']= Submission_testing['item_price'] * Submission_testing['quantity_sold']

# Aggregate X_train_scaled by shop_id and add all the total_sales values and prevent empty values

Train_cum = Train.groupby(['shop_id'], sort=False).agg({'total_sales':'sum'})
Submission_testing_cum = Submission_testing.groupby(['shop_id'], sort='False').agg({'total_sales':'sum'})


In [1051]:

Train_cum = Train_cum.merge(StoreInfo, on='shop_id', how='left')
Train_cum = Train_cum.dropna(subset=['shop_profile'], axis=0)
Train_cum.drop("shop_profile", axis=1, inplace=True)

Submission_testing_cum = Submission_testing_cum.merge(StoreInfo, on='shop_id', how='left')
Submission_testing_cum.drop("shop_profile", axis=1, inplace=True)


In [1052]:
# new feature: total_sales per sq ft of the shop
Train_cum['sales_per_sq_ft'] = Train_cum['total_sales'] / Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_per_sq_ft'] = Submission_testing_cum['total_sales'] / Submission_testing_cum['shop_area_sq_ft']

# new feature : difference between the total_sales and total_sales_per_sq_ft
Train_cum['sales_minus_shop_area'] = Train_cum['total_sales'] - Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_minus_shop_area'] = Submission_testing_cum['total_sales'] - Submission_testing_cum['shop_area_sq_ft']

# new feature: addition of the total_sales and total_sales_per_sq_ft
Train_cum['sales_plus_shop_area'] = Train_cum['total_sales'] + Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_plus_shop_area'] = Submission_testing_cum['total_sales'] + Submission_testing_cum['shop_area_sq_ft']

# new feature : multiplication of the total_sales and total_sales_per_sq_ft
Train_cum['sales_times_shop_area'] = Train_cum['total_sales'] * Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_times_shop_area'] = Submission_testing_cum['total_sales'] * Submission_testing_cum['shop_area_sq_ft']

In [1053]:
# Remove rows with zero total_sales

Train_cum = Train_cum[Train_cum['total_sales'] != 0]
Submission_testing_cum = Submission_testing_cum[Submission_testing_cum['total_sales'] != 0]

Train_cum.drop("shop_area_sq_ft", axis=1, inplace=True)
Train_cum = Train_cum.merge(StoreInfo, on='shop_id', how='left')
Submission_testing_cum.drop("shop_area_sq_ft", axis=1, inplace=True)
Submission_testing_cum = Submission_testing_cum.merge(StoreInfo, on='shop_id', how='left')

In [1054]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [1055]:
y_new_train = Train_cum["shop_profile"]
Train_cum.drop("shop_profile", axis=1, inplace=True)
Submission_testing_cum.drop("shop_profile", axis=1, inplace=True)
y_new_train = le.fit_transform(y_new_train)

# Feature Scaling

In [1056]:
Submission_testing_cum.head()

Unnamed: 0,shop_id,total_sales,sales_per_sq_ft,sales_minus_shop_area,sales_plus_shop_area,sales_times_shop_area,shop_area_sq_ft
0,2,3084455,4842.158556,3083818,3085092,1964797835,637
1,19,2200580,5354.209246,2200169,2200991,904438380,411
2,23,2570425,4166.004862,2569808,2571042,1585952225,617
3,24,1308795,1936.087278,1308119,1309471,884745420,676
4,29,1973805,3306.20603,1973208,1974402,1178361585,597


In [1057]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

scaled_columns = ["total_sales", "sales_per_sq_ft", "sales_minus_shop_area", "sales_plus_shop_area", "sales_times_shop_area", "shop_area_sq_ft"]
ct = ColumnTransformer([("MinMaxScaler", MinMaxScaler(), scaled_columns)], remainder="passthrough")


scaled_X_train = pd.DataFrame(ct.fit_transform(Train_cum), columns=scaled_columns+["shop_id"])
scaled_X_submission = pd.DataFrame(ct.transform(Submission_testing_cum), columns=scaled_columns+["shop_id"])


In [1058]:
## Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_X_train, y_new_train, test_size = 0.2, random_state = 1)

# Metrics

In [1059]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder

def score_classification(model, df1_x, df1_y, df2_x, df2_y):
  print(model)
  df1_x = df1_x.copy()
  df2_x = df2_x.copy() 
  ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])], remainder='passthrough')

  # One hot encoding for the item_description column

  df1_encoded = ct2.fit_transform(df1_x[["shop_id"]])
  df1_x = pd.concat([df1_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df1_encoded.toarray())], axis='columns')

  df2_encoded = ct2.transform(df2_x[["shop_id"]])
  df2_x = pd.concat([df2_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df2_encoded.toarray())], axis='columns')

  df1_x.columns = df1_x.columns.astype(str)
  df2_x.columns = df2_x.columns.astype(str)
  # Fit the model  
  model.fit(df1_x, df1_y)

  # Make predictions
  y_pred = model.predict(df2_x)

  # Evaluate the model
  accuracy = accuracy_score(df2_y, y_pred)
  precision = precision_score(df2_y, y_pred,  average='macro')
  recall = recall_score(df2_y, y_pred,  average='macro')
  f1 = f1_score(df2_y, y_pred,  average='macro')

  print(f"Accuracy: {accuracy}")
  print(f"Precision: {precision}")
  print(f"Recall: {recall}")
  print(f"F1 Score: {f1}")

In [1060]:
def outputResult(model, df1_x, df1_y, test, le):
    print(model)
    df1_x = df1_x.copy()
    df2_x = test.copy()
   
    df1_x.columns = df1_x.columns.astype(str)
    df2_x.columns = df2_x.columns.astype(str)
    # Fit the model on the training data
    model.fit(df1_x, df1_y)
    
    # Make predictions on the test data
    y_pred = model.predict(df2_x)
    
    # Inverse transform the encoded predictions to the original shop profiles
    y_pred = le.inverse_transform(y_pred)    
    
    # Create a dataframe with the predicted shop profiles
    results_df = pd.DataFrame({'shop_id': test.shop_id, 'shop_profile': y_pred})    
    # rename the shop_id column with the word "SHOP" and add a 0 in front of the shop_id
    results_df["shop_id"] = results_df["shop_id"].astype(int).astype(str)
    results_df["shop_id"] = "SHOP" + results_df["shop_id"].str.zfill(3)
    # results_df.rename(columns={'shop_id': 'SHOP0' + results_df.shop_id.astype(str)}, inplace=True)
    results_df.drop_duplicates(inplace=True)
    # Write the dataframe to a CSV file
    results_df.to_csv('predictions'+str(model).strip("()")+'.csv', index=False)

# Modelling

In [1061]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression()

In [1062]:
score_classification(logr, X_train, y_train, X_test, y_test)

LogisticRegression()
Accuracy: 0.25
Precision: 0.2161172161172161
Recall: 0.3333333333333333
F1 Score: 0.24679487179487178


In [1063]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()


In [1064]:

score_classification(dtc, X_train, y_train, X_test, y_test)

DecisionTreeClassifier()
Accuracy: 0.3
Precision: 0.3333333333333333
Recall: 0.2828282828282828
F1 Score: 0.2793650793650794


In [1065]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [1066]:
score_classification(rfc, X_train, y_train, X_test, y_test)

RandomForestClassifier()
Accuracy: 0.35
Precision: 0.4166666666666667
Recall: 0.3131313131313131
F1 Score: 0.3238095238095238


In [1067]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [1068]:
score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier()
Accuracy: 0.45
Precision: 0.37566137566137564
Recall: 0.3737373737373737
F1 Score: 0.3717948717948718


In [1069]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np

# Define the parameter grid to search over
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10, 20, 30, 40, 50],
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

# Create a pipeline to preprocess the data and apply KNN
pipeline = Pipeline([ 
    ('knn', KNeighborsClassifier())
])

# Create a grid search object to find the best parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)


# Fit the grid search object on the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters:", grid_search.best_params_)


Best parameters: {'knn__algorithm': 'auto', 'knn__leaf_size': 10, 'knn__metric': 'euclidean', 'knn__n_neighbors': 5, 'knn__p': 1, 'knn__weights': 'distance'}


In [1070]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

from sklearn.model_selection import cross_val_score

knn2 = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
print(scores)

[0.5   0.375 0.375 0.375 0.25  0.375 0.25  0.125 0.5   0.375]


In [1071]:
# Set the best parameters found by GridSearchCV
knn.set_params(**{'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'})

score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier(leaf_size=10, metric='euclidean', p=1, weights='distance')
Accuracy: 0.5
Precision: 0.5079365079365079
Recall: 0.5656565656565656
F1 Score: 0.5037037037037037


In [1072]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()


In [1073]:

score_classification(nb, X_train, y_train, X_test, y_test)

GaussianNB()
Accuracy: 0.7
Precision: 0.4468864468864469
Recall: 0.5252525252525252
F1 Score: 0.4829059829059828


In [1074]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, max_depth=5)

In [1075]:

score_classification(xgb_model, X_train, y_train, X_test, y_test)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Accuracy: 0.4
Precision: 0.4305555555555555
Recall: 0.398989898989899
F1 Score: 0.38413547237076656


In [1076]:
# Create a list of classifiers to compare
classifiers = [logr, dtc, knn, rfc, nb, xgb_model]

# Create X and y data

# Cross-validation
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


LogisticRegression
F1 Score: 0.45 (+/- 0.16)
DecisionTreeClassifier
F1 Score: 0.35 (+/- 0.18)
KNeighborsClassifier
F1 Score: 0.32 (+/- 0.20)
RandomForestClassifier
F1 Score: 0.44 (+/- 0.08)
GaussianNB
F1 Score: 0.52 (+/- 0.25)
XGBClassifier
F1 Score: 0.45 (+/- 0.10)


In [1077]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 2 to 37
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_sales            80 non-null     float64
 1   sales_per_sq_ft        80 non-null     float64
 2   sales_minus_shop_area  80 non-null     float64
 3   sales_plus_shop_area   80 non-null     float64
 4   sales_times_shop_area  80 non-null     float64
 5   shop_area_sq_ft        80 non-null     float64
 6   shop_id                80 non-null     float64
dtypes: float64(7)
memory usage: 5.0 KB


# Feature Engineering

In [1078]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X,y):
  X = X.copy()
  for colname in X.select_dtypes(["object", "category"]):
    X[colname], _ = X[colname].factorize()
  # all discrete features should now have integer dtypes
  discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
  mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
  mi_scores = pd.Series(mi_scores, name="Mutual Information Scores", index=X.columns)
  mi_scores = mi_scores.sort_values(ascending=False)
  return mi_scores


def plot_mi_scores(scores):
  scores = scores.sort_values(ascending=True)
  width = np.arange(len(scores))
  ticks = list(scores.index)
  plt.barh(width,scores)
  plt.yticks(width, ticks)
  plt.title("Mututal Information Scores")

In [1079]:
mi_scores = make_mi_scores(X_train, y_train)
mi_scores

sales_times_shop_area    0.213485
sales_per_sq_ft          0.141548
shop_area_sq_ft          0.084893
sales_minus_shop_area    0.053917
total_sales              0.052131
sales_plus_shop_area     0.052131
shop_id                  0.037835
Name: Mutual Information Scores, dtype: float64

In [1080]:
from sklearn.decomposition import PCA

def apply_pca(X):
  pca= PCA()
  df1_x = X.copy()

  X_pca = pca.fit_transform(df1_x)
  component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
  X_pca = pd.DataFrame(X_pca, columns=component_names)
  #create loadings
  loadings = pd.DataFrame(
  pca.components_.T,
  columns = component_names,
  index=df1_x.columns,
  )
  return pca, X_pca, loadings

def plot_variance(pca, width=8, dpi=100):
  fig, axs = plt.subplots(1,2)
  n = pca.n_components_
  grid = np.arange(1, n+1)
  evr = pca.explained_variance_ratio_
  axs[0].bar(grid,evr)
  axs[0].set(
      xlabel="Component",title="% Explained Variance", ylim=(0.0,1.0)             )
  #Cumulative Variance
  cv = np.cumsum(evr)
  axs[1].plot(np.r_[0,grid], np.r_[0,cv], "o-")
  axs[1].set(xlabel="Component", title="% Cumulatve Variance", ylim=(0.0, 1.0))
  fig.set(figwidth=8, dpi=100)
  return axs

In [1081]:
X_train["shop_id"].astype("int").astype("category")

2      55
73      9
97    126
62    118
19     75
     ... 
75     45
9      12
72     10
12    115
37     49
Name: shop_id, Length: 80, dtype: category
Categories (80, int64): [3, 5, 6, 7, ..., 124, 125, 126, 127]

In [1082]:
pca, X_pca, loadings = apply_pca(X_train)
print(loadings)

                            PC1       PC2       PC3       PC4       PC5  \
total_sales            0.000341  0.460295 -0.021733 -0.098130 -0.333706   
sales_per_sq_ft        0.000078  0.419300  0.418745  0.732183  0.335779   
sales_minus_shop_area  0.000341  0.460292 -0.021621 -0.098202 -0.333685   
sales_plus_shop_area   0.000341  0.460298 -0.021846 -0.098058 -0.333726   
sales_times_shop_area  0.000543  0.433988 -0.364258 -0.376370  0.733018   
shop_area_sq_ft        0.000532 -0.015061 -0.830994  0.541631 -0.125927   
shop_id                1.000000 -0.000731  0.000630 -0.000041 -0.000016   

                                PC6           PC7  
total_sales           -4.155963e-01 -7.028132e-01  
sales_per_sq_ft       -1.163342e-15 -3.986797e-15  
sales_minus_shop_area  8.164564e-01 -8.517810e-03  
sales_plus_shop_area  -4.008473e-01  7.113234e-01  
sales_times_shop_area  4.006035e-16  3.514355e-16  
shop_area_sq_ft        1.640815e-04 -9.702942e-05  
shop_id                1.695615e-17

In [1084]:
X_train.drop(columns=['sales_plus_shop_area'], axis=1, inplace=True)
X_test.drop(columns=['sales_plus_shop_area'], axis=1, inplace=True)

In [1086]:
X_train.drop(columns=['sales_minus_shop_area'], axis=1, inplace=True)
X_test.drop(columns=['sales_minus_shop_area'], axis=1, inplace=True)


In [1087]:
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LogisticRegression
F1 Score: 0.43 (+/- 0.21)
DecisionTreeClassifier
F1 Score: 0.37 (+/- 0.21)
KNeighborsClassifier
F1 Score: 0.31 (+/- 0.21)
RandomForestClassifier
F1 Score: 0.46 (+/- 0.12)
GaussianNB
F1 Score: 0.50 (+/- 0.31)
XGBClassifier
F1 Score: 0.45 (+/- 0.10)


In [1092]:
score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier(leaf_size=10, metric='euclidean', p=1, weights='distance')
Accuracy: 0.5
Precision: 0.5079365079365079
Recall: 0.5656565656565656
F1 Score: 0.5037037037037037


In [1094]:
X_train.drop(columns=['shop_area_sq_ft'], axis=1, inplace=True)
X_test.drop(columns=['shop_area_sq_ft'], axis=1, inplace=True)
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


LogisticRegression
F1 Score: 0.44 (+/- 0.14)
DecisionTreeClassifier
F1 Score: 0.35 (+/- 0.26)
KNeighborsClassifier
F1 Score: 0.31 (+/- 0.21)
RandomForestClassifier
F1 Score: 0.44 (+/- 0.18)
GaussianNB
F1 Score: 0.52 (+/- 0.29)
XGBClassifier
F1 Score: 0.45 (+/- 0.21)


In [1095]:
score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier(leaf_size=10, metric='euclidean', p=1, weights='distance')
Accuracy: 0.65
Precision: 0.6222222222222222
Recall: 0.6565656565656566
F1 Score: 0.6349206349206349


In [1097]:
scaled_X_submission = scaled_X_submission[X_train.columns]
scaled_X_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_sales            24 non-null     float64
 1   sales_per_sq_ft        24 non-null     float64
 2   sales_times_shop_area  24 non-null     float64
 3   shop_id                24 non-null     float64
dtypes: float64(4)
memory usage: 896.0 bytes


In [49]:
outputResult(logr, X_train, y_train, scaled_X_submission, le)

LogisticRegression()
