In [1764]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [1765]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

warnings.filterwarnings('ignore')

In [1766]:
path = "../Datasets/"
Train = pd.read_csv(path + '/Historical-transaction-data.csv')
StoreInfo = pd.read_csv(path + '/Store-info.csv')
Submission_testing = pd.read_csv(path + '/Testing-data.csv')

In [1767]:
Train = Train.merge(StoreInfo, on='shop_id', how='left')

In [1768]:
Submission_testing.drop("shop_profile", axis=1, inplace=True)
# Submission_testing = Submission_testing.merge(StoreInfo, on='shop_id', how='left')
# Submission_testing.drop("shop_profile", axis=1, inplace=True)
Submission_testing = Submission_testing.merge(Train, on='shop_id', how='left')

In [1769]:
Submission_testing.drop("shop_profile", axis=1, inplace=True)

In [1770]:
Train.drop_duplicates(inplace=True)

In [1771]:
# Train.to_csv('CombinedData.csv', index=False)

In [1772]:
# Submission_testing.to_csv('SubmissionData.csv', index=False)

In [1773]:
redundant_cols = ['transaction_date', 'invoice_id', 'customer_id']

Train.drop(redundant_cols, axis=1, inplace=True)
Submission_testing.drop(redundant_cols, axis=1, inplace=True)

In [1774]:
features_nom = ['item_description',"shop_id", "shop_profile"]

for feature in features_nom:
    Train[feature] = Train[feature].astype("category")
    if feature == "shop_profile":
        continue
    Submission_testing[feature] = Submission_testing[feature].astype("category")

In [1775]:
Train["shop_id"] = Train["shop_id"].str.replace("SHOP", "").astype(int).astype("category")

In [1776]:
Submission_testing["shop_id"] = Submission_testing["shop_id"].str.replace("SHOP", "").astype(int).astype("category")
StoreInfo["shop_id"] = StoreInfo["shop_id"].str.replace("SHOP", "").astype(int).astype("category")

In [1777]:
#Remove rows with null values for item_description or shop_profile
Train = Train.dropna(subset=['shop_profile'], axis=0)

In [1778]:
from sklearn.compose import ColumnTransformer
#import simpleimputer
from sklearn.impute import SimpleImputer

Submission_testing['shop_profile'] = 'default_value'
ct = ColumnTransformer([("SimpleImputer", SimpleImputer(strategy="constant", fill_value="missing"), ["item_description"])], remainder="passthrough")


Train = pd.DataFrame(ct.fit_transform(Train), columns=Train.columns)

Submission_testing = pd.DataFrame(ct.transform(Submission_testing), columns=Train.columns)

Submission_testing.drop("shop_profile", axis=1, inplace=True)


In [1779]:
Submission_testing.head()

Unnamed: 0,item_description,shop_id,item_price,quantity_sold,shop_area_sq_ft
0,FIT O MIXED FRUIT 1L,46,270,1,545
1,FIT O ORANGE 1L,46,290,1,545
2,LEMONADE 1.5L,46,220,2,545
3,FIT O MANGO 200ML,46,180,4,545
4,FIT O MIXED FRUIT 200ML,46,60,1,545


In [1780]:
# Create price-related features
Train['total_sales']= Train['item_price'] * Train['quantity_sold']
Submission_testing['total_sales']= Submission_testing['item_price'] * Submission_testing['quantity_sold']

# Create a new feature for the total no of unique item_description per shop_id
n_unique = Train.groupby(['shop_id'])['item_description'].nunique().reset_index()
# Create a new freature for the total no of rows per shop_id
n_count = Train.groupby(['shop_id'])['item_description'].count().reset_index()

In [1781]:
sub_n_unique = Submission_testing.groupby(['shop_id'])['item_description'].nunique().reset_index()
# Create a new freature for the total no of rows per shop_id
sub_n_count = Submission_testing.groupby(['shop_id'])['item_description'].count().reset_index()

In [1782]:
Train_cum.head()

Unnamed: 0,shop_id,total_sales,unique_items,order_count,sales_per_sq_ft,sales_minus_shop_area,sales_plus_shop_area,sales_times_shop_area,shop_area_sq_ft
0,8,126195,32,244,186.128319,125517,126873,85560210,678
1,112,1150230,36,2236,1721.901198,1149562,1150898,768353640,668
2,55,892055,37,2275,1238.965278,891335,892775,642279600,720
3,3,1830845,36,3422,2260.302469,1830035,1831655,1482984450,810
4,71,895560,36,2307,1272.102273,894856,896264,630474240,704


In [1783]:

# Aggregate X_train_scaled by shop_id and add all the total_sales values and prevent empty values

Train_cum = Train.groupby(['shop_id'], sort=False).agg({'total_sales':'sum'})
Submission_testing_cum = Submission_testing.groupby(['shop_id'], sort='False').agg({'total_sales':'sum'})


In [1784]:

Train_cum = Train_cum.merge(StoreInfo, on='shop_id', how='left')
Train_cum = Train_cum.merge(n_unique, on='shop_id', how='left')
Train_cum = Train_cum.merge(n_count, on='shop_id', how='left')
# rename item_description_x to item_description_count
Train_cum.rename(columns={'item_description_x':'unique_items', 'item_description_y':'order_count'}, inplace=True)
Train_cum.head()


Unnamed: 0,shop_id,total_sales,shop_area_sq_ft,shop_profile,unique_items,order_count
0,8,126195,678,Moderate,32,244
1,112,1150230,668,Moderate,36,2236
2,55,892055,720,Moderate,37,2275
3,3,1830845,810,High,36,3422
4,71,895560,704,Low,36,2307


In [1785]:
Train_cum = Train_cum.dropna(subset=['shop_profile'], axis=0)
Train_cum.drop("shop_profile", axis=1, inplace=True)

Submission_testing_cum = Submission_testing_cum.merge(StoreInfo, on='shop_id', how='left')
Submission_testing_cum = Submission_testing_cum.merge(sub_n_unique, on='shop_id', how='left')
Submission_testing_cum = Submission_testing_cum.merge(sub_n_count, on='shop_id', how='left')
Submission_testing_cum.rename(columns={'item_description_x':'unique_items', 'item_description_y':'order_count'}, inplace=True)
Submission_testing_cum.drop("shop_profile", axis=1, inplace=True)


In [1786]:
# new feature: total_sales per sq ft of the shop
Train_cum['sales_per_sq_ft'] = Train_cum['total_sales'] / Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_per_sq_ft'] = Submission_testing_cum['total_sales'] / Submission_testing_cum['shop_area_sq_ft']

# new feature : difference between the total_sales and total_sales_per_sq_ft
Train_cum['sales_minus_shop_area'] = Train_cum['total_sales'] - Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_minus_shop_area'] = Submission_testing_cum['total_sales'] - Submission_testing_cum['shop_area_sq_ft']

# new feature: addition of the total_sales and total_sales_per_sq_ft
Train_cum['sales_plus_shop_area'] = Train_cum['total_sales'] + Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_plus_shop_area'] = Submission_testing_cum['total_sales'] + Submission_testing_cum['shop_area_sq_ft']

# new feature : multiplication of the total_sales and total_sales_per_sq_ft
Train_cum['sales_times_shop_area'] = Train_cum['total_sales'] * Train_cum['shop_area_sq_ft']
Submission_testing_cum['sales_times_shop_area'] = Submission_testing_cum['total_sales'] * Submission_testing_cum['shop_area_sq_ft']

In [1787]:
# Remove rows with zero total_sales

Train_cum = Train_cum[Train_cum['total_sales'] != 0]
Submission_testing_cum = Submission_testing_cum[Submission_testing_cum['total_sales'] != 0]

Train_cum.drop("shop_area_sq_ft", axis=1, inplace=True)
Train_cum = Train_cum.merge(StoreInfo, on='shop_id', how='left')
Submission_testing_cum.drop("shop_area_sq_ft", axis=1, inplace=True)
Submission_testing_cum = Submission_testing_cum.merge(StoreInfo, on='shop_id', how='left')

In [1788]:
# import labelencoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [1789]:
y_new_train = Train_cum["shop_profile"]
Train_cum.drop("shop_profile", axis=1, inplace=True)
Submission_testing_cum.drop("shop_profile", axis=1, inplace=True)
y_new_train = le.fit_transform(y_new_train)

# Feature Scaling

In [1790]:
Submission_testing_cum.head()

Unnamed: 0,shop_id,total_sales,unique_items,order_count,sales_per_sq_ft,sales_minus_shop_area,sales_plus_shop_area,sales_times_shop_area,shop_area_sq_ft
0,2,3084455,36,4800,4842.158556,3083818,3085092,1964797835,637
1,19,2200580,32,3766,5354.209246,2200169,2200991,904438380,411
2,23,2570425,37,4103,4166.004862,2569808,2571042,1585952225,617
3,24,1308795,36,2883,1936.087278,1308119,1309471,884745420,676
4,29,1973805,36,3799,3306.20603,1973208,1974402,1178361585,597


In [1791]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

scaled_columns = ["unique_items", "order_count", "total_sales", "sales_per_sq_ft", "sales_minus_shop_area", "sales_plus_shop_area", "sales_times_shop_area", "shop_area_sq_ft"]
ct = ColumnTransformer([("MinMaxScaler", MinMaxScaler(), scaled_columns)], remainder="passthrough")


scaled_X_train = pd.DataFrame(ct.fit_transform(Train_cum), columns=scaled_columns+["shop_id"])
scaled_X_submission = pd.DataFrame(ct.transform(Submission_testing_cum), columns=scaled_columns+["shop_id"])


In [1792]:
scaled_X_train.head()

Unnamed: 0,unique_items,order_count,total_sales,sales_per_sq_ft,sales_minus_shop_area,sales_plus_shop_area,sales_times_shop_area,shop_area_sq_ft,shop_id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.487805,8
1,0.8,0.3366,0.17719,0.144192,0.17719,0.17719,0.17823,0.474968,112
2,1.0,0.34319,0.132518,0.09885,0.132509,0.132526,0.14532,0.54172,55
3,0.8,0.537006,0.294958,0.194743,0.294932,0.294984,0.36477,0.657253,3
4,0.8,0.348597,0.133124,0.101961,0.133118,0.13313,0.142239,0.521181,71


In [1793]:
# set scaled_X_train shop_id as categorical and other columns as numerical

category_columns = ["shop_id"]

for col in scaled_X_train.columns:
    if col not in category_columns:
        scaled_X_train[col] = scaled_X_train[col].astype("float")
        scaled_X_submission[col] = scaled_X_submission[col].astype("float")
    else:
        scaled_X_train[col] = scaled_X_train[col].astype("category")
        scaled_X_submission[col] = scaled_X_submission[col].astype("category")

In [1794]:
## Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_X_train, y_new_train, test_size = 0.2, random_state = 1)

# Metrics

In [1795]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder

def score_classification(model, df1_x, df1_y, df2_x, df2_y):
  print(model)
  df1_x = df1_x.copy()
  df2_x = df2_x.copy() 
  ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])], remainder='passthrough')

  # One hot encoding for the item_description column

  df1_encoded = ct2.fit_transform(df1_x[["shop_id"]])
  df1_x = pd.concat([df1_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df1_encoded.toarray())], axis='columns')

  df2_encoded = ct2.transform(df2_x[["shop_id"]])
  df2_x = pd.concat([df2_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df2_encoded.toarray())], axis='columns')

  df1_x.columns = df1_x.columns.astype(str)
  df2_x.columns = df2_x.columns.astype(str)
  # Fit the model  
  model.fit(df1_x, df1_y)

  # Make predictions
  y_pred = model.predict(df2_x)

  # Evaluate the model
  accuracy = accuracy_score(df2_y, y_pred)
  precision = precision_score(df2_y, y_pred,  average='macro')
  recall = recall_score(df2_y, y_pred,  average='macro')
  f1 = f1_score(df2_y, y_pred,  average='macro')

  print(f"Accuracy: {accuracy}")
  print(f"Precision: {precision}")
  print(f"Recall: {recall}")
  print(f"F1 Score: {f1}")

In [1705]:
def outputResult(model, df1_x, df1_y, test, le):
    print(model)
    df1_x = df1_x.copy()
    df2_x = test.copy()
   
    df1_x.columns = df1_x.columns.astype(str)
    df2_x.columns = df2_x.columns.astype(str)
    # Fit the model on the training data
    model.fit(df1_x, df1_y)
    
    # Make predictions on the test data
    y_pred = model.predict(df2_x)
    
    # Inverse transform the encoded predictions to the original shop profiles
    y_pred = le.inverse_transform(y_pred)    
    
    # Create a dataframe with the predicted shop profiles
    results_df = pd.DataFrame({'shop_id': test.shop_id, 'shop_profile': y_pred})    
    # rename the shop_id column with the word "SHOP" and add a 0 in front of the shop_id
    results_df["shop_id"] = results_df["shop_id"].astype(int).astype(str)
    results_df["shop_id"] = "SHOP" + results_df["shop_id"].str.zfill(3)
    # results_df.rename(columns={'shop_id': 'SHOP0' + results_df.shop_id.astype(str)}, inplace=True)
    results_df.drop_duplicates(inplace=True)
    # Write the dataframe to a CSV file
    results_df.to_csv('predictions'+str(model).strip("()")+'.csv', index=False)

# Modelling

In [1706]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 2 to 37
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   unique_items           80 non-null     object
 1   order_count            80 non-null     object
 2   total_sales            80 non-null     object
 3   sales_per_sq_ft        80 non-null     object
 4   sales_minus_shop_area  80 non-null     object
 5   sales_plus_shop_area   80 non-null     object
 6   sales_times_shop_area  80 non-null     object
 7   shop_area_sq_ft        80 non-null     object
 8   shop_id                80 non-null     object
dtypes: object(9)
memory usage: 6.2+ KB


In [1743]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression()

In [1796]:
score_classification(logr, X_train, y_train, X_test, y_test)

LogisticRegression()
Accuracy: 0.25
Precision: 0.17845117845117844
Recall: 0.3333333333333333
F1 Score: 0.2253968253968254


In [1745]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()


In [1746]:

score_classification(dtc, X_train, y_train, X_test, y_test)

DecisionTreeClassifier()
Accuracy: 0.4
Precision: 0.4047619047619047
Recall: 0.398989898989899
F1 Score: 0.3814814814814815


In [1747]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [1748]:
score_classification(rfc, X_train, y_train, X_test, y_test)

RandomForestClassifier()
Accuracy: 0.3
Precision: 0.3703703703703704
Recall: 0.2828282828282828
F1 Score: 0.273015873015873


In [1749]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [1750]:
score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier()
Accuracy: 0.35
Precision: 0.38888888888888884
Recall: 0.3383838383838384
F1 Score: 0.3035714285714286


In [1797]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np

# Define the parameter grid to search over
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10, 20, 30, 40, 50],
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

# Create a pipeline to preprocess the data and apply KNN
pipeline = Pipeline([ 
    ('knn', KNeighborsClassifier())
])

# Create a grid search object to find the best parameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)


# Fit the grid search object on the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best parameters:", grid_search.best_params_)


KeyboardInterrupt: 

In [1668]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

from sklearn.model_selection import cross_val_score

knn2 = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
print(scores)

[0.5   0.375 0.375 0.375 0.25  0.375 0.25  0.125 0.5   0.375]


In [1752]:
# Set the best parameters found by GridSearchCV
knn.set_params(**{'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'})

score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier(leaf_size=10, metric='euclidean', p=1, weights='distance')
Accuracy: 0.4
Precision: 0.4490740740740741
Recall: 0.44949494949494945
F1 Score: 0.37806637806637816


In [1753]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()


In [1754]:

score_classification(nb, X_train, y_train, X_test, y_test)

GaussianNB()
Accuracy: 0.7
Precision: 0.4468864468864469
Recall: 0.5252525252525252
F1 Score: 0.4829059829059828


In [1755]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, max_depth=5)

In [1674]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 2 to 37
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   unique_items           80 non-null     object
 1   order_count            80 non-null     object
 2   total_sales            80 non-null     object
 3   sales_per_sq_ft        80 non-null     object
 4   sales_minus_shop_area  80 non-null     object
 5   sales_plus_shop_area   80 non-null     object
 6   sales_times_shop_area  80 non-null     object
 7   shop_area_sq_ft        80 non-null     object
 8   shop_id                80 non-null     object
dtypes: object(9)
memory usage: 6.2+ KB


In [1756]:

score_classification(xgb_model, X_train, y_train, X_test, y_test)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
Accuracy: 0.3
Precision: 0.34656084656084657
Recall: 0.3383838383838384
F1 Score: 0.2982905982905983


In [1758]:
# Create a list of classifiers to compare
classifiers = [logr, dtc, knn, rfc, nb]

# Create X and y data

# Cross-validation
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


LogisticRegression
F1 Score: 0.49 (+/- 0.19)
DecisionTreeClassifier
F1 Score: 0.40 (+/- 0.20)
KNeighborsClassifier
F1 Score: 0.31 (+/- 0.20)
RandomForestClassifier
F1 Score: 0.43 (+/- 0.08)
GaussianNB
F1 Score: 0.53 (+/- 0.25)


In [1077]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 2 to 37
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_sales            80 non-null     float64
 1   sales_per_sq_ft        80 non-null     float64
 2   sales_minus_shop_area  80 non-null     float64
 3   sales_plus_shop_area   80 non-null     float64
 4   sales_times_shop_area  80 non-null     float64
 5   shop_area_sq_ft        80 non-null     float64
 6   shop_id                80 non-null     float64
dtypes: float64(7)
memory usage: 5.0 KB


# Feature Engineering

In [1798]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X,y):
  X = X.copy()
  for colname in X.select_dtypes(["object", "category"]):
    X[colname], _ = X[colname].factorize()
  # all discrete features should now have integer dtypes
  discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
  mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
  mi_scores = pd.Series(mi_scores, name="Mutual Information Scores", index=X.columns)
  mi_scores = mi_scores.sort_values(ascending=False)
  return mi_scores


def plot_mi_scores(scores):
  scores = scores.sort_values(ascending=True)
  width = np.arange(len(scores))
  ticks = list(scores.index)
  plt.barh(width,scores)
  plt.yticks(width, ticks)
  plt.title("Mututal Information Scores")

In [1803]:
mi_scores = make_mi_scores(X_train, y_train)
mi_scores

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [1080]:
from sklearn.decomposition import PCA

def apply_pca(X):
  pca= PCA()
  df1_x = X.copy()

  X_pca = pca.fit_transform(df1_x)
  component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
  X_pca = pd.DataFrame(X_pca, columns=component_names)
  #create loadings
  loadings = pd.DataFrame(
  pca.components_.T,
  columns = component_names,
  index=df1_x.columns,
  )
  return pca, X_pca, loadings

def plot_variance(pca, width=8, dpi=100):
  fig, axs = plt.subplots(1,2)
  n = pca.n_components_
  grid = np.arange(1, n+1)
  evr = pca.explained_variance_ratio_
  axs[0].bar(grid,evr)
  axs[0].set(
      xlabel="Component",title="% Explained Variance", ylim=(0.0,1.0)             )
  #Cumulative Variance
  cv = np.cumsum(evr)
  axs[1].plot(np.r_[0,grid], np.r_[0,cv], "o-")
  axs[1].set(xlabel="Component", title="% Cumulatve Variance", ylim=(0.0, 1.0))
  fig.set(figwidth=8, dpi=100)
  return axs

In [1802]:
X_train["shop_id"].astype("int").astype("category")

2      55
73      9
97    126
62    118
19     75
     ... 
75     45
9      12
72     10
12    115
37     49
Name: shop_id, Length: 80, dtype: category
Categories (80, int64): [3, 5, 6, 7, ..., 124, 125, 126, 127]

In [1082]:
pca, X_pca, loadings = apply_pca(X_train)
print(loadings)

                            PC1       PC2       PC3       PC4       PC5  \
total_sales            0.000341  0.460295 -0.021733 -0.098130 -0.333706   
sales_per_sq_ft        0.000078  0.419300  0.418745  0.732183  0.335779   
sales_minus_shop_area  0.000341  0.460292 -0.021621 -0.098202 -0.333685   
sales_plus_shop_area   0.000341  0.460298 -0.021846 -0.098058 -0.333726   
sales_times_shop_area  0.000543  0.433988 -0.364258 -0.376370  0.733018   
shop_area_sq_ft        0.000532 -0.015061 -0.830994  0.541631 -0.125927   
shop_id                1.000000 -0.000731  0.000630 -0.000041 -0.000016   

                                PC6           PC7  
total_sales           -4.155963e-01 -7.028132e-01  
sales_per_sq_ft       -1.163342e-15 -3.986797e-15  
sales_minus_shop_area  8.164564e-01 -8.517810e-03  
sales_plus_shop_area  -4.008473e-01  7.113234e-01  
sales_times_shop_area  4.006035e-16  3.514355e-16  
shop_area_sq_ft        1.640815e-04 -9.702942e-05  
shop_id                1.695615e-17

In [1804]:
X_train.drop(columns=['sales_plus_shop_area'], axis=1, inplace=True)
X_test.drop(columns=['sales_plus_shop_area'], axis=1, inplace=True)

In [1805]:
X_train.drop(columns=['sales_minus_shop_area'], axis=1, inplace=True)
X_test.drop(columns=['sales_minus_shop_area'], axis=1, inplace=True)


In [1806]:
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LogisticRegression
F1 Score: 0.47 (+/- 0.17)
DecisionTreeClassifier
F1 Score: 0.38 (+/- 0.25)
KNeighborsClassifier
F1 Score: 0.30 (+/- 0.22)
RandomForestClassifier
F1 Score: 0.40 (+/- 0.07)
GaussianNB
F1 Score: 0.52 (+/- 0.29)


In [1808]:
score_classification(dtc, X_train, y_train, X_test, y_test)

DecisionTreeClassifier()
Accuracy: 0.45
Precision: 0.4861111111111111
Recall: 0.45454545454545453
F1 Score: 0.43969102792632203


In [1809]:
X_train.drop(columns=['shop_area_sq_ft'], axis=1, inplace=True)
X_test.drop(columns=['shop_area_sq_ft'], axis=1, inplace=True)
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


LogisticRegression
F1 Score: 0.44 (+/- 0.24)
DecisionTreeClassifier
F1 Score: 0.40 (+/- 0.21)
KNeighborsClassifier
F1 Score: 0.30 (+/- 0.22)
RandomForestClassifier
F1 Score: 0.46 (+/- 0.20)
GaussianNB
F1 Score: 0.49 (+/- 0.23)


In [1810]:
score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier(leaf_size=10, metric='euclidean', p=1, weights='distance')
Accuracy: 0.4
Precision: 0.4490740740740741
Recall: 0.44949494949494945
F1 Score: 0.37806637806637816


In [1811]:
X_train.drop(columns=['order_count', 'unique_items'], axis=1, inplace=True)
X_test.drop(columns=['order_count', 'unique_items'], axis=1, inplace=True)
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LogisticRegression
F1 Score: 0.44 (+/- 0.14)
DecisionTreeClassifier
F1 Score: 0.40 (+/- 0.24)
KNeighborsClassifier
F1 Score: 0.31 (+/- 0.21)
RandomForestClassifier
F1 Score: 0.43 (+/- 0.24)
GaussianNB
F1 Score: 0.52 (+/- 0.29)


In [1812]:
score_classification(knn, X_train, y_train, X_test, y_test)

KNeighborsClassifier(leaf_size=10, metric='euclidean', p=1, weights='distance')
Accuracy: 0.65
Precision: 0.6222222222222222
Recall: 0.6565656565656566
F1 Score: 0.6349206349206349


In [49]:
outputResult(logr, X_train, y_train, scaled_X_submission, le)

LogisticRegression()
