# Preliminaries

In [1190]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [1191]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

warnings.filterwarnings('ignore')

In [1192]:
path = "../Datasets/"
Train = pd.read_csv(path + '/Historical-transaction-data.csv')
StoreInfo = pd.read_csv(path + '/Store-info.csv')
Testing = pd.read_csv(path + '/Testing-data.csv')

In [1193]:
Train = Train.merge(StoreInfo, on='shop_id', how='left')

In [1194]:
Testing.drop("shop_profile", axis=1, inplace=True)


In [1195]:
Submission_testing = Testing.copy()
Submission_testing = Submission_testing.merge(Train, on='shop_id', how='left')

In [1196]:
Submission_testing.drop("shop_profile", axis=1, inplace=True)

In [1197]:
# Train.to_csv('CombinedData.csv', index=False)
# Submission_testing.to_csv('SubmissionData.csv', index=False)

In [1198]:
redundant_cols = ['customer_id', 'transaction_date']

Train.drop(redundant_cols, axis=1, inplace=True)
Submission_testing.drop(redundant_cols, axis=1, inplace=True)

In [1199]:
features_nom = ['item_description',"shop_id", "shop_profile", "invoice_id"]

for feature in features_nom:
    Train[feature] = Train[feature].astype("category")
    if feature == "shop_profile":
        continue
    Submission_testing[feature] = Submission_testing[feature].astype("category")

# Preprocessing

In [1200]:
Train["shop_id"] = Train["shop_id"].str.replace("SHOP", "").astype(int).astype("category")
Submission_testing["shop_id"] = Submission_testing["shop_id"].str.replace("SHOP", "").astype(int).astype("category")
StoreInfo["shop_id"] = StoreInfo["shop_id"].str.replace("SHOP", "").astype(int).astype("category")

In [1201]:
#Remove rows with null values for item_description or shop_profile
Train = Train.dropna(subset=['shop_profile'], axis=0)

In [1202]:
from sklearn.compose import ColumnTransformer
#import simpleimputer
from sklearn.impute import SimpleImputer

Submission_testing['shop_profile'] = 'default_value'
ct = ColumnTransformer([("SimpleImputer", SimpleImputer(strategy="constant", fill_value="missing"), ["item_description"])], remainder="passthrough")

Train = pd.DataFrame(ct.fit_transform(Train), columns=Train.columns)

Submission_testing = pd.DataFrame(ct.transform(Submission_testing), columns=Train.columns)

In [1203]:
# Replace values in the "item_description" column of the Train dataframe
Train['item_description'] = Train['item_description'].str.lower()
Train.loc[Train['item_description'].str.contains('milk', case=False), 'item_description'] = 'MILK'
Train.loc[Train['item_description'].str.contains('water', case=False), 'item_description'] = 'WATER'
Train.loc[Train['item_description'].str.contains('missing', case=False), 'item_description'] = 'MISSING'
Train.loc[~Train['item_description'].str.contains('MILK|WATER|MISSING', case=False), 'item_description'] = 'DRINK'

# Replace values in the "item_description" column of the Submission_testing dataframe
Submission_testing['item_description'] = Submission_testing['item_description'].str.lower()
Submission_testing.loc[Submission_testing['item_description'].str.contains('milk', case=False), 'item_description'] = 'MILK'
Submission_testing.loc[Submission_testing['item_description'].str.contains('water', case=False), 'item_description'] = 'WATER'
Submission_testing.loc[Submission_testing['item_description'].str.contains('missing', case=False), 'item_description'] = 'MISSING'
Submission_testing.loc[~Submission_testing['item_description'].str.contains('MILK|WATER|MISSING', case=False), 'item_description'] = 'DRINK'

In [1204]:
Train.drop_duplicates(inplace=True)
Submission_testing.drop_duplicates(inplace=True)

In [1205]:
# drop invoice_id column from both dataframes
Train.drop("invoice_id", axis=1, inplace=True)
Submission_testing.drop(["invoice_id", "shop_profile"], axis=1, inplace=True)

In [1206]:
category_columns = ["shop_id", "item_description", "shop_profile"]

for col in Train.columns:
    if col not in category_columns:
        Train[col] = Train[col].astype("int64")
        Submission_testing[col] = Submission_testing[col].astype("int64")
    else:
        Train[col] = Train[col].astype("category")
        if col == "shop_profile":
            continue
        Submission_testing[col] = Submission_testing[col].astype("category")

In [1207]:
# suppose x1 and x3 column should have a minimum of zero, define the acceptable ranges for each column

ranges = {'quantity_sold': (0, np.inf)}

# loop over each column and adjust the values outside the acceptable range

for col, (min_val, max_val) in ranges.items():
    Train[col] = np.clip(Train[col], min_val, max_val)
    Submission_testing[col] = np.clip(Submission_testing[col], min_val, max_val)

In [1208]:
def outlier_thresholds(dataframe, column_name, q1=0.05, q2=0.95):
    quartile1 = dataframe[column_name].quantile(0.05)
    quartile3 = dataframe[column_name].quantile(0.95)
    IQR = quartile3 - quartile1
    up_limit = quartile3 + 1.5*IQR
    low_limit = quartile1 - 1.5*IQR
    return low_limit, up_limit

def check_outlier(dataframe, column_name):
    low_limit, up_limit = outlier_thresholds(dataframe, column_name)
    if dataframe[(dataframe[column_name] < low_limit) | (dataframe[column_name] > up_limit)].any(axis=None):
        return True
    else:
        return False
    
def replace_with_thresholds(dataframe, column_name):
    low_limit, up_limit = outlier_thresholds(dataframe, column_name)
    dataframe.loc[(dataframe[column_name] < low_limit), column_name] = low_limit
    dataframe.loc[(dataframe[column_name] > up_limit), column_name] = up_limit
    
def remove_outliers(df):
    numeric_columns = df.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns
    for col in numeric_columns:
        print(col, check_outlier(df, col))
        if check_outlier(df, col):
            replace_with_thresholds(df, col)

In [1209]:
Train.kurt(numeric_only=True)

item_price         363.995483
quantity_sold      374.858178
shop_area_sq_ft      2.973588
dtype: float64

In [1210]:
Train.describe()

Unnamed: 0,item_price,quantity_sold,shop_area_sq_ft
count,351762.0,351762.0,351762.0
mean,209.261958,1.945378,632.962836
std,169.497265,1.667743,123.672885
min,35.0,0.0,298.0
25%,100.0,1.0,605.0
50%,220.0,2.0,617.0
75%,220.0,2.0,676.0
max,17400.0,101.0,1077.0


In [1211]:
remove_outliers(Train)
remove_outliers(Submission_testing)

item_price True
quantity_sold True
shop_area_sq_ft False
item_price True
quantity_sold True
shop_area_sq_ft False


In [1212]:
Train.kurt(numeric_only=True)

item_price         9.556541
quantity_sold      8.931670
shop_area_sq_ft    2.973588
dtype: float64

In [1213]:
Train.describe()

Unnamed: 0,item_price,quantity_sold,shop_area_sq_ft
count,351762.0,351762.0,351762.0
mean,207.340482,1.902592,632.962836
std,148.884035,1.140813,123.672885
min,35.0,0.0,298.0
25%,100.0,1.0,605.0
50%,220.0,2.0,617.0
75%,220.0,2.0,676.0
max,1102.5,8.5,1077.0


In [1214]:
# Create price-related features
Train['total_sales']= Train['item_price'] * Train['quantity_sold']
Submission_testing['total_sales']= Submission_testing['item_price'] * Submission_testing['quantity_sold']

In [1215]:
# frequency encode the item_description column

item_description_freq = Train.groupby('item_description').size()/len(Train)
# mapping the encoded values to the Train and Submission_testing dataframes
Train['item_description_freq'] = Train['item_description'].map(item_description_freq)
Submission_testing['item_description_freq'] = Submission_testing['item_description'].map(item_description_freq)

In [1216]:
# drop the item_description column and month column from both dataframes
Train.drop(["item_description"], axis=1, inplace=True)
Submission_testing.drop(["item_description"], axis=1, inplace=True)

In [1217]:
Train.head(100)

Unnamed: 0,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,total_sales,item_description_freq
0,8,220.0,2.0,678,Moderate,440.0,0.846726
1,112,220.0,2.0,668,Moderate,440.0,0.846726
2,8,160.0,2.0,678,Moderate,320.0,0.846726
3,8,150.0,2.0,678,Moderate,300.0,0.846726
4,112,210.0,5.0,668,Moderate,1050.0,0.012887
...,...,...,...,...,...,...,...
109,90,70.0,1.0,730,Moderate,70.0,0.082001
111,47,60.0,1.0,528,Moderate,60.0,0.846726
112,47,220.0,2.0,528,Moderate,440.0,0.846726
113,22,60.0,2.0,735,High,120.0,0.846726


In [1218]:
# create a new feature for the fraction of  entries by each shop_id
train_shop_id_counts = Train['shop_id'].value_counts(normalize=True)
Train['shop_id_counts'] = Train['shop_id'].map(train_shop_id_counts)

submit_shop_id_counts = Submission_testing['shop_id'].value_counts(normalize=True)
Submission_testing['shop_id_counts'] = Submission_testing['shop_id'].map(submit_shop_id_counts)

In [1219]:
# create a feature for the mean item_price by each shop_id

train_shop_price_mean = Train.groupby('shop_id')['item_price'].mean()
Train['shop_price_mean'] = Train['shop_id'].map(train_shop_price_mean)

submit_shop_price_mean = Submission_testing.groupby('shop_id')['item_price'].mean()
Submission_testing['shop_price_mean'] = Submission_testing['shop_id'].map(submit_shop_price_mean)

In [1220]:
Train.to_csv("TrainPP.csv", index=False)
Submission_testing.to_csv("Submission_testingPP.csv", index=False)

# Aggregation

In [1221]:
Train.head()

Unnamed: 0,shop_id,item_price,quantity_sold,shop_area_sq_ft,shop_profile,total_sales,item_description_freq,shop_id_counts,shop_price_mean
0,8,220.0,2.0,678,Moderate,440.0,0.846726,0.000685,211.286307
1,112,220.0,2.0,668,Moderate,440.0,0.846726,0.006249,204.955641
2,8,160.0,2.0,678,Moderate,320.0,0.846726,0.000685,211.286307
3,8,150.0,2.0,678,Moderate,300.0,0.846726,0.000685,211.286307
4,112,210.0,5.0,668,Moderate,1050.0,0.012887,0.006249,204.955641


In [1222]:
# Aggregate X_train_scaled by shop_id and add all the total_sales values and prevent empty values
# Also get the mean of the item_description_freq column
Train["item_description_freq"] = Train["item_description_freq"].astype("float64")
Submission_testing["item_description_freq"] = Submission_testing["item_description_freq"].astype("float64")
Train_cum = Train.groupby(['shop_id'], sort=False).agg({'total_sales':'sum', 'item_description_freq':'mean'})
Submission_testing_cum = Submission_testing.groupby(['shop_id'], sort='False').agg({'total_sales':'sum', 'item_description_freq':'mean'})

In [1223]:
Train_cum = Train_cum.merge(StoreInfo, on='shop_id', how='left')
Train_cum['shop_id_counts'] = Train_cum['shop_id'].map(train_shop_id_counts)
Train_cum['shop_price_mean'] = Train_cum['shop_id'].map(train_shop_price_mean)
Train_cum['shop_price_mean'] = Train_cum['shop_price_mean'].astype("float64")

Submission_testing_cum = Submission_testing_cum.merge(StoreInfo, on='shop_id', how='left')
Submission_testing_cum['shop_id_counts'] = Submission_testing_cum['shop_id'].map(submit_shop_id_counts)
Submission_testing_cum['shop_price_mean'] = Submission_testing_cum['shop_id'].map(submit_shop_price_mean)
Submission_testing_cum['shop_price_mean'] = Submission_testing_cum['shop_price_mean'].astype("float64")
Submission_testing_cum.drop(["shop_profile"], axis=1, inplace=True)

In [1224]:
# # new feature: total_sales per sq ft of the shop
# Train_cum['sales_per_sq_ft'] = Train_cum['total_sales'] / Train_cum['shop_area_sq_ft']
# Submission_testing_cum['sales_per_sq_ft'] = Submission_testing_cum['total_sales'] / Submission_testing_cum['shop_area_sq_ft']

# # new feature : difference between the total_sales and total_sales_per_sq_ft
# Train_cum['sales_minus_shop_area'] = Train_cum['total_sales'] - Train_cum['shop_area_sq_ft']
# Submission_testing_cum['sales_minus_shop_area'] = Submission_testing_cum['total_sales'] - Submission_testing_cum['shop_area_sq_ft']

# # new feature: addition of the total_sales and total_sales_per_sq_ft
# Train_cum['sales_plus_shop_area'] = Train_cum['total_sales'] + Train_cum['shop_area_sq_ft']
# Submission_testing_cum['sales_plus_shop_area'] = Submission_testing_cum['total_sales'] + Submission_testing_cum['shop_area_sq_ft']

# # new feature : multiplication of the total_sales and total_sales_per_sq_ft
# Train_cum['sales_times_shop_area'] = Train_cum['total_sales'] * Train_cum['shop_area_sq_ft']
# Submission_testing_cum['sales_times_shop_area'] = Submission_testing_cum['total_sales'] * Submission_testing_cum['shop_area_sq_ft']

In [1225]:
y_train = Train_cum["shop_profile"]
Train_cum.drop("shop_profile", axis=1, inplace=True)


In [1226]:
Train_cum.describe()

Unnamed: 0,total_sales,item_description_freq,shop_area_sq_ft,shop_price_mean
count,100.0,100.0,100.0,100.0
mean,1720947.0,0.727512,628.29,207.70522
std,553557.5,0.029402,128.999201,15.787755
min,116017.5,0.572841,298.0,149.130647
25%,1364278.0,0.714207,573.25,197.221343
50%,1699167.0,0.730701,617.0,207.379768
75%,2089088.0,0.746506,676.0,215.904579
max,3503605.0,0.777821,1077.0,260.227509


In [1227]:
Submission_testing_cum.describe()

Unnamed: 0,total_sales,item_description_freq,shop_area_sq_ft,shop_price_mean
count,24.0,24.0,24.0,24.0
mean,1743589.0,0.731226,585.416667,217.52951
std,465063.1,0.034623,112.998236,14.337272
min,970830.0,0.629791,310.0,194.935012
25%,1308815.0,0.715052,529.25,209.509526
50%,1722632.0,0.736972,607.0,215.814187
75%,2077289.0,0.752665,676.0,228.69399
max,2636980.0,0.779407,774.0,241.754298


# Feature Scaling

In [1228]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

scaled_columns = ["total_sales", "item_description_freq", "shop_area_sq_ft", "shop_id_counts" , "shop_price_mean"]
ct = ColumnTransformer([("MinMaxScaler", MinMaxScaler(feature_range=(0, 100)), scaled_columns)], remainder="passthrough")

scaled_X_train = pd.DataFrame(ct.fit_transform(Train_cum), columns=Train_cum.columns)
scaled_X_submission = pd.DataFrame(ct.transform(Submission_testing_cum), columns=Train_cum.columns)

In [1229]:
# set scaled_X_train shop_id as categorical and other columns as numerical

category_columns = ["shop_id"]

for column in category_columns:
    scaled_X_train[column] = scaled_X_train[column].astype("category")
    scaled_X_submission[column] = scaled_X_submission[column].astype("category")

In [1230]:
scaled_X_train.describe()

Unnamed: 0,total_sales,item_description_freq,shop_area_sq_ft,shop_id_counts,shop_price_mean
count,100.0,100.0,100.0,100.0,100.0
mean,75.456409,42.39923,56.994608,52.723877,63.41
std,14.343828,16.559589,18.094525,14.210802,37.976094
min,0.0,0.0,0.0,0.0,1.0
25%,68.965602,35.333761,46.160202,43.287178,31.75
50%,77.01228,40.949936,56.435902,52.430932,62.5
75%,84.722938,48.523748,67.246478,60.104247,95.75
max,100.0,100.0,100.0,100.0,127.0


In [1231]:
scaled_X_submission.describe()

Unnamed: 0,total_sales,item_description_freq,shop_area_sq_ft,shop_id_counts,shop_price_mean
count,24.0,24.0,24.0,24.0,24.0
mean,77.268282,36.895593,250.752305,61.566872,62.666667
std,16.891068,14.50555,66.01839,12.9052,32.095871
min,27.783031,1.540436,134.800148,41.229215,2.0
25%,69.377643,29.685494,211.893757,54.347961,36.0
50%,80.071591,39.666239,257.093212,60.022883,65.5
75%,87.727349,48.523748,281.409131,71.616193,88.25
max,100.773547,61.103979,396.123959,83.371978,114.0


In [1232]:
## Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_X_train, y_train, test_size = 0.2, random_state = 1)

In [1233]:
# import labelencoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [1234]:
# y_train = pd.DataFrame(y_train)
# y_test = pd.DataFrame(y_test)

# Metrics

In [1245]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder

def score_classification(model, df1_x, df1_y, df2_x, df2_y):
  print(model)
  df1_x = df1_x.copy()
  df2_x = df2_x.copy() 
  ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])], remainder='passthrough')

  # One hot encoding for the item_description column

  df1_encoded = ct2.fit_transform(df1_x[["shop_id"]])
  df1_x = pd.concat([df1_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df1_encoded.toarray())], axis='columns')

  df2_encoded = ct2.transform(df2_x[["shop_id"]])
  df2_x = pd.concat([df2_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df2_encoded.toarray())], axis='columns')

  # remove the dummy variable trap
  df1_x = df1_x.drop([0], axis=1)
  df2_x = df2_x.drop([0], axis=1)
  
  df1_x.columns = df1_x.columns.astype(str)
  df2_x.columns = df2_x.columns.astype(str)
  # Fit the model  
  model.fit(df1_x, df1_y)

  # Make predictions
  y_pred = model.predict(df2_x)
  print(df2_y)
  print(y_pred)
  # Evaluate the model
  # accuracy = accuracy_score(df2_y, y_pred)
  # precision = precision_score(df2_y, y_pred,  average='macro')
  # recall = recall_score(df2_y, y_pred,  average='macro')
  # f1 = f1_score(df2_y, y_pred,  average='macro')

  # print(f"Accuracy: {accuracy}")
  # print(f"Precision: {precision}")
  # print(f"Recall: {recall}")
  # print(f"F1 Score: {f1}")

In [1236]:
def outputResult(model, df1_x, df1_y, test, le):
    print(model)
    df1_x = df1_x.copy()
    df2_x = test.copy()
    
    ct2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])], remainder='passthrough')

    # One hot encoding for the item_description column

    df1_encoded = ct2.fit_transform(df1_x[["shop_id"]])
    df1_x = pd.concat([df1_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df1_encoded.toarray())], axis='columns')

    df2_encoded = ct2.transform(df2_x[["shop_id"]])
    df2_x = pd.concat([df2_x.drop("shop_id", axis=1).reset_index(drop=True), pd.DataFrame(df2_encoded.toarray())], axis='columns')

  # Drop the first column to avoid dummy variable trap
    df1_x = df1_x.drop([0], axis=1)
    df2_x = df2_x.drop([0], axis=1)

    # remove the dummy variable trap
    df1_x = df1_x.drop([0], axis=1)
    df2_x = df2_x.drop([0], axis=1)
  
    # Fit the model on the training data
    model.fit(df1_x, df1_y)
    
    # Make predictions on the test data
    y_pred = model.predict(df2_x)
    
    # Inverse transform the encoded predictions to the original shop profiles
    y_pred = le.inverse_transform(y_pred)    
    
    # Create a dataframe with the predicted shop profiles
    results_df = pd.DataFrame({'shop_id': test.shop_id, 'shop_profile': y_pred})    
    # rename the shop_id column with the word "SHOP" and add a 0 in front of the shop_id
    results_df["shop_id"] = results_df["shop_id"].astype(int).astype(str)
    results_df["shop_id"] = "SHOP" + results_df["shop_id"].str.zfill(3)

    # results_df.drop_duplicates(inplace=True)
    
    # Access the global testing dataframe
    results_final = pd.merge(Testing, results_df, on="shop_id", how="left")
    # Write the dataframe to a CSV file
    results_final.to_csv('predictions'+str(model).strip("()")+'.csv', index=False)

# Modelling

In [1237]:
from sklearn.linear_model import LogisticRegression

logr = LogisticRegression()

In [1242]:
score_classification(logr, X_train, y_train, X_test, y_test)

LogisticRegression()


ValueError: Mix of label input types (string and number)

In [None]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()


In [None]:

score_classification(dtc, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()

In [None]:
# Define the hyperparameters to test
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}




In [None]:

# # Use grid search cross-validation to find the best hyperparameters
# grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and the corresponding accuracy score
# print("Best Hyperparameters: ", grid_search.best_params_)
# print("Best Accuracy Score: ", grid_search.best_score_)

In [None]:
rfc.set_params(max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100)

In [None]:
score_classification(rfc, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1)

In [None]:
score_classification(svm, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=1)

In [None]:
score_classification(gbm, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

In [None]:
score_classification(knn, X_train, y_train, X_test, y_test)

In [None]:

from sklearn.pipeline import Pipeline
import numpy as np

# Define the parameter grid to search over
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': [10, 20, 30, 40, 50],
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

# Create a pipeline to preprocess the data and apply KNN
pipeline = Pipeline([ 
    ('knn', KNeighborsClassifier())
])

# Create a grid search object to find the best parameters
grid_search_knn = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)



In [None]:
# # Fit the grid search object on the data
# grid_search_knn.fit(X_train, y_train)

# # Print the best parameters found
# print("Best parameters:", grid_search_knn.best_params_)


In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

from sklearn.model_selection import cross_val_score

knn2 = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='f1_macro')
print(scores)

In [None]:
# Set the best parameters found by GridSearchCV
knn.set_params(**{'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'})

score_classification(knn, X_train, y_train, X_test, y_test)

In [None]:
scores = cross_val_score(rfc, X_train, y_train, cv=10, scoring='accuracy')
print(scores)

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()


In [None]:

score_classification(nb, X_train, y_train, X_test, y_test)

In [None]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, max_depth=5)

In [None]:
score_classification(xgb_model, X_train, y_train, X_test, y_test)

In [None]:
X_train.info()

In [None]:

# score_classification(xgb_model, X_train, y_train, X_test, y_test)

In [None]:
# Create a list of classifiers to compare
classifiers = [logr, dtc, knn, rfc, nb, svm, gbm]

# Create X and y data

# Cross-validation
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


In [None]:
X_train.head()

# Feature Engineering

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X,y):
  X = X.copy()
  for colname in X.select_dtypes(["object", "category"]):
    X[colname], _ = X[colname].factorize()
  # all discrete features should now have integer dtypes
  discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
  mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
  mi_scores = pd.Series(mi_scores, name="Mutual Information Scores", index=X.columns)
  mi_scores = mi_scores.sort_values(ascending=False)
  return mi_scores


def plot_mi_scores(scores):
  scores = scores.sort_values(ascending=True)
  width = np.arange(len(scores))
  ticks = list(scores.index)
  plt.barh(width,scores)
  plt.yticks(width, ticks)
  plt.title("Mututal Information Scores")

In [None]:
# no of null values in each column
# print(y_train.isnull().sum())

In [None]:
# y_train.shape

In [None]:
# mi_scores = make_mi_scores(X_train, y_train)
# mi_scores

In [None]:
from sklearn.decomposition import PCA

def apply_pca(X):
  pca= PCA()
  df1_x = X.copy()

  X_pca = pca.fit_transform(df1_x)
  component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
  X_pca = pd.DataFrame(X_pca, columns=component_names)
  #create loadings
  loadings = pd.DataFrame(
  pca.components_.T,
  columns = component_names,
  index=df1_x.columns,
  )
  return pca, X_pca, loadings

def plot_variance(pca, width=8, dpi=100):
  fig, axs = plt.subplots(1,2)
  n = pca.n_components_
  grid = np.arange(1, n+1)
  evr = pca.explained_variance_ratio_
  axs[0].bar(grid,evr)
  axs[0].set(
      xlabel="Component",title="% Explained Variance", ylim=(0.0,1.0)             )
  #Cumulative Variance
  cv = np.cumsum(evr)
  axs[1].plot(np.r_[0,grid], np.r_[0,cv], "o-")
  axs[1].set(xlabel="Component", title="% Cumulatve Variance", ylim=(0.0, 1.0))
  fig.set(figwidth=8, dpi=100)
  return axs

In [None]:
X_train["shop_id"].astype("int").astype("category")

In [None]:
 # plot a pairplot for all the features
y_train_df = pd.DataFrame(y_train, columns=['target'])
# X_train.index = range(len(X_train))
# combine X_train and y_train into a single dataframe
df_concat = pd.concat([X_train, y_train_df], axis=1)

In [None]:

# create the pairplot
# sns.pairplot(df_concat, hue='target')

In [None]:
from sklearn.cluster import KMeans

def cluster_labels(df, features, n_clusters=6):
  X = df.copy()
  X_new = X.loc[:, features]
  kmeans = KMeans(n_clusters=n_clusters, n_init=100, random_state=0)
  X_new["Cluster"] = kmeans.fit_predict(X_new)
  X_new["Cluster"] = X_new.Cluster.astype("category")
  return X_new

In [None]:
def plot_cluster(X, y):
  Xy = X.copy()
  Xy["y"] = y
  Xy.head()
  sns.relplot(
      x="value", y="y", hue="Cluster", col="variable",
      height=4, aspect=1, facet_kws={'sharex':False}, col_wrap=3,
      data=Xy.melt(
          value_vars=features, id_vars=["y", "Cluster"]
      )
  )

In [None]:
features = X_train.columns
cluster_df1_x = cluster_labels(X_train, features, n_clusters=3)
cluster_df2_x = cluster_labels(X_test, features, n_clusters=3)
# cluster_df3_x = cluster_labels(Submission_testing, features, n_clusters=3)
cluster_df1_x.head()

In [None]:
plot_cluster(cluster_df1_x, y_train)

In [None]:
X_train.info()

In [None]:
score_classification(rfc, cluster_df1_x, y_train, cluster_df2_x, y_test)

In [None]:
# cluster_df1_x.info()

In [None]:
# cluster_df1_x.drop(['shop_area_sq_ft'], axis=1, inplace=True)
# cluster_df2_x.drop(['shop_area_sq_ft'], axis=1, inplace=True)

In [None]:
# cluster_df1_x.drop(columns=['sales_plus_shop_area', 'sales_minus_shop_area', 'unique_items'], axis=1, inplace=True)
# cluster_df2_x.drop(columns=['sales_plus_shop_area', 'sales_minus_shop_area', 'unique_items'], axis=1, inplace=True)

In [None]:
score_classification(knn, cluster_df1_x, y_train, cluster_df2_x, y_test)

In [None]:
# pca, X_pca, loadings = apply_pca(X_train)
# print(loadings)

In [None]:
score_classification(gbm, X_train, y_train, X_test, y_test)

In [None]:
X_train.drop(columns=['sales_plus_shop_area'], axis=1, inplace=True)
X_test.drop(columns=['sales_plus_shop_area'], axis=1, inplace=True)

In [None]:
score_classification(gbm, X_train, y_train, X_test, y_test)

In [None]:
X_train.drop(columns=['sales_minus_shop_area'], axis=1, inplace=True)
X_test.drop(columns=['sales_minus_shop_area'], axis=1, inplace=True)


In [None]:
score_classification(dtc, X_train, y_train, X_test, y_test)

In [None]:
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
score_classification(dtc, X_train, y_train, X_test, y_test)

In [None]:
X_train.info()

In [None]:
X_train.drop(columns=['unique_items', 'order_count'], axis=1, inplace=True)
X_test.drop(columns=['unique_items', 'order_count'], axis=1, inplace=True)

In [None]:
score_classification(dtc, X_train, y_train, X_test, y_test)

In [None]:
score_classification(rfc, X_train, y_train, X_test, y_test)

In [None]:
scaled_X_submission = scaled_X_submission[X_train.columns]
outputResult(dtc, X_train, y_train, scaled_X_submission, le)

In [None]:
score_classification(rfc, X_train, y_train, X_test, y_test)

In [None]:
X_train.drop(columns=['shop_area_sq_ft'], axis=1, inplace=True)
X_test.drop(columns=['shop_area_sq_ft'], axis=1, inplace=True)
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


In [None]:
score_classification(knn, X_train, y_train, X_test, y_test)

In [None]:
X_train.drop(columns=['order_count', 'unique_items'], axis=1, inplace=True)
X_test.drop(columns=['order_count', 'unique_items'], axis=1, inplace=True)


In [None]:
for clf in classifiers:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=lambda clf, X, y: f1_score(y, clf.predict(X), average='macro'))
    print(clf.__class__.__name__)
    print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# scaled_X_submission = scaled_X_submission[X_train.columns]
# outputResult(gbm, X_train, y_train, scaled_X_submission, le)

In [None]:
# Define the hyperparameters to test
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}



In [None]:
# Use grid search cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding accuracy score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

In [None]:
# rfc.set_params(**grid_search.best_params_)

In [None]:
score_classification(rfc, X_train, y_train, X_test, y_test)

In [None]:
# Fit the grid search object on the data
# grid_search_knn.fit(X_train, y_train)


In [None]:

# Print the best parameters found
# print("Best parameters:", grid_search_knn.best_params_)

In [None]:
# knn.set_params(**{'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 11, 'p': 1, 'weights': 'uniform'})
knn.set_params(**{'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'})


In [None]:
X_train.head()

In [None]:
score_classification(dtc, X_train, y_train, X_test, y_test)

In [None]:
score_classification(knn, X_train, y_train, X_test, y_test)

In [None]:
score_classification(nb, X_train, y_train, X_test, y_test)

In [None]:
score_classification(dtc, X_train, y_train, X_test, y_test)

In [None]:
scaled_X_submission = scaled_X_submission[X_train.columns]

In [None]:
outputResult(knn, X_train, y_train, scaled_X_submission, le)