##### This Juptyer notebook is an adaptation this Collab Notebook [here](https://colab.research.google.com/drive/16bHT5fOLXCO-MAIIqGdDUh7QxGirIf0x?usp=sharing)
##### The dataset originally coming from [here](https://www.kaggle.com/datasets/vivek468/superstore-dataset-final)

In [1]:
import pandas as pd
import numpy as np

In [6]:
superstore_data =  pd.read_excel('data/Superstore/Superstore.xlsx')

In [7]:
na_count = superstore_data.isna().sum()
print(na_count)

Row ID           0
Order ID         0
Order Date       0
Ship Date        0
Ship Mode        0
Customer ID      0
Customer Name    0
Segment          0
Country          0
City             0
State            0
Postal Code      0
Region           0
Product ID       0
Category         0
Sub-Category     0
Product Name     0
Sales            0
Quantity         0
Discount         0
Profit           0
dtype: int64


In [8]:
data_columns = superstore_data.columns
data_columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [9]:
superstore_data['Discount_Rate'] = superstore_data['Discount'] / superstore_data['Sales']
superstore_data['Discount_Rate']

0       0.000000
1       0.000000
2       0.000000
3       0.000470
4       0.008941
          ...   
9989    0.007921
9990    0.000000
9991    0.000773
9992    0.000000
9993    0.000000
Name: Discount_Rate, Length: 9994, dtype: float64

In [12]:
# randomly take 125 data from the dataset
# 100 for training and 25 for testing
sample_data = superstore_data.sample(n=125, random_state=1)
sample_data

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Discount_Rate
1814,1815,CA-2012-131597,2012-09-14,2012-09-18,Standard Class,SP-20620,Stefania Perrino,Corporate,United States,Los Angeles,...,West,FUR-TA-10002607,Furniture,Tables,KI Conference Tables,170.136,3,0.2,-8.5068,0.001176
881,882,CA-2012-129098,2012-10-09,2012-10-13,Standard Class,DK-13090,Dave Kipp,Consumer,United States,Springfield,...,South,OFF-ST-10001321,Office Supplies,Storage,"Decoflex Hanging Personal Folder File, Blue",30.840,2,0.0,8.3268,0.000000
1122,1123,US-2011-147627,2011-01-21,2011-01-27,Standard Class,HL-15040,Hunter Lopez,Consumer,United States,Jonesboro,...,South,OFF-AR-10002375,Office Supplies,Art,Newell 351,22.960,7,0.0,6.6584,0.000000
6807,6808,CA-2012-128125,2012-03-31,2012-04-05,Standard Class,EB-13705,Ed Braxton,Corporate,United States,Houston,...,Central,OFF-PA-10000357,Office Supplies,Paper,"White Dual Perf Computer Printout Paper, 2700 ...",98.376,3,0.2,35.6613,0.002033
2911,2912,CA-2011-113929,2011-06-16,2011-06-21,Standard Class,CK-12205,Chloris Kastensmidt,Consumer,United States,Hempstead,...,East,OFF-EN-10003286,Office Supplies,Envelopes,Staples,41.400,5,0.0,19.4580,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6516,6517,US-2014-116652,2014-09-16,2014-09-20,Standard Class,RD-19480,Rick Duston,Consumer,United States,San Francisco,...,West,FUR-FU-10001488,Furniture,Furnishings,"Tenex 46"" x 60"" Computer Anti-Static Chairmat,...",529.900,5,0.0,105.9800,0.000000
1463,1464,CA-2013-152289,2013-08-27,2013-08-29,First Class,LC-16930,Linda Cazamias,Corporate,United States,Pasadena,...,Central,FUR-CH-10002126,Furniture,Chairs,Hon Deluxe Fabric Upholstered Stacking Chairs,1024.716,6,0.3,-29.2776,0.000293
254,255,US-2012-159982,2012-11-28,2012-12-04,Standard Class,DR-12880,Dan Reichenbach,Corporate,United States,Chicago,...,Central,FUR-FU-10002505,Furniture,Furnishings,Eldon 100 Class Desk Accessories,12.132,9,0.6,-8.4924,0.049456
8391,8392,CA-2014-110625,2014-12-24,2014-12-31,Standard Class,JB-16045,Julia Barnett,Home Office,United States,Danbury,...,East,FUR-FU-10001473,Furniture,Furnishings,DAX Wood Document Frame,27.460,2,0.0,9.8856,0.000000


In [53]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import json
import os
metric_store = {}

def metric(model_name,y_true, y_pred, store:bool=True, ) -> dict:
    if store:
    metrics_dict = {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mean_squared_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred)
    }


    
      

        with open('metrics.json', file_mode) as file:
            if file_mode == 'a':
                file.write('\n')
            json.dump(metrics_dict, file, indent=4,separators=(',', ': '))
            
        
    return metrics_dict

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

categorical_cols = ['Ship Mode', 'Segment', 'Country', 'City', 'State', 'Region', 'Category', 'Sub-Category']
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(sample_data[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))

# Combine the data
data = pd.concat([sample_data.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
data.drop(columns=categorical_cols + ['Order ID', 'Order Date', 'Ship Date', 'Customer ID', 'Customer Name', 'Product ID', 'Product Name'], inplace=True)

# Define features and target
X = data.drop(columns=['Profit'])
Y = data['Profit']



In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [54]:
from sklearn.neighbors import KNeighborsRegressor
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, Y_train)

# Make predictions
Y_pred = knn_model.predict(X_test)
knn_model_metrics = metric('KNN', Y_test, Y_pred)
knn_model_metrics

{'Model': 'KNN',
 'MAE': 38.65377625,
 'MSE': 5282.3167964698005,
 'R2 Score': -1.122079393610714}

In [55]:
from sklearn.svm import SVR
svm_linear_model = SVR(kernel='linear')
svm_linear_model.fit(X_train, Y_train)

# Make predictions
Y_pred = svm_linear_model.predict(X_test)
svm_linear_model = metric('SVM Linear', Y_test, Y_pred)
svm_linear_model

{'Model': 'SVM Linear',
 'MAE': 23.02001661255968,
 'MSE': 1500.0550697973836,
 'R2 Score': 0.3973788253242059}