In [1]:
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsRegressor
import pickle

# Read the raw records from the CSV file in the working directory.
ecomm = pd.read_csv("Train.csv")

# Normalise the header in one step: the first column is labelled 'ID', every
# column in between keeps its name but lower-cased, and the last column is
# labelled 'arrival'.
cols = ['ID', *(name.lower() for name in ecomm.columns[1:-1]), 'arrival']
ecomm.columns = cols

# Data preprocessing
# Encode gender numerically (F -> 0, M -> 1); .map turns any other value into NaN.
ecomm['gender'] = ecomm.gender.map({'F':0, 'M':1})
# Collapse the rating to a binary flag: ratings 2-5 become 0, rating 1 becomes 1.
ecomm['customer_rating'] = ecomm['customer_rating'].map({5:0, 4:0, 3:0, 2:0, 1:1})

# One-hot encode the categorical columns.
dummy = pd.get_dummies(ecomm[['warehouse_block', 'mode_of_shipment', 'product_importance']])

# Standardise the continuous columns. scale() returns a bare ndarray, so the
# frame is rebuilt with ecomm's own index: pd.concat(axis=1) aligns rows by
# index label, and a fresh RangeIndex here would silently misalign (and
# NaN-pad) rows whenever ecomm's index is not the default 0..n-1.
# NOTE: scale() uses the mean/std of every row in ecomm, so any hold-out split
# made afterwards is evaluated on features scaled with hold-out statistics.
numeric_cols = ['cost_of_the_product', 'weight_in_gms', 'discount_offered']
ecomm1 = pd.DataFrame(scale(ecomm[numeric_cols]),
                      columns=numeric_cols,
                      index=ecomm.index)

# Assemble the modelling frame: scaled numerics, one-hot categoricals, and the
# remaining columns passed through unchanged.
ecomm_final = pd.concat([ecomm1, dummy, ecomm[['customer_care_calls', 'prior_purchases', 'gender', 'arrival', 'customer_rating']]], axis=1)

# Separate the target from the predictors.
# 'customer_rating' is removed together with the target, so it is not used as
# a model input.
Y = ecomm_final.loc[:, 'arrival']  # output column
X = ecomm_final.drop(columns=['arrival', 'customer_rating'])  # inputs

# Model building: k-nearest-neighbours regression on the full dataset.
# Hyper-parameters are collected in one place; neighbours are weighted by
# inverse distance under the Euclidean metric.
knn_params = {
    'n_neighbors': 11,
    'weights': 'distance',
    'metric': 'euclidean',
}
# fit() returns the estimator itself, so construction and fitting can be chained.
KNN_model = KNeighborsRegressor(**knn_params).fit(X, Y)

# Save the model
# Serialise the fitted estimator to disk. The file is opened in a `with`
# block so the handle is flushed and closed deterministically, rather than
# whenever the anonymous file object happens to be garbage-collected.
filename = 'finalized_knn_regression.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(KNN_model, model_file)


In [2]:
from sklearn.metrics import mean_absolute_error

# In-sample check: predictions are made on X, the same rows KNN_model was
# fitted on, so this measures training error rather than performance on
# unseen data.
y_pred = KNN_model.predict(X)  # Predicted values of 'arrival'
y_true = Y  # Actual values of 'arrival'

# Mean absolute difference between actual and predicted values.
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')


Mean Absolute Error: 5.081284837900156e-08


In [3]:
from sklearn.metrics import mean_squared_error

# Mean squared error for the y_true / y_pred pair computed in the previous cell.
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 5.572986799055898e-14


In [4]:
from sklearn.metrics import r2_score

# Coefficient of determination for the same y_true / y_pred pair.
r2 = r2_score(y_true=y_true, y_pred=y_pred)
print(f'R^2 Score: {r2}')


R^2 Score: 0.9999999999997684


In [5]:
# Adjusted R^2 corrects R^2 for the number of predictors p given n samples:
#     1 - (1 - R^2) * (n - 1) / (n - p - 1)
n_samples = len(y_true)
n_predictors = X.shape[1]  # one predictor per column of X
adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_predictors - 1)
print(f'Adjusted R^2 Score: {adjusted_r2}')


Adjusted R^2 Score: 0.9999999999997681


In [6]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit an unfitted copy of the estimator (same hyper-parameters) on the training
# split. Calling KNN_model.fit() here would overwrite the model that was fitted
# on all rows and pickled earlier, leaving the in-memory object out of sync
# with the saved file and changing the result of re-running earlier cells.
holdout_model = clone(KNN_model)
holdout_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_test = holdout_model.predict(X_test)

# Calculate metrics on the test set
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
# Adjusted R^2: 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows and p predictors.
adjusted_r2_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)

print('Test Set Metrics:')
print(f'Mean Absolute Error: {mae_test}')
print(f'Mean Squared Error: {mse_test}')
print(f'R^2 Score: {r2_test}')
print(f'Adjusted R^2 Score: {adjusted_r2_test}')


Test Set Metrics:
Mean Absolute Error: 0.3779565561679475
Mean Squared Error: 0.20759142030522146
R^2 Score: 0.13975686613388827
Adjusted R^2 Score: 0.13305469689661786
