# Model
## importing libraries

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks")

## import dataset

In [10]:
df = pd.read_csv('data4.csv')

In [11]:
X = df.drop(['Volume Sold (Liters)'], axis=1)
y = df['Volume Sold (Liters)']

spliting the data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
linear_r_model = LinearRegression()
knn_r_model = KNeighborsRegressor()
rf_r_model = RandomForestRegressor()
dt_r_model = DecisionTreeRegressor()

In [19]:
models = list()
models.append(linear_r_model)
models.append(knn_r_model)
models.append(rf_r_model)
models.append(dt_r_model)

In [14]:
def train_regression(X, y, model):
    """
    This function will apply train function to X and y using the specified model.
    Then it will print the mean_squared_error, mean_absolute_error, and root_mean_squared_error on the test subset
    :return:
    """

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mae = mean_absolute_error(y_true=y_test,y_pred=y_pred)
    rmse = math.sqrt(mse)
    print(f"For the {model} model:")
    print(f"the mean squared error is {mse}")
    print(f"the mean absolute error is {mae}")
    print(f"the root mean squared error is {rmse}")
    return "--------------------------------------------------"

In [15]:
linear_r_model.fit(X_train, y_train)
y_pred = linear_r_model.predict(X_test)

In [16]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_pred, y_test)
print(mse)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_pred, y_test)
print(mae)

409.7738368832361
8.258305905093751


In [22]:
for model in models:
  train_regression(X, y, model)

For the LinearRegression() model:
the mean squared error is 409.7738368832361
the mean absolute error is 8.258305905093751
the root mean squared error is 20.242871260847263
For the KNeighborsRegressor() model:
the mean squared error is 739.7845956056833
the mean absolute error is 6.480763122860944
the root mean squared error is 27.198981517801055
For the RandomForestRegressor() model:
the mean squared error is 9.866996143020483
the mean absolute error is 0.07120571767129553
the root mean squared error is 3.141177508995708
For the DecisionTreeRegressor() model:
the mean squared error is 19.459015520419726
the mean absolute error is 0.06583916968836426
the root mean squared error is 4.411237413744552


In [23]:
data = pd.DataFrame(columns= ['ML model', 'MSE', 'MAE', 'RMSE'])

In [24]:
for model in models:
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
  mae = mean_absolute_error(y_true=y_test,y_pred=y_pred)
  rmse = math.sqrt(mse)
  new_row = {'ML model' : f'{model}', 'MSE' : mse, 'MAE' : mae, 'RMSE' :rmse}
  data = data.append(new_row, ignore_index=True)

In [25]:
data

Unnamed: 0,ML model,MSE,MAE,RMSE
0,LinearRegression(),409.773837,8.258306,20.242871
1,KNeighborsRegressor(),739.784596,6.480763,27.198982
2,RandomForestRegressor(),10.780307,0.071379,3.283338
3,DecisionTreeRegressor(),19.550262,0.066496,4.421568


### Lime

In [26]:
# pip install lime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283857 sha256=5a2abba5b2cb1bbd96e42485ca8287929960e977156f41701d8a9f60ce0a2927
  Stored in directory: /root/.cache/pip/wheels/e6/a6/20/cc1e293fcdb67ede666fed293cb895395e7ecceb4467779546
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [27]:
import lime

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [31]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

DecisionTreeRegressor()

In [33]:
from lime import lime_tabular

In [38]:
explainer = lime_tabular.LimeTabularExplainer(X_train, feature_names = ['Store Number', 'Zip Code', 'Item Number', 'Pack', 'Bottle Volume', 'Sale (Dollars)', 'Profit Margin'], class_names = ['Volume Sold (Liters)'], verbose=True, mode='regression')

TypeError: ignored

In [None]:
exp = explainer.explain_instance(X_test[2], model.predict)

In [None]:
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
exp.save_to_file('lime.html')
