In [165]:
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore
import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split # type: ignore

In [166]:
data = pd.read_csv("cleaned_data.csv")
X =data.drop('Discount Price', axis=1)
y = data['Discount Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape

(132, 187)

In [167]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error

linear_reggression_model = LinearRegression()
linear_reggression_model.fit(X_train, y_train)
yhat = linear_reggression_model.predict(X_test)
print("Linear Regression")
print("Mean Absolute Percentage Error: ", mean_absolute_percentage_error(y_test, yhat))
print("Mean Absolute Error: ", mean_absolute_error(y_test, yhat))
first_element_prediction = linear_reggression_model.predict([X_test.iloc[0]])
print("Prediction for the first element in X_train: ", first_element_prediction)
print("Actual value for the first element in X_train: ", y_test.iloc[0])




Linear Regression
Mean Absolute Percentage Error:  684.467552085746
Mean Absolute Error:  2000430.5613598505
Prediction for the first element in X_train:  [188708.58694787]
Actual value for the first element in X_train:  4700.0000000172




In [168]:
from sklearn.tree import DecisionTreeRegressor
decision_tree_model = DecisionTreeRegressor()
decision_tree_model.fit(X_train, y_train)
yhat = decision_tree_model.predict(X_test)
print("Decision Tree")
print("Mean Absolute Percentage Error: ", mean_absolute_percentage_error(y_test, yhat))
print("Mean Absolute Error: ", mean_absolute_error(y_test, yhat))
first_element_prediction = decision_tree_model.predict([X_test.iloc[0]])
print("Prediction for the first element in X_train: ", first_element_prediction)
print("Actual value for the first element in X_train: ", y_test.iloc[0])


Decision Tree
Mean Absolute Percentage Error:  0.1137819797803721
Mean Absolute Error:  436.88235298173254
Prediction for the first element in X_train:  [4600.0000001]
Actual value for the first element in X_train:  4700.0000000172




In [169]:
from sklearn.ensemble import RandomForestRegressor
random_forest_model = RandomForestRegressor()
random_forest_model.fit(X_train, y_train)
yhat = random_forest_model.predict(X_test)

print("Random Forest")

print("Mean Absolute Percentage Error: ", mean_absolute_percentage_error(y_test, yhat))
print("Mean Absolute Error: ", mean_absolute_error(y_test, yhat))
first_element_prediction = random_forest_model.predict([X_test.iloc[0]])
print("Prediction for the first element in X_train: ", first_element_prediction)
print("Actual value for the first element in X_train: ", y_test.iloc[0])


Random Forest
Mean Absolute Percentage Error:  0.16720855379213906
Mean Absolute Error:  428.36558826223035




Prediction for the first element in X_train:  [4363.99]
Actual value for the first element in X_train:  4700.0000000172


In [170]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
yhat = xgb_model.predict(X_test)

print("XGBoost")

print("Mean Absolute Percentage Error: ", mean_absolute_percentage_error(y_test, yhat))
print("Mean Absolute Error: ", mean_absolute_error(y_test, yhat))
first_element_prediction = xgb_model.predict([X_test.iloc[0]])
print("Prediction for the first element in X_train: ", first_element_prediction)
print("Actual value for the first element in X_train: ", y_test.iloc[0])

XGBoost
Mean Absolute Percentage Error:  0.12420990232561856
Mean Absolute Error:  363.31589465830206
Prediction for the first element in X_train:  [4420.5996]
Actual value for the first element in X_train:  4700.0000000172


In [171]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(linear_reggression_model, X, y, cv=5, scoring='r2')
print("Linear Regression")
print("Cross-validation scores: ", scores, " Mean: ", scores.mean(), " Std: ", scores.std())



Linear Regression
Cross-validation scores:  [-3.54158220e+02  7.68098529e-01 -2.52652618e+06 -7.45459274e+05
 -1.64219573e+06]  Mean:  -982906.9145276544  Std:  980338.8174321357


In [172]:
scores = cross_val_score(decision_tree_model, X, y, cv=5, scoring='r2')
print("Decision Tree")
print("Cross-validation scores: ", scores, " Mean: ", scores.mean(), " Std: ", scores.std())



Decision Tree
Cross-validation scores:  [0.79627266 0.89824348 0.81682884 0.6471201  0.47768503]  Mean:  0.7272300221554838  Std:  0.14881582071284546


In [173]:
scores = cross_val_score(random_forest_model, X, y, cv=5 , scoring='r2')
print("Random Forest")
print("Cross-validation scores: " ,  scores , " Mean: ", scores.mean() , " Std: ", scores.std()) 


Random Forest
Cross-validation scores:  [0.88490785 0.87453254 0.85856949 0.6940737  0.80900009]  Mean:  0.8242167332766348  Std:  0.07009131243938725


In [174]:
scores = cross_val_score(xgb_model, X, y, cv=5 , scoring='r2')
print("XGBoost")
print("Cross-validation scores: " ,  scores , " Mean: ", scores.mean() , " Std: ", scores.std()) 


XGBoost
Cross-validation scores:  [0.90190785 0.89437504 0.91555352 0.79630468 0.57973451]  Mean:  0.817575121258811  Std:  0.1261987263368479


### tuning the  XGBoost

In [175]:
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
# GridSearchCV it's way to test all the possible combinations of the hyperparameters , it's very slow , but it's guaranteed to find the best hyperparameters
# RandomizedSearchCV it's way to test random combinations of the hyperparameters , it's faster than GridSearchCV , but it's not guaranteed to find the best hyperparameters  
param_grid = {
          'n_estimators': [100, 300, 500],
          'max_depth': [3 ,4 , 5,],
          'learning_rate': [0.01, 0.1, 0.2] ,
          
          }
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_xgb_model = xgb.XGBRegressor(**best_params)
best_xgb_model.fit(X_train, y_train)

print("XGBoost Grid Search")

print("Mean Absolute Percentage Error: ", mean_absolute_percentage_error(y_test, yhat))
print("Mean Absolute Error: ", mean_absolute_error(y_test, yhat))
first_element_prediction = best_xgb_model.predict([X_test.iloc[0]])
print("Prediction for the first element in X_train: ", first_element_prediction)
print("Actual value for the first element in X_train: ", y_test.iloc[0])

scores = cross_val_score(best_xgb_model, X, y, cv=5 , scoring='r2')
print("XGBoost")
print("Cross-validation scores: " ,  scores , " Mean: ", scores.mean() , " Std: ", scores.std()) 


XGBoost Grid Search
Mean Absolute Percentage Error:  0.12420990232561856
Mean Absolute Error:  363.31589465830206
Prediction for the first element in X_train:  [4292.8687]
Actual value for the first element in X_train:  4700.0000000172
XGBoost
Cross-validation scores:  [0.9772534  0.91938928 0.94803202 0.81162309 0.65720379]  Mean:  0.8627003156856906  Std:  0.11700876483126152
