<a href="https://colab.research.google.com/github/Chrisolande/Machine-Learning-and-Data-Science-Projects/blob/main/laptop_prices_analysis(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**About Dataset**

This dataset provides a comprehensive collection of information on various laptops, enabling a detailed analysis of their specifications and pricing. It encompasses a wide range of laptops, encompassing diverse brands, models, and configurations, making it a valuable resource for researchers, data analysts, and machine learning enthusiasts interested in the laptop industry.

The data comes from the Spanish website PC Componentes. It was collected using Power Automate; more info at: https://github.com/juanmerino89/laptops-data-cleaning

Fields included:

* Laptop Name: The unique identifier or model name of the laptop.




* Brand: Laptop brand.
* Model: Laptop brand model.
* CPU (Central Processing Unit): The processor brand, model, and other relevant details.
* GPU (Graphics Processing Unit): The graphics card brand, model, and associated specifications.
* RAM (Random Access Memory): The amount of memory available for multitasking.
* Storage: The storage type (HDD, SSD) and capacity of the laptop.
* Price: The cost of the laptop in the respective currency.

By utilizing this dataset, researchers and analysts can explore patterns, trends, and relationships between laptop specifications and their pricing. It serves as an excellent resource for tasks such as price prediction, market analysis, and comparison of different laptop configurations. Whether you are interested in identifying the most cost-effective options or understanding the impact of specific hardware components on laptop prices, this dataset offers abundant possibilities for in-depth exploration.

In [None]:
import warnings

import lightgbm as lgbm
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xgboost as xgb
from google.colab import files
from scipy import stats
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

warnings.filterwarnings("ignore")
%matplotlib inline
sns.set_style("darkgrid")

In [None]:
# Upload a file through google.colab's files.upload() and read the first
# uploaded entry as a CSV into the working DataFrame.
uploaded = files.upload()
first_uploaded_name = next(iter(uploaded))

df = pd.read_csv(first_uploaded_name)

In [None]:
df.shape

**Observation**

There are 2160 rows and 12 columns in the dataset

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# Per-column counts of null and non-null entries, side by side.
null_counts = df.isnull().sum()
non_null_counts = df.notnull().sum()
missing_values = pd.DataFrame(
    {"Missing Values": null_counts, "Non Null Values": non_null_counts}
)

missing_values

In [None]:
# Share of all cells (rows x columns) that are null, as a percentage.
total_cells = df.shape[0] * df.shape[1]
total_missing = missing_values["Missing Values"].sum()
missing_value_percentage = round((total_missing / total_cells) * 100, 2)

print(
    "The missing values account for",
    missing_value_percentage,
    "% of the entire dataset",
)

**Observation**

There are 1417 missing values accounting for 5.47 %

In [None]:
msno.matrix(df)

**Observation**

The missing values are more concentrated in the GPU column than in any other column

In [None]:
print("There are", df.duplicated().sum(), "duplicated values in the dataset")

In [None]:
# Impute missing storage types with the most frequent category.
# Series.mode() returns a Series (several values when there are ties), and
# fillna() with a Series aligns on index labels, so passing it directly only
# fills rows whose label happens to match the mode Series' index. Take the
# first mode as a scalar instead.
most_common_storage_type = df["Storage type"].mode().iloc[0]
df["Storage type"] = df["Storage type"].fillna(most_common_storage_type)

# Missing GPU entries get an explicit placeholder category.
df["GPU"] = df["GPU"].fillna("Other")

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

# Exploratory Data Analysis

In [None]:
# NOTE(review): this runs after df.dropna(inplace=True), so no NaN should be
# left in this column and the fill looks like a no-op — confirm before removing.
df["Storage type"] = df["Storage type"].fillna("Other")

In [None]:
# Count of rows per storage type.
storage_type_ax = sns.countplot(x="Storage type", data=df, saturation=0.5, alpha=0.7)

storage_type_ax.set_title("Storage Type")

In [None]:
# Histogram of the Storage column with a KDE curve overlaid.
storage_ax = sns.histplot(x="Storage", data=df, kde=True)

storage_ax.set_title("Distribution of Storage")

In [None]:
df["Storage"].unique()

In [None]:
# Count of rows per Status value.
status_ax = sns.countplot(x="Status", data=df)
status_ax.set_title("Number of PCs by status")

plt.show()

In [None]:
# Frequency table of Status, reshaped into two columns for plotly.
status_frequency = df["Status"].value_counts()
status_counts = status_frequency.rename_axis("Status").reset_index(name="Count")

fig = px.pie(
    status_counts, values="Count", names="Status", title="Proportion of PC status"
)
fig.show()

In [None]:
# Number of rows per brand, most frequent first (value_counts order).
brand_frequency = df["Brand"].value_counts()
popular_brands = brand_frequency.rename_axis("Brand").reset_index(name="Count")

popular_brands

In [None]:
# Bar chart of the popular_brands table: count on the x axis, brand on the
# y axis, one colour per brand.
fig = px.bar(
    popular_brands, x="Count", y="Brand", color="Brand", title="Most Popular Brands"
)

fig.show()

In [None]:
# Count of rows per Touch value.
touch_ax = sns.countplot(x="Touch", data=df)
touch_ax.set_title("Number of PCs by Touch")

plt.show()

In [None]:
# Frequency table of the Touch column, plotted as a pie chart.
# NOTE: this rebinds status_counts (previously the Status frequency table) to
# hold Touch counts.
status_counts = (
    df["Touch"].value_counts().rename_axis("Touch").reset_index(name="Count")
)

fig = px.pie(
    status_counts, names="Touch", values="Count", title="Proportion of PC by Touch"
)

fig.show()

In [None]:
df.Touch.unique()

In [None]:
df.columns

In [None]:
df["Status"].unique()

In [None]:
# Mean final price for each Status value, then a bar per status.
mean_price_by_status = df.groupby("Status")["Final Price"].mean()
status_price = mean_price_by_status.reset_index()

px.bar(
    status_price,
    y="Final Price",
    x="Status",
    title="Most expensive PCs by Status",
    color="Status",
)

In [None]:
# Mean final price for each brand.
mean_price_by_brand = df.groupby("Brand")["Final Price"].mean()
brand_price = mean_price_by_brand.reset_index()

brand_price

In [None]:
# Stem ("lollipop") plot of the mean price per brand.
stem_x, stem_y = brand_price["Brand"], brand_price["Final Price"]
plt.stem(stem_x, stem_y, basefmt="")
plt.xticks(rotation=90)

brand_ax = plt.gca()
brand_ax.set_xlabel("Brand")
brand_ax.set_ylabel("Price")
brand_ax.set_title("Lollipop Chart for most expensive Brands")

In [None]:
df["Model"].unique()

In [None]:
# Ten models with the highest mean final price, re-indexed 0..9.
model_price = (
    df.groupby("Model")["Final Price"]
    .mean()
    .reset_index()
    .nlargest(10, "Final Price")
    .reset_index(drop=True)
)

model_price

In [None]:
# Stem ("lollipop") plot of the model_price table: one stem per model, height
# equal to its mean final price.
plt.stem(model_price["Model"], model_price["Final Price"], basefmt="")

# Rotate the model names so the tick labels do not overlap.
plt.xticks(rotation=90)

plt.xlabel("Model")

plt.ylabel("Price")

plt.title("Lollipop Chart for most expensive Models")

In [None]:
# Ten CPU categories with the highest mean final price, re-indexed 0..9.
cpu_price = (
    df.groupby("CPU")["Final Price"]
    .mean()
    .reset_index()
    .nlargest(10, "Final Price")
    .reset_index(drop=True)
)

cpu_price

In [None]:
# Bar chart of the cpu_price table: mean final price on the x axis, CPU on the
# y axis, one colour per CPU.
px.bar(
    cpu_price, x="Final Price", y="CPU", title="Most Expensive CPU Types", color="CPU"
)

In [None]:
df.columns

In [None]:
df["Touch"].unique()

In [None]:
df.isna().sum()

In [None]:
# Mean final price for every GPU category (all categories, not a top-N).
gpu_price = df.groupby("GPU")["Final Price"].mean().reset_index()

# Large canvas because every GPU category gets its own stem.
plt.figure(figsize=(15, 12))

plt.stem(gpu_price["GPU"], gpu_price["Final Price"], basefmt="")

# Rotate the GPU names so the tick labels do not overlap.
plt.xticks(rotation=90)

plt.xlabel("GPU Type")

plt.ylabel("Price")

plt.title("GPU Type Price")

plt.show()

In [None]:
# Kernel density estimate of the Screen column.
screen_ax = sns.kdeplot(x="Screen", data=df)

screen_ax.set_title("Distribution of Screen")

In [None]:
sns.scatterplot(data=df, x="Final Price", y="Screen")

In [None]:
# Bar per Touch value showing seaborn's default aggregate of Final Price.
touch_price_ax = sns.barplot(x="Touch", y="Final Price", data=df)
touch_price_ax.set_title("Price depending on Touch")

plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
# on a fixed [-1, 1] colour scale.
numeric_data = df.select_dtypes(include="number")
numeric_corr = numeric_data.corr()

sns.heatmap(data=numeric_corr, annot=True, vmin=-1, vmax=1, cmap="Purples")

In [None]:
sns.pairplot(df, diag_kind="kde")

**Observation**

1. Most Laptops have SSDs as their primary storage

2. There are multiple Gaussians in storage indicating the presence of Clusters

3. Most Laptops are refurbished accounting for a total percentage of 69.5

4. Asus, Lenovo, HP, MSI and Acer are the most popular brands

5. Most Laptops are non - touch and they account for 89.5% of the total Laptops

6. Refurbished Laptops cost more on average compared to newer ones ($1333.44)

7. Razer, Millenium, Samsung and Microsoft are four of the most expensive brands

8. Titan, WS63, Enduro, Blade, Beast are the most expensive models

9. AMD Radeon 9, Intel Evo Core i9, Intel Core i9 and Apple M2 Pro are the most expensive CPUs on average

10. There are multiple Gaussians in the Screen distribution, an indication of clusters in that feature

11. Laptops with Touch cost more than those without touch


Checking whether the data are normally distributed using hypothesis testing (Shapiro-Wilk test and Q-Q plots)

In [None]:
# Normality check for every numeric column: print the Shapiro-Wilk result and
# draw a Q-Q plot per feature on a two-column subplot grid.
features = numeric_data.columns
num_features = len(features)

num_cols = 2
# Ceiling division so an odd number of features still gets a full last row.
num_rows = (num_features + num_cols - 1) // num_cols

for i, feature in enumerate(features):
    plt.subplot(num_rows, num_cols, i + 1)

    data = df[feature]

    # Shapiro-Wilk test; the first return value is the W statistic
    # (it is not a t-statistic, so it is labelled accordingly below).
    stat, p = stats.shapiro(data)

    print(f"Feature: {feature}")
    print("W statistic:", stat)
    print("P value:", p)

    # 5% significance level: a small p-value rejects the normality hypothesis.
    if p > 0.05:
        print("The data is normally distributed")
    else:
        print("The data is not normally distributed")

    print("--------------------------------------------")

    # Q-Q plot against the normal distribution, drawn into the current subplot.
    stats.probplot(data, plot=plt)
    plt.title(f"Q-Q plot for {feature}")

plt.tight_layout()

plt.show()

Log Transformation

In [None]:
# Apply log(1 + x) to every numeric column listed in `features` in one
# vectorised assignment.
# NOTE: not idempotent — re-running this cell transforms the data again.
df[features] = np.log1p(df[features])

In [None]:
sns.pairplot(df, diag_kind="kde")

**Diagonal Analysis**

1. RAM has 5 Gaussians which may indicate presence of 5 clusters

2. Storage and Screen also have multiple Gaussians indicating presence of multiple clusters

3. The Final Price shows an almost perfectly normal distribution

**Relationship between the Dependent and Independent Variables**

1. Final Price v RAM: The higher the RAM the higher the final price

2. Final Price v Storage: Laptops with a higher storage cost more



# Data Modelling

In [None]:
# Drop the Laptop column in place; the author considers it unhelpful for modelling.
# NOTE: not re-runnable — a second run raises because the column is already gone.

df.drop(columns="Laptop", inplace=True)

In [None]:
object_columns = df.select_dtypes(include="object").columns

object_columns

In [None]:
# Replace every object-dtype column with integer codes from LabelEncoder.
# The same encoder instance is refit for each column, so its fitted classes
# only reflect the last column processed.
# NOTE(review): the original comment calls these features ordinal; confirm that
# an integer code is an appropriate representation for each column.

encoder = LabelEncoder()

for feature in object_columns:
    column_values = df[feature]
    df[feature] = encoder.fit(column_values).transform(column_values)

In [None]:
df.head()

In [None]:
# Split off the target column, then hold out 20% of the rows as a test set
# (shuffled, fixed seed).

y = df["Final Price"]
X = df.drop(columns="Final Price")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0
)

for split_label, split_part in (
    ("X Train shape: ", X_train),
    ("X Test shape: ", X_test),
    ("y Train shape: ", y_train),
    ("y Test shape: ", y_test),
):
    print(split_label, split_part.shape)

## **Linear Regression**

In [None]:
# Baseline: ordinary least squares on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

ypred_train = lr.predict(X_train)
ypred_test = lr.predict(X_test)

# Metrics.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases, and other
# cells in this notebook already use the np.sqrt form.
linear_rmse_test = np.sqrt(mean_squared_error(y_test, ypred_test))
linear_r2_score_test = r2_score(y_test, ypred_test)
linear_r2_score_train = r2_score(y_train, ypred_train)

# K fold cross-validation (R^2) over X and y.
k = 5
kfold_linear = KFold(n_splits=k, random_state=0, shuffle=True)
cv_linear = cross_val_score(lr, X, y, cv=kfold_linear, scoring="r2")

# The RMSE printed here is measured on the test split, so label it as such.
print("Linear Regression RMSE(Test):", linear_rmse_test)
print("Linear Regression R2 score (Train):", linear_r2_score_train)
print("Linear Regression R2 score (Test):", linear_r2_score_test)
print("Linear Regression CV Score mean(R^2):", cv_linear.mean())

## **Lasso Regression**

In [None]:
# L1-regularised linear model (alpha=0.1); fit() returns the estimator itself.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

lasso_ypred_train, lasso_ypred_test = lasso.predict(X_train), lasso.predict(X_test)

# R^2 on both splits plus the fitted coefficients.
lasso_r2_score_test = r2_score(y_test, lasso_ypred_test)
lasso_r2_score_train = r2_score(y_train, lasso_ypred_train)
lasso_coeffs = lasso.coef_

for lasso_label, lasso_value in (
    ("Lasso R^2 score(Test):", lasso_r2_score_test),
    ("Lasso R^2 score(Train):", lasso_r2_score_train),
    ("Lasso Coefficients:", lasso_coeffs),
):
    print(lasso_label, lasso_value)

## **Ridge Regression**

In [None]:
# L2-regularised linear model (alpha=0.1); fit() returns the estimator itself.
ridge = Ridge(alpha=0.1).fit(X_train, y_train)

ridge_ypred_train, ridge_ypred_test = ridge.predict(X_train), ridge.predict(X_test)

# R^2 on both splits plus the fitted coefficients.
ridge_r2_score_test = r2_score(y_test, ridge_ypred_test)
ridge_r2_score_train = r2_score(y_train, ridge_ypred_train)
ridge_coeffs = ridge.coef_

for ridge_label, ridge_value in (
    ("Ridge R^2 score(Test):", ridge_r2_score_test),
    ("Ridge R^2 score(Train):", ridge_r2_score_train),
    ("Ridge Coefficients:", ridge_coeffs),
):
    print(ridge_label, ridge_value)

## **ElasticNet Regression**

In [None]:
# Elastic net: equal mix of L1 and L2 penalties (l1_ratio=0.5) at alpha=0.1.
ent = ElasticNet(alpha=0.1, l1_ratio=0.5)
ent.fit(X_train, y_train)

ent_ypred_train = ent.predict(X_train)
ent_ypred_test = ent.predict(X_test)

# Metrics.
ent_r2_score_train = r2_score(y_train, ent_ypred_train)
ent_r2_score_test = r2_score(y_test, ent_ypred_test)
# sqrt(MSE) instead of the deprecated `squared=False` keyword, matching the
# np.sqrt form used by other cells in this notebook.
ent_rmse = np.sqrt(mean_squared_error(y_test, ent_ypred_test))

# 5-fold shuffled cross-validation (R^2) over X and y.
k = 5
ent_kfold = KFold(n_splits=k, random_state=0, shuffle=True)
ent_cross_val = cross_val_score(
    estimator=ent, scoring="r2", X=X, y=y, cv=ent_kfold, n_jobs=-1
)

# Printing the metrics
print("ElasticNet Regression R^2 (train): ", ent_r2_score_train)
print("ElasticNet Regression R^2 (test): ", ent_r2_score_test)
print("ElasticNet Regression RMSE: ", ent_rmse)
print("ElasticNet Regression Cross Validation Score:", ent_cross_val.mean())

## **Extreme Gradient Boosting (XGBoost) Model**

In [None]:
# Untuned XGBoost regressor with 100 trees, fitted on the training split.
xgb_model = xgb.XGBRegressor(n_estimators=100, objective="reg:squarederror")
xgb_model.fit(X_train, y_train)

# Predictions for both splits (re-computed by the tuned models further down).
xgb_model_ypred_train, xgb_model_ypred_test = (
    xgb_model.predict(X_train),
    xgb_model.predict(X_test),
)

### **Hyperparameter Tuning for the Extreme Gradient Boosting Model**

In [None]:
# Exhaustive 5-fold grid search over XGBoost regularisation and tree-size
# settings. alpha, lambda and gamma share one log-spaced list of candidates
# (5 * 5 * 5 * 3 * 3 = 1125 combinations).
log_spaced_strengths = [0.001, 0.01, 0.1, 1, 10]

param_grid = {
    "alpha": list(log_spaced_strengths),
    "lambda": list(log_spaced_strengths),
    "gamma": list(log_spaced_strengths),
    "n_estimators": [50, 100, 150],
    "max_depth": [2, 4, 6],
}

xgb_model = xgb.XGBRegressor(objective="reg:squarederror")

grid_search_xgb = GridSearchCV(xgb_model, param_grid, cv=5, n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

print(grid_search_xgb.best_params_)

In [None]:
# Randomised 5-fold search sampling from the same param_grid (default number
# of iterations). No random_state is set, so the sampled candidates vary
# between runs.
rand_search = RandomizedSearchCV(xgb_model, param_grid, cv=5, n_jobs=-1)
rand_search.fit(X_train, y_train)

print(rand_search.best_params_)

### Implementing with the Grid Search CV

In [None]:
# XGBoost refitted with hard-coded hyperparameters (this cell sits under the
# "Implementing with the Grid Search CV" heading).
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    alpha=0.01,
    gamma=0.01,
    reg_lambda=0.1,
    max_depth=6,
    n_estimators=150,
)
xgb_model.fit(X_train, y_train)

xgb_model_ypred_train = xgb_model.predict(X_train)
xgb_model_ypred_test = xgb_model.predict(X_test)

xgb_grid_r2_train = r2_score(y_train, xgb_model_ypred_train)
xgb_grid_r2_test = r2_score(y_test, xgb_model_ypred_test)
# sqrt(MSE) instead of the deprecated `squared=False` keyword, matching the
# np.sqrt form used by other cells in this notebook.
xgb_grid_rmse_test = np.sqrt(mean_squared_error(y_test, xgb_model_ypred_test))

# Cross Validation (5-fold, shuffled, R^2) over X and y.
k = 5
xgb_kfold = KFold(n_splits=k, shuffle=True, random_state=0)
xgb_grid_cv = cross_val_score(estimator=xgb_model, cv=xgb_kfold, scoring="r2", X=X, y=y)

print("XGBoost Regression (Train) - R^2:", xgb_grid_r2_train)
print("XGBoost Regression (Test) - R^2:", xgb_grid_r2_test)
print("XGBoost Regression (Test) - RMSE:", xgb_grid_rmse_test)
print("XGBoost Regression CV Score:", xgb_grid_cv.mean())

Implementing with the Randomized Search CV

In [None]:
# XGBoost refitted with hard-coded hyperparameters (this cell sits under the
# "Implementing with the Randomized Search CV" heading).
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    alpha=0.01,
    gamma=0.001,
    reg_lambda=10,
    max_depth=4,
    n_estimators=50,
)
xgb_model.fit(X_train, y_train)

xgb_model_ypred_train = xgb_model.predict(X_train)
xgb_model_ypred_test = xgb_model.predict(X_test)

xgb_rand_r2_train = r2_score(y_train, xgb_model_ypred_train)
xgb_rand_r2_test = r2_score(y_test, xgb_model_ypred_test)
# sqrt(MSE) instead of the deprecated `squared=False` keyword, matching the
# np.sqrt form used by other cells in this notebook.
xgb_rand_rmse_test = np.sqrt(mean_squared_error(y_test, xgb_model_ypred_test))

# Cross Validation (5-fold, shuffled, R^2) over X and y.
k = 5
xgb_kfold = KFold(n_splits=k, shuffle=True, random_state=0)
xgb_rand_cv = cross_val_score(estimator=xgb_model, cv=xgb_kfold, scoring="r2", X=X, y=y)

print("XGBoost Regression (Train) - R^2:", xgb_rand_r2_train)
print("XGBoost Regression (Test) - R^2:", xgb_rand_r2_test)
print("XGBoost Regression (Test) - RMSE:", xgb_rand_rmse_test)
print("XGBoost Regression CV Score:", xgb_rand_cv.mean())

**Implementation on the Training Validation and Test Data**

In [None]:
# Three-way split on a copy of the data: 20% test, then 20% of the remainder
# as a validation set.
df_copy = df.copy()

Xcopy = df_copy.drop(columns="Final Price")
# Take the target from the same copy as the features.
ycopy = df_copy["Final Price"]

# Splitting the data into temporary train and test data
Xcopy_temp, Xcopy_test, ycopy_temp, ycopy_test = train_test_split(
    Xcopy, ycopy, test_size=0.2, random_state=0, shuffle=True
)

# Splitting the temporary data into training and validation sets.
# random_state is fixed here as well; without it the validation rows (and all
# validation metrics) change on every run.
Xcopy_train, Xcopy_val, ycopy_train, ycopy_val = train_test_split(
    Xcopy_temp, ycopy_temp, test_size=0.2, random_state=0, shuffle=True
)

print("Xcopy_train shape:", Xcopy_train.shape)
print("ycopy_train shape:", ycopy_train.shape)
print("Xcopy_val shape:", Xcopy_val.shape)
print("ycopy_val shape:", ycopy_val.shape)
print("Xcopy_test shape:", Xcopy_test.shape)
print("ycopy_test shape:", ycopy_test.shape)

In [None]:
# Fit XGBoost on the training set only, then evaluate it on the training,
# validation and test sets.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    alpha=0.1,
    gamma=0.01,
    reg_lambda=1,
    max_depth=4,
    n_estimators=150,
)
xgb_model.fit(Xcopy_train, ycopy_train)

# Prediction on the training and validation set
xgb_ypred_train = xgb_model.predict(Xcopy_train)
xgb_ypred_val = xgb_model.predict(Xcopy_val)

# Metrics for the training and validation sets.
# RMSE is sqrt(MSE); the `squared=False` keyword is deprecated in recent
# scikit-learn releases.
xgb_rmse_val = np.sqrt(mean_squared_error(ycopy_val, xgb_ypred_val))
xgb_r2_train = r2_score(ycopy_train, xgb_ypred_train)
xgb_r2_val = r2_score(ycopy_val, xgb_ypred_val)

print("XGBoost Regression (Train) - R^2:", xgb_r2_train)
print("XGBoost Regression (Validation) - R^2:", xgb_r2_val)
print("XGBoost Regression (Validation) - RMSE:", xgb_rmse_val)

# Predict on the held-out test set
xgb_ypred_test = xgb_model.predict(Xcopy_test)

# Metrics for the test set
xgb_rmse_test = np.sqrt(mean_squared_error(ycopy_test, xgb_ypred_test))
xgb_r2_test = r2_score(ycopy_test, xgb_ypred_test)

print("XGBoost Regression (Test) - R^2:", xgb_r2_test)
print("XGBoost Regression (Test) - RMSE:", xgb_rmse_test)

**Model Evaluation using OLS and VIF**

In [None]:
# OLS of Final Price on all other columns plus an intercept.
# The design matrix is built explicitly from df instead of the formula "y~X":
# that formula looked up the notebook globals X and y (not df columns), so the
# summary listed anonymous X[0], X[1], ... terms and silently changed meaning
# whenever X or y were rebound by another cell.
ols_features = sm.add_constant(df.drop(columns="Final Price"))
model1 = sm.OLS(df["Final Price"], ols_features).fit()

model1.summary()

In [None]:
# Variance inflation factor for each predictor: regress it on all the other
# predictors (plus a constant) and compute 1 / (1 - R^2).
independent_vars = df.drop(columns="Final Price").columns

# Loop-local names are used on purpose. Rebinding the notebook-wide X and y
# here would make every later cross_val_score(X=X, y=y) call run on the last
# predictor instead of on Final Price.
vif_rows = []
for var in independent_vars:
    vif_target = df[var]
    vif_predictors = sm.add_constant(df.drop(columns=[var, "Final Price"]))

    rsquared = sm.OLS(vif_target, vif_predictors).fit().rsquared

    vif_rows.append({"Variable": var, "VIF": 1 / (1 - rsquared)})

# Build the table in one go; DataFrame.append was removed in pandas 2.0.
vif_data = pd.DataFrame(vif_rows, columns=["Variable", "VIF"])

vif_data

**Gradient and Hybrid Models**

In [None]:
# Fit three regressors with default settings and report test RMSE, train/test
# R^2 and the mean 5-fold CV R^2 for each.
models = [
    ("Gradient Boosting Regressor", GradientBoostingRegressor()),
    ("Support Vector Regressor", SVR()),
    ("Ada Boost Regressor", AdaBoostRegressor()),
]
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=0)

for name, model in models:
    model.fit(X_train, y_train)
    ypred_train, ypred_test = model.predict(X_train), model.predict(X_test)

    rmse_test = np.sqrt(mean_squared_error(y_test, ypred_test))
    r2_test = r2_score(y_test, ypred_test)
    r2_train = r2_score(y_train, ypred_train)
    cv = cross_val_score(estimator=model, cv=kfold, X=X, y=y, scoring="r2")

    print(f"Model: {name}")
    for metric_label, metric_value in (
        (f"{name} RMSE (Test):", rmse_test),
        (f"{name} R^2 (Train):", r2_train),
        (f"{name} R^2 (Test):", r2_test),
        (f"{name} Cross Validation Mean:", cv.mean()),
    ):
        print(metric_label, metric_value)
    print("------------------------------------------------------------------------")

In [None]:
# Search space for the GradientBoostingRegressor grid search in the next cell.
# NOTE: this rebinds param_grid, which previously held the XGBoost search space.
# 6 * 6 * 3 * 3 * 3 = 972 combinations, each fitted once per CV fold.
param_grid = {
    "n_estimators": [100, 200, 300, 400, 500, 600],
    "learning_rate": [0.001, 0.01, 0.1, 1, 10, 100],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 3],
}

In [None]:
# Exhaustive 5-fold grid search for gradient boosting, scored by negative MSE.
gbr = GradientBoostingRegressor(random_state=0)

gbr_grid = GridSearchCV(
    gbr,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
gbr_grid.fit(X_train, y_train)

print(gbr_grid.best_params_)

In [None]:
# Gradient boosting refitted with hard-coded hyperparameters.
# random_state=0 matches the estimator used in the grid search cell and makes
# the reported metrics reproducible between runs.
gbr_best = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=4,
    min_samples_leaf=3,
    min_samples_split=2,
    n_estimators=500,
    random_state=0,
)
gbr_best.fit(X_train, y_train)

gbr_ypred_train = gbr_best.predict(X_train)
gbr_ypred_test = gbr_best.predict(X_test)

gbr_rmse = np.sqrt(mean_squared_error(y_test, gbr_ypred_test))
gbr_r2_score_train = r2_score(y_train, gbr_ypred_train)
gbr_r2_score_test = r2_score(y_test, gbr_ypred_test)

# 5-fold shuffled CV (R^2) restricted to the training split.
k = 5
gbr_kfold = KFold(n_splits=k, shuffle=True, random_state=0)
gbr_cv = cross_val_score(
    estimator=gbr_best, scoring="r2", X=X_train, y=y_train, cv=gbr_kfold, n_jobs=-1
)

print("Gradient Boosting Regressor RMSE:", gbr_rmse)
print("Gradient Boosting Regressor R^2 score (Train):", gbr_r2_score_train)
print("Gradient Boosting Regressor R^2 score (Test):", gbr_r2_score_test)
print("Gradient Boosting Regressor CV score (Train):", gbr_cv.mean())

In [None]:
# Compare SVR kernels: for each one, fit on the training split and print test
# RMSE, train/test R^2 and the mean 5-fold CV R^2 on the training split.
kernels = ["linear", "poly", "rbf", "sigmoid"]

for kernel in kernels:
    print(f"working on {kernel} kernel")

    svr_model = SVR(kernel=kernel)
    svr_model.fit(X_train, y_train)

    svr_ypred_train, svr_ypred_test = (
        svr_model.predict(X_train),
        svr_model.predict(X_test),
    )

    svr_model_rmse = np.sqrt(mean_squared_error(y_test, svr_ypred_test))
    svr_model_r2_score_train = r2_score(y_train, svr_ypred_train)
    svr_model_r2_score_test = r2_score(y_test, svr_ypred_test)

    k = 5
    svr_model_kfold = KFold(n_splits=k, shuffle=True, random_state=0)
    svr_model_cv = cross_val_score(
        svr_model,
        X=X_train,
        y=y_train,
        scoring="r2",
        cv=svr_model_kfold,
        n_jobs=-1,
    )

    for svr_label, svr_value in (
        ("Support Vector Regressor RMSE:", svr_model_rmse),
        ("Support Vector Regressor R^2 score (Train):", svr_model_r2_score_train),
        ("Support Vector Regressor R^2 score (Test):", svr_model_r2_score_test),
        ("Support Vector Regressor CV score (Train):", svr_model_cv.mean()),
    ):
        print(svr_label, svr_value)
    print("--------------------------------------------------------------------")

In [None]:
# Single decision tree with default depth, fixed seed.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)

ypred_train = dt.predict(X_train)
ypred_test = dt.predict(X_test)

# sqrt(MSE) instead of the deprecated `squared=False` keyword, matching the
# np.sqrt form used by other cells in this notebook.
dt_rmse = np.sqrt(mean_squared_error(y_test, ypred_test))
dt_r2_test = r2_score(y_test, ypred_test)
dt_r2_train = r2_score(y_train, ypred_train)

# 5-fold shuffled CV (R^2) over X and y.
k = 5
dt_kfold = KFold(n_splits=k, random_state=0, shuffle=True)
dt_cv = cross_val_score(estimator=dt, X=X, y=y, cv=dt_kfold, scoring="r2", n_jobs=-1)

print("Decision Tree Regressor RMSE (Test):", dt_rmse)
print("Decision Tree Regressor R^2 score(Test):", dt_r2_test)
print("Decision Tree Regressor R^2 score(Train):", dt_r2_train)
print("Decision Tree Regressor CV mean score(R^2):", dt_cv.mean())

In [None]:
# Random forest with 100 trees, fixed seed.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

ypred_train = rf.predict(X_train)
ypred_test = rf.predict(X_test)

# sqrt(MSE) instead of the deprecated `squared=False` keyword, matching the
# np.sqrt form used by other cells in this notebook.
rf_rmse = np.sqrt(mean_squared_error(y_test, ypred_test))
rf_r2_test = r2_score(y_test, ypred_test)
rf_r2_train = r2_score(y_train, ypred_train)

# 5-fold shuffled CV (R^2) over X and y.
k = 5
rf_kfold = KFold(n_splits=k, random_state=0, shuffle=True)
rf_cv = cross_val_score(estimator=rf, X=X, y=y, cv=rf_kfold, scoring="r2", n_jobs=-1)

print("Random Forest Regressor RMSE (Test):", rf_rmse)
print("Random Forest Regressor R^2 score(Test):", rf_r2_test)
print("Random Forest Regressor R^2 score(Train):", rf_r2_train)
print("Random Forest Regressor CV mean score(R^2):", rf_cv.mean())

**HyperParameter Tuning for the Random Forest Regressor**

In [None]:
# Randomised search (20 sampled candidates) over random-forest settings,
# scored by negative MSE.
param_dist = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    # 1.0 (= use all features) replaces the string "auto", which meant the same
    # thing for regressors but was removed in scikit-learn 1.3 and now makes
    # any candidate that samples it fail.
    "max_features": [1.0, "sqrt", "log2"],
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    n_jobs=-1,
    verbose=True,
    scoring="neg_mean_squared_error",
    # Fixed seed so the same 20 candidates are sampled on every run.
    random_state=0,
)

random_search.fit(X_train, y_train)

print(random_search.best_params_)

In [None]:
# Random forest refitted with hard-coded hyperparameters.
# random_state=0 matches the baseline forest and makes the metrics repeatable.
rf_best = RandomForestRegressor(
    n_estimators=400,
    min_samples_split=5,
    max_depth=None,
    max_features="log2",
    min_samples_leaf=1,
    random_state=0,
)
rf_best.fit(X_train, y_train)

ypred_train = rf_best.predict(X_train)
ypred_test = rf_best.predict(X_test)

# sqrt(MSE) instead of the deprecated `squared=False` keyword, matching the
# np.sqrt form used by other cells in this notebook.
rf_best_rmse = np.sqrt(mean_squared_error(y_test, ypred_test))
rf_best_r2_test = r2_score(y_test, ypred_test)
rf_best_r2_train = r2_score(y_train, ypred_train)

# 5-fold shuffled CV (R^2) over X and y.
k = 5
rf_kfold = KFold(n_splits=k, random_state=0, shuffle=True)
rf_best_cv = cross_val_score(
    estimator=rf_best, X=X, y=y, cv=rf_kfold, scoring="r2", n_jobs=-1
)

print("Random Forest Regressor RMSE (Test):", rf_best_rmse)
print("Random Forest Regressor R^2 score(Test):", rf_best_r2_test)
print("Random Forest Regressor R^2 score(Train):", rf_best_r2_train)
print("Random Forest Regressor CV mean score(R^2):", rf_best_cv.mean())

In [None]:
# Summary table: one row per model, with train R^2, test R^2 and mean CV R^2.
# The four lists are positional, so each must keep the same model order as "models".
# NOTE: this rebinds `data`, which the normality-check loop used for a column.
data = {
    "models": [
        "Linear Regression",
        "Lasso",
        "Ridge",
        "ElasticNet",
        "XGBoost",
        "Gradient Boosting Regressor",
        "Decision Tree Regressor",
        "Random Forest Regressor",
    ],
    "Train R^2": [
        linear_r2_score_train,
        lasso_r2_score_train,
        ridge_r2_score_train,
        ent_r2_score_train,
        xgb_grid_r2_train,
        gbr_r2_score_train,
        dt_r2_train,
        rf_r2_train,
    ],
    "Test R^2": [
        linear_r2_score_test,
        lasso_r2_score_test,
        ridge_r2_score_test,
        ent_r2_score_test,
        xgb_grid_r2_test,
        gbr_r2_score_test,
        dt_r2_test,
        rf_r2_test,
    ],
    # No cross-validation was run for Lasso and Ridge, hence the None entries.
    "CV Score": [
        cv_linear.mean(),
        None,
        None,
        ent_cross_val.mean(),
        xgb_grid_cv.mean(),
        gbr_cv.mean(),
        dt_cv.mean(),
        rf_cv.mean(),
    ],
}

model_results = pd.DataFrame(data)

model_results

In [None]:
# Adding the AdaBoost and Support Vector Regressor rows.
# The numbers are hard-coded rather than read from variables.
# NOTE: re-running this cell adds the same rows again.

additional_data = {
    "models": ["AdaBoost Regressor", "Support Vector Regressor"],
    "Train R^2": [0.7458336089169646, 0.5829699787103915],
    "Test R^2": [0.7025392796430312, 0.5972579729022283],
    "CV Score": [0.2706294228097595, 0.00714837840855771],
}

additional_data = pd.DataFrame(additional_data)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
model_results = pd.concat([model_results, additional_data], ignore_index=True)

model_results

In [None]:
# Additional SVR rows, one per kernel.
# The numbers are hard-coded; train R^2 is left empty for these rows.
# NOTE: re-running this cell adds the same rows again.

additional_svr_data = {
    "models": [
        "Support Vector Regressor(Linear)",
        "Support Vector Regressor(Poly)",
        "Support Vector Regressor(rbf)",
        "Support Vector Regressor(sigmoid)",
    ],
    "Train R^2": [None, None, None, None],
    "Test R^2": [
        0.695419935454759,
        0.5549125603135387,
        0.5972579729022283,
        -3158.4152173762855,
    ],
    "CV Score": [
        0.6870751989720134,
        0.5120484354271622,
        0.5563428220746509,
        -1821.4414598670064,
    ],
}

additional_svr_data = pd.DataFrame(additional_svr_data)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
model_results = pd.concat([model_results, additional_svr_data], ignore_index=True)

model_results

**Hybrid Modelling**

**Random Forest Regressor and XGBoost**

In [None]:
# Hybrid model: unweighted average of a random forest and an XGBoost regressor.

# --- Base models fitted on the training split ---
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
ypred_train = rf.predict(X_train)
ypred_test = rf.predict(X_test)

# Kept in a dict so the cross-validation loop below can build identical models.
xgb_params = {
    "objective": "reg:squarederror",
    "alpha": 0.01,
    "gamma": 0.01,
    "reg_lambda": 0.1,
    "max_depth": 6,
    "n_estimators": 150,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_model_ypred_train = xgb_model.predict(X_train)
xgb_model_ypred_test = xgb_model.predict(X_test)

hybrid_ypred_train = (ypred_train + xgb_model_ypred_train) / 2
hybrid_ypred_test = (ypred_test + xgb_model_ypred_test) / 2

# --- Metrics for the Hybrid Model ---
hybrid_rmse_test = np.sqrt(mean_squared_error(y_test, hybrid_ypred_test))
hybrid_r2_score_train = r2_score(y_train, hybrid_ypred_train)
hybrid_r2_score_test = r2_score(y_test, hybrid_ypred_test)

# Cross-validate the averaged ensemble itself. Calling cross_val_score with
# estimator=rf would only score the random forest, not the hybrid.
# Features and target are rebuilt from df so the folds always use the full
# feature matrix and the price target, whatever X and y currently hold.
k = 5
hybrid_kfold = KFold(n_splits=k, shuffle=True, random_state=0)
cv_features = df.drop(columns="Final Price")
cv_target = df["Final Price"]

hybrid_fold_scores = []
for fit_idx, eval_idx in hybrid_kfold.split(cv_features):
    fold_X_fit, fold_y_fit = cv_features.iloc[fit_idx], cv_target.iloc[fit_idx]
    fold_X_eval, fold_y_eval = cv_features.iloc[eval_idx], cv_target.iloc[eval_idx]

    fold_rf = RandomForestRegressor(n_estimators=100, random_state=0)
    fold_xgb = xgb.XGBRegressor(**xgb_params)
    fold_rf.fit(fold_X_fit, fold_y_fit)
    fold_xgb.fit(fold_X_fit, fold_y_fit)

    fold_pred = (fold_rf.predict(fold_X_eval) + fold_xgb.predict(fold_X_eval)) / 2
    hybrid_fold_scores.append(r2_score(fold_y_eval, fold_pred))

hybrid_cv = np.array(hybrid_fold_scores)

print("Hybrid R^2 test(Train):", hybrid_r2_score_train)
print("Hybrid R^2 test(Test):", hybrid_r2_score_test)
print("Hybrid RMSE (Test):", hybrid_rmse_test)
print("Hybrid Cross Validation Test:", hybrid_cv.mean())

# --- Predicted vs actual; the red points trace the y = x reference ---
plt.scatter(y_test, hybrid_ypred_test, c="b", label="Predicted", alpha=0.5)
plt.scatter(y_test, y_test, c="r", label="Actual", alpha=0.5)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual values v Predicted Values")
plt.legend()
plt.show()

# --- Residuals, coloured by their absolute size ---
residuals = y_test - hybrid_ypred_test
colors = np.abs(residuals)

# No legend here: the scatter has no labelled artists; the colour bar explains
# the encoding.
plt.scatter(y_test, residuals, c=colors, cmap="coolwarm", alpha=0.7)
plt.xlabel("Actual Values")
plt.ylabel("Residuals")
plt.title("Actual Values V Residual Values")
plt.colorbar(label="Residual Magnitude")
plt.show()

# --- Error distribution with its centre and spread marked ---
errors = y_test - hybrid_ypred_test

# Measures of central tendency and spread
mean_error = np.mean(errors)
median_error = np.median(errors)
std_error = np.std(errors)

# Percentage of errors within one standard deviation of the mean
within_one_std = errors[
    (errors > mean_error - std_error) & (errors < mean_error + std_error)
]
percentage = len(within_one_std) / len(errors) * 100

print(
    f"Approximately {percentage:.2f}% of errors are within one standard deviation of the mean"
)

# The guide lines go on the same axes as the histogram, before plt.show();
# drawn after it they would land on a new, empty figure.
sns.histplot(errors, kde=True)
plt.axvline(mean_error, color="r", label=f"Mean error {mean_error}", linestyle="--")
plt.axvline(
    median_error, color="b", label=f"Median error {median_error}", linestyle="--"
)
plt.axvline(
    std_error + mean_error,
    color="g",
    label=f"Standard Error {std_error}",
    linestyle="--",
)
plt.axvline(mean_error - std_error, color="g", linestyle="--")
plt.xlabel("Error")
plt.ylabel("Frequency")
plt.title("Error Distribution of the hybrid model")
plt.legend()
plt.show()

**Light Gradient Boosting Machine and XGBoost**

In [None]:
# Hybrid model: unweighted average of a LightGBM regressor and the XGBoost
# predictions already held in xgb_model_ypred_train / xgb_model_ypred_test.
lgb = lgbm.LGBMRegressor(n_estimators=100, max_depth=2, random_state=0)
lgb.fit(X_train, y_train)

lgb_ypred_train = lgb.predict(X_train)
lgb_ypred_test = lgb.predict(X_test)

hybrid_lgb_pred_train = (lgb_ypred_train + xgb_model_ypred_train) / 2
hybrid_lgb_pred_test = (lgb_ypred_test + xgb_model_ypred_test) / 2

# --- Metrics for the Hybrid Model ---
hybrid_lgb_rmse_test = np.sqrt(mean_squared_error(y_test, hybrid_lgb_pred_test))
hybrid_lgb_r2_score_train = r2_score(y_train, hybrid_lgb_pred_train)
hybrid_lgb_r2_score_test = r2_score(y_test, hybrid_lgb_pred_test)

# Cross-validate the averaged ensemble itself. Calling cross_val_score with
# estimator=lgb would only score LightGBM, not the hybrid.
# Features and target are rebuilt from df so the folds always use the full
# feature matrix and the price target, whatever X and y currently hold.
# The XGBoost settings repeat those of the model in the RF + XGBoost hybrid cell.
k = 5
hybrid_lgb_kfold = KFold(n_splits=k, shuffle=True, random_state=0)
cv_features = df.drop(columns="Final Price")
cv_target = df["Final Price"]

hybrid_lgb_fold_scores = []
for fit_idx, eval_idx in hybrid_lgb_kfold.split(cv_features):
    fold_X_fit, fold_y_fit = cv_features.iloc[fit_idx], cv_target.iloc[fit_idx]
    fold_X_eval, fold_y_eval = cv_features.iloc[eval_idx], cv_target.iloc[eval_idx]

    fold_lgb = lgbm.LGBMRegressor(n_estimators=100, max_depth=2, random_state=0)
    fold_xgb = xgb.XGBRegressor(
        objective="reg:squarederror",
        alpha=0.01,
        gamma=0.01,
        reg_lambda=0.1,
        max_depth=6,
        n_estimators=150,
    )
    fold_lgb.fit(fold_X_fit, fold_y_fit)
    fold_xgb.fit(fold_X_fit, fold_y_fit)

    fold_pred = (fold_lgb.predict(fold_X_eval) + fold_xgb.predict(fold_X_eval)) / 2
    hybrid_lgb_fold_scores.append(r2_score(fold_y_eval, fold_pred))

hybrid_lgb_cv = np.array(hybrid_lgb_fold_scores)

print("Hybrid R^2 test(Train):", hybrid_lgb_r2_score_train)
print("Hybrid R^2 test(Test):", hybrid_lgb_r2_score_test)
print("Hybrid RMSE (Test):", hybrid_lgb_rmse_test)
print("Hybrid Cross Validation Test:", hybrid_lgb_cv.mean())

# --- Predicted vs actual; the red points trace the y = x reference ---
plt.scatter(y_test, hybrid_lgb_pred_test, c="b", label="Predicted", alpha=0.5)
plt.scatter(y_test, y_test, c="r", label="Actual", alpha=0.5)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual values v Predicted Values")
plt.legend()
plt.show()

# --- Residuals, coloured by their absolute size ---
residuals = y_test - hybrid_lgb_pred_test
colors = np.abs(residuals)

# No legend here: the scatter has no labelled artists; the colour bar explains
# the encoding.
plt.scatter(y_test, residuals, c=colors, cmap="coolwarm", alpha=0.7)
plt.xlabel("Actual Values")
plt.ylabel("Residuals")
plt.title("Actual Values V Residual Values")
plt.colorbar(label="Residual Magnitude")
plt.show()

# --- Error distribution with its centre and spread marked ---
errors = y_test - hybrid_lgb_pred_test

# Measures of central tendency and spread
mean_error = np.mean(errors)
median_error = np.median(errors)
std_error = np.std(errors)

# Percentage of errors within one standard deviation of the mean
within_one_std = errors[
    (errors > mean_error - std_error) & (errors < mean_error + std_error)
]
percentage = len(within_one_std) / len(errors) * 100

print(
    f"Approximately {percentage:.2f}% of errors are within one standard deviation of the mean"
)

# The guide lines go on the same axes as the histogram, before plt.show();
# drawn after it they would land on a new, empty figure.
sns.histplot(errors, kde=True)
plt.axvline(mean_error, color="r", label=f"Mean error {mean_error}", linestyle="--")
plt.axvline(
    median_error, color="b", label=f"Median error {median_error}", linestyle="--"
)
plt.axvline(
    std_error + mean_error,
    color="g",
    label=f"Standard Error {std_error}",
    linestyle="--",
)
plt.axvline(mean_error - std_error, color="g", linestyle="--")
plt.xlabel("Error")
plt.ylabel("Frequency")
plt.title("Error Distribution of the hybrid model")
plt.legend()
plt.show()

**Visualizing the Best Model XGBoost**

In [None]:
# Diagnostic plots for the XGBoost model alone, using the predictions currently
# held in xgb_model_ypred_test.

# --- Predicted vs actual; the red points trace the y = x reference ---
plt.scatter(y_test, xgb_model_ypred_test, c="b", label="Predicted", alpha=0.5)
plt.scatter(y_test, y_test, c="r", label="Actual", alpha=0.5)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual values v Predicted Values")
plt.legend()
plt.show()

# --- Residuals, coloured by their absolute size ---
residuals = y_test - xgb_model_ypred_test
colors = np.abs(residuals)

# No legend here: the scatter has no labelled artists; the colour bar explains
# the encoding.
plt.scatter(y_test, residuals, c=colors, cmap="autumn_r", alpha=0.7)
plt.xlabel("Actual Values")
plt.ylabel("Residuals")
plt.title("Actual Values V Residual Values")
plt.colorbar(label="Residual Magnitude")
plt.show()

# --- Error distribution with its centre and spread marked ---
errors = y_test - xgb_model_ypred_test

# Measures of central tendency and spread
mean_error = np.mean(errors)
median_error = np.median(errors)
std_error = np.std(errors)

# Percentage of errors within one standard deviation of the mean
within_one_std = errors[
    (errors > mean_error - std_error) & (errors < mean_error + std_error)
]
percentage = len(within_one_std) / len(errors) * 100

print(
    f"Approximately {percentage:.2f}% of errors are within one standard deviation of the mean"
)

# The guide lines go on the same axes as the histogram, before plt.show();
# drawn after it they would land on a new, empty figure.
sns.histplot(errors, kde=True)
plt.axvline(mean_error, color="r", label=f"Mean error {mean_error}", linestyle="--")
plt.axvline(
    median_error, color="b", label=f"Median error {median_error}", linestyle="--"
)
plt.axvline(
    std_error + mean_error,
    color="g",
    label=f"Standard Error {std_error}",
    linestyle="--",
)
plt.axvline(mean_error - std_error, color="g", linestyle="--")
plt.xlabel("Error")
plt.ylabel("Frequency")
# This cell plots XGBoost on its own, so the title no longer says "hybrid model".
plt.title("Error Distribution of the XGBoost model")
plt.legend()
plt.show()