In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    StackingRegressor,
)
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from scipy.stats import norm, skew
from scipy.special import boxcox1p


In [None]:
# Load the raw dataset from the Excel binary workbook.
data = pd.read_excel("filename.xlsb")

# Drop the columns that are not used anywhere in the analysis.
data = data.drop(columns=["material_ref", "product_ref", "delivery date"])

# Impute missing values in the numeric columns with that column's mean.
# numeric_only=True matters: on pandas >= 2.0 a plain DataFrame.mean() raises
# a TypeError as soon as the frame contains a non-numeric column (the
# "status" / "item type" columns that are one-hot encoded later are
# presumably string-valued). Missing values in non-numeric columns are left
# untouched by this step.
data = data.fillna(data.mean(numeric_only=True))


In [None]:
# One-hot encode the categorical variables.
categorical_columns = ["status", "item type"]
enc = OneHotEncoder()
encoded_array = enc.fit_transform(data[categorical_columns]).toarray()

# Build the encoded frame with descriptive *string* column names and the same
# index as `data`:
#   * integer column labels (the DataFrame default) would be mixed with the
#     string labels already in `data`, and scikit-learn estimators reject
#     frames whose feature names are not all strings;
#   * reusing `data.index` guarantees that pd.concat aligns row-for-row
#     instead of relying on both frames happening to share a default index.
encoded_data = pd.DataFrame(
    encoded_array,
    columns=enc.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Swap the original categorical columns for their one-hot representation.
data = pd.concat([data.drop(columns=categorical_columns), encoded_data], axis=1)

# Rescale the selected numeric features with a MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling — consider fitting on the training rows only.
numerical_columns = ["quantity tons", "thickness", "width"]
scaler = MinMaxScaler()
scaler.fit(data[numerical_columns])
data[numerical_columns] = scaler.transform(data[numerical_columns])

# Pairwise scatter plots / histograms for a visual overview of the variables.
sns.pairplot(data)
plt.show()

# Pearson correlation between the numeric variables.
# numeric_only=True keeps this working on pandas >= 2.0, where DataFrame.corr()
# raises if any non-numeric column is still present in the frame.
corr = data.corr(numeric_only=True)

# Draw on an explicit Axes so the figure can be titled and stands on its own.
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Correlation matrix")
plt.show()


In [None]:
# Scatter plot of the target 'selling_price' against the numeric feature
# 'quantity tons'.
sns.scatterplot(
    data=data,
    x="quantity tons",
    y="selling_price",
)
plt.show()

# Separate the target from the features, then hold out 10% of the rows as a
# test set (fixed random_state for a repeatable split).
y = data["selling_price"]
X = data.drop(columns=["selling_price"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)


In [None]:
# Build and compare a set of regression models with default hyperparameters.
# Estimators with a stochastic training procedure get a fixed seed so that the
# metrics table below is reproducible across kernel restarts.
MODEL_RANDOM_STATE = 42

models = [
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("Lasso Regression", Lasso()),
    ("ElasticNet Regression", ElasticNet()),
    ("Decision Tree Regression", DecisionTreeRegressor(random_state=MODEL_RANDOM_STATE)),
    ("Random Forest Regression", RandomForestRegressor(random_state=MODEL_RANDOM_STATE)),
    ("Gradient Boosting Regression", GradientBoostingRegressor(random_state=MODEL_RANDOM_STATE)),
    ("Neural Network Regression", MLPRegressor(random_state=MODEL_RANDOM_STATE)),
]

# Fit every model on the training split and score it on the held-out split.
results = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results.append((name, r2, mae, mse))

# One row per model: name, R2, mean absolute error, mean squared error.
results_df = pd.DataFrame(results, columns=["Model", "R2 Score", "MAE", "MSE"])
print(results_df)


In [None]:
# Search space for GridSearchCV: each key is an estimator hyperparameter name
# and each value is the list of candidate settings to try.
params = dict(
    n_estimators=[50, 100, 150],
    max_depth=[2, 4, 6, 8],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 2, 4],
)


In [None]:
# RandomForestRegressor is already imported in the first cell of the notebook,
# so it is not re-imported here.

# Baseline: random forest with fixed hyperparameters and a fixed seed.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the baseline model on the training set.
rf_reg.fit(X_train, y_train)

# Evaluate the baseline model on the testing set.
rf_reg_y_pred = rf_reg.predict(X_test)
rf_reg_r2_score = r2_score(y_test, rf_reg_y_pred)
rf_reg_mae = mean_absolute_error(y_test, rf_reg_y_pred)
rf_reg_mse = mean_squared_error(y_test, rf_reg_y_pred)

# Print evaluation metrics for the baseline random forest regression model.
print("Random Forest Regression Model Evaluation Metrics:")
print("R2 Score: ", rf_reg_r2_score)
print("Mean Absolute Error: ", rf_reg_mae)
print("Mean Squared Error: ", rf_reg_mse)

# Hyperparameter tuning: the search object has to be *fitted* to do anything.
# GridSearchCV clones `rf_reg` for every candidate, so the baseline model above
# is left untouched. This evaluates every combination in `params` with 5-fold
# cross-validation and can take a while on a large dataset.
rf_reg_grid = GridSearchCV(rf_reg, params, cv=5, n_jobs=-1, verbose=1)
rf_reg_grid.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the best candidate retrained
# on the whole training set; score it on the same held-out test set.
rf_reg_best = rf_reg_grid.best_estimator_
rf_reg_best_y_pred = rf_reg_best.predict(X_test)

print("Tuned Random Forest Regression Model Evaluation Metrics:")
print("Best Parameters: ", rf_reg_grid.best_params_)
print("Best Cross-Validation Score: ", rf_reg_grid.best_score_)
print("R2 Score: ", r2_score(y_test, rf_reg_best_y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, rf_reg_best_y_pred))
print("Mean Squared Error: ", mean_squared_error(y_test, rf_reg_best_y_pred))
