In [None]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the CSV (parsing the 'date' column as datetimes) and give the raw
# columns shorter working names.
file_path = "D:/470Final/didfinal02.csv"

# Raw column name -> name used throughout the rest of the notebook
column_names = {
    'BUYER_DEA_NO': 'buyer',
    'allunits': 'total_units',
    'units': 'total_oxy',
    'Dummy_Before': 'pre',
    'Dummy_After': 'post',
    'chaindummy': 'chain',
    'unemployment_rate': 'unemploy',
    'labor_force': 'lf',
}
df = pd.read_csv(file_path, parse_dates=['date']).rename(columns=column_names)

# Quick sanity check: column dtypes first, then the first few rows
for preview in (df.dtypes, df.head()):
    print(preview)

In [None]:
# Model 'total_oxy' using only the 'pre', 'post' and 'chain' columns
feature_cols = ['pre', 'post', 'chain']
X = df[feature_cols]
y = df['total_oxy']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded gradient boosting regressor with default hyperparameters
model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out rows
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

# Horizontal bar chart of the fitted model's feature importances, ordered
# ascending so the largest importance gets the highest y position
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns[sorted_idx])
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Model 'total_oxy' using 'pre', 'post', 'chain' plus the 'unemploy' and 'lf' columns
feature_cols = ['pre', 'post', 'chain', 'unemploy', 'lf']
X = df[feature_cols]
y = df['total_oxy']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded gradient boosting regressor with default hyperparameters
model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Score the fitted model on the held-out rows
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

# Horizontal bar chart of the fitted model's feature importances, ordered
# ascending so the largest importance gets the highest y position
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(X.columns[sorted_idx])
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()