In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Baseline sales model: ordinary least squares on standardized features.
# Steps: load -> select features/target -> 80/20 split -> scale -> fit -> score
# ---------------------------------------------------------------------------

# Location of the cleaned dataset (adjust path if necessary)
DATA_PATH = '../data/cleaned_supermart_data.csv'
# Column the model is trained to predict
TARGET_COLUMN = 'Sales'
# Columns removed from the feature matrix: identifiers, the raw date, the
# 'Month' column, and the target itself.
EXCLUDED_COLUMNS = ['Order_ID', 'Customer_Name', 'Order_Date', 'Sales', 'Month']

# Load the data and show which columns are available
data = pd.read_csv(DATA_PATH)
column_names = data.columns.tolist()
print("✅ Available Columns:", column_names)

# Separate the target from the predictors.
# NOTE(review): every column left in `features` is passed to StandardScaler,
# so they are presumably already numeric/encoded upstream -- confirm.
# NOTE(review): 'Profit' stays in the feature set; if it is only known after
# a sale is made it may leak information about 'Sales' -- confirm.
target = data[TARGET_COLUMN]
features = data.drop(columns=EXCLUDED_COLUMNS)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
print("✅ Data Split Completed")

# Standardize the features. The scaler is fitted on the training rows only,
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the linear regression on the scaled training data
model = LinearRegression().fit(X_train, y_train)
print("✅ Model Training Completed")

# Predict on the held-out rows and report the error metrics
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"📉 Mean Squared Error: {mse}")
print(f"📊 R-squared Score: {r2}")
print("✅ Model Building Completed!")


✅ Available Columns: ['Order_ID', 'Customer_Name', 'Category', 'Sub_Category', 'City', 'Order_Date', 'Region', 'Sales', 'Discount', 'Profit', 'State', 'Month', 'Order_Year']
✅ Data Split Completed
✅ Model Training Completed
📉 Mean Squared Error: 211276.5081733436
📊 R-squared Score: 0.3684277378738583
✅ Model Building Completed!
