In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Configuration: input location, columns excluded from modelling, join keys.
DATA_DIR = '/content'
# NOTE(review): the original cell labels these two columns "unnecessary";
# the rationale is not recorded here -- confirm before relying on it.
DROPPED_COLUMNS = ['MarkDown1', 'MarkDown5']
FEATURE_MERGE_KEYS = ['Store', 'Date', 'IsHoliday']


def build_model_frame(sales, stores, features, type_categories, how='inner'):
    """Join one sales table with store/feature data and derive model columns.

    The same function is used for the training and the test table so both
    receive exactly the same preprocessing.

    Parameters
    ----------
    sales : pandas.DataFrame
        Rows to enrich; must contain ``Store``, ``Date`` and ``IsHoliday``.
    stores : pandas.DataFrame
        Per-store attributes, joined on ``Store``; must provide ``Type``.
    features : pandas.DataFrame
        Extra attributes joined on ``Store``, ``Date`` and ``IsHoliday``.
    type_categories : list
        Full, ordered list of ``Type`` levels. Fixing it up front makes the
        one-hot columns (and the level removed by ``drop_first``) identical
        for every frame, whichever levels happen to occur in ``sales``.
    how : str, default 'inner'
        Join type used for both merges. ``'left'`` keeps every row of
        ``sales`` even when no store/feature record matches.

    Returns
    -------
    pandas.DataFrame
        Merged frame with ``Date`` parsed to datetime, added ``Year``,
        ``Month`` and ``Week`` columns, and ``Type`` one-hot encoded.

    Raises
    ------
    KeyError
        If a join key or one of ``DROPPED_COLUMNS`` is missing.
    """
    merged = (
        sales
        .merge(stores, on='Store', how=how)
        .merge(features, on=FEATURE_MERGE_KEYS, how=how)
        .drop(columns=DROPPED_COLUMNS)
        # Zero-fills every remaining gap, not only the markdown columns.
        # NOTE(review): confirm 0 is a sensible value for any non-markdown
        # column that has missing entries (e.g. in the test period).
        .fillna(0)
    )

    dates = pd.to_datetime(merged['Date'])
    merged = merged.assign(
        Date=dates,
        Year=dates.dt.year,
        Month=dates.dt.month,
        # isocalendar() yields the nullable UInt32 extension dtype; cast to a
        # plain integer so the estimator receives ordinary NumPy columns.
        Week=dates.dt.isocalendar().week.astype('int64'),
        Type=pd.Categorical(merged['Type'], categories=type_categories),
    )
    return pd.get_dummies(merged, columns=['Type'], drop_first=True)


# Load datasets
train = pd.read_csv(f'{DATA_DIR}/train.csv')
stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
features = pd.read_csv(f'{DATA_DIR}/features.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# One shared list of store types keeps train/test dummy columns consistent.
type_categories = sorted(stores['Type'].dropna().unique())

# Training rows keep the inner join (only fully matched rows are learned
# from); test rows use a left join so every row still receives a prediction.
data = build_model_frame(train, stores, features, type_categories, how='inner')
test_data = build_model_frame(test, stores, features, type_categories, how='left')

# Define features and target
X = data.drop(columns=['Weekly_Sales', 'Date'])
y = data['Weekly_Sales']
# Select exactly the training columns, in training order; this raises a
# KeyError instead of predicting on a silently mismatched feature matrix.
X_test_final = test_data.loc[:, X.columns]

# Split training dataset for evaluation
# NOTE(review): this is a random split of dated rows, so validation weeks can
# precede training weeks; consider a chronological hold-out if the score is
# meant to estimate forecasting accuracy.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out validation rows
y_pred = model.predict(X_val)

# Evaluation Metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Linear Regression Performance:")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")

# Predict on test data
test_predictions = model.predict(X_test_final)

# Save test predictions alongside the engineered test columns
test_data = test_data.assign(Predicted_Weekly_Sales=test_predictions)
test_data.to_csv('test_predictions.csv', index=False)


Linear Regression Performance:
Mean Squared Error: 473219590.410396
Mean Absolute Error: 14561.495518148897
R^2 Score: 0.09252940839736068
