In [None]:
# Import Libraries
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

# Load the Data
# Both CSV files are read from the same folder. Keeping that folder in a single
# constant means the notebook can be pointed at another copy of the data by
# editing one line instead of every read call.
DATA_DIR = Path('C:\\Users\\ESHOP\\Documents\\Kaggle Practice Project\\Datasets')

train = pd.read_csv(DATA_DIR / 'train (1).csv')
test = pd.read_csv(DATA_DIR / 'test (1).csv')

In [None]:
# Data Preprocessing
# Parse the 'date' column into datetimes so calendar parts can be read from it.
train['date'] = pd.to_datetime(train['date'])

# Extract features from date
# Each pair is (new column name, pandas `.dt` attribute that supplies its values).
train_date_parts = train['date'].dt
for column_name, dt_attribute in (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('day_of_week', 'dayofweek'),
):
    train[column_name] = getattr(train_date_parts, dt_attribute)

In [None]:
# Handle promotions: fill missing values
# Any missing entry in 'promo' is replaced with 0; present values are kept.
promo_without_gaps = train['promo'].fillna(0)
train['promo'] = promo_without_gaps

In [None]:
# Handle categorical variables
# One-hot encode the two identifier columns. drop_first=True omits the first
# level of each, so that level is represented by all of its indicators being 0.
identifier_columns = ['store_id', 'item_id']
train = pd.get_dummies(train, columns=identifier_columns, drop_first=True)

In [None]:
# Drop unnecessary columns and prepare features and target
# NOTE(review): the target is assumed to be 'units_sold', while 'sales' is
# excluded as a non-feature and the submission column written later is also
# named 'sales' — confirm against the dataset which column is the real target.
TARGET_COLUMN = 'units_sold'
NON_FEATURE_COLUMNS = ['id', 'date', 'sales']

# X and y are built as new objects rather than by dropping from `train` in
# place, so this cell can be re-run without a KeyError on columns that a
# previous run already removed. `train` itself is left unchanged here.
X = train.drop(columns=NON_FEATURE_COLUMNS + [TARGET_COLUMN])
y = train[TARGET_COLUMN]

In [None]:
# Model Training
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest of 100 trees, seeded so repeated fits give the same model.
forest_settings = dict(n_estimators=100, random_state=42)
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)

In [None]:
# Validate the model
# Score the held-out rows: RMSLE is the square root of the mean squared
# logarithmic error between the true and predicted values.
y_val_pred = model.predict(X_val)
mean_sq_log_err = mean_squared_log_error(y_val, y_val_pred)
rmsle = np.sqrt(mean_sq_log_err)
print(f'Validation RMSLE: {rmsle}')

In [None]:
# Prediction
# Preprocess the test data
# Same date and promo handling as the training frame.
test['date'] = pd.to_datetime(test['date'])
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['day_of_week'] = test['date'].dt.dayofweek
test['promo'] = test['promo'].fillna(0)

# One-hot encode WITHOUT drop_first. Which level counts as "first" depends on
# the values present in the frame being encoded, so dropping it here could
# remove a different store/item than the one dropped from the training data.
# All levels are kept and then cut down to the training columns below.
test = pd.get_dummies(test, columns=['store_id', 'item_id'])

# A training feature that is absent from test and is not a one-hot indicator
# is a real schema problem: fail loudly instead of inventing a column of zeros.
indicator_prefixes = ('store_id_', 'item_id_')
absent_features = [
    column for column in X_train.columns
    if column not in test.columns
    and not str(column).startswith(indicator_prefixes)
]
if absent_features:
    raise ValueError(f'Test data is missing feature columns: {absent_features}')

# Keep 'id' and 'date' (used by later cells) followed by exactly the columns
# the model was fitted on, in the same order. Indicators for stores/items that
# never occur in test are added as 0; indicators that were not training
# features (the training baseline level, or a level unseen in training) are
# dropped, which leaves those rows encoded as the baseline level.
test = test.reindex(columns=['id', 'date', *X_train.columns], fill_value=0)

In [None]:
# Ensure test data has the same columns as train data
X_test = test.drop(['id', 'date'], axis=1)

# Compare against the columns the model was fitted on, and report exactly which
# ones differ rather than relying on the estimator's own input validation.
expected_columns = X_train.columns
missing_columns = expected_columns.difference(X_test.columns)
unexpected_columns = X_test.columns.difference(expected_columns)
if len(missing_columns) or len(unexpected_columns):
    raise ValueError(
        'Test features do not match the training features: '
        f'missing={list(missing_columns)}, unexpected={list(unexpected_columns)}'
    )

# The column sets match; put them in the order used during fitting.
X_test = X_test[expected_columns]

# Make predictions
sales_predictions = model.predict(X_test)


In [None]:
# Prepare Submission
# Two columns: the test row ids and the model's prediction for each row.
submission_columns = {'id': test['id'], 'sales': sales_predictions}
submission = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the CSV.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print('Submission file created: submission.csv')
