# Feature Selection

In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the training data and rank candidate predictors by their Pearson
# correlation with the target column `Weekly_Sales`.
train = pd.read_csv('train.csv')

# Restrict to numeric/boolean columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead. select_dtypes keeps the computation working on every pandas version.
corr_matrix = train.select_dtypes(include=['number', 'bool']).corr()
corr_values = corr_matrix['Weekly_Sales'].sort_values(ascending=False)

# Remove the target by label (rather than assuming it sorts into position 0)
# and keep the 12 columns with the highest correlation.
# NOTE(review): the ranking uses the signed correlation, so strongly negative
# predictors are never selected -- confirm this is intended.
features = corr_values.drop('Weekly_Sales').head(12).index.tolist()
features

['mean',
 'median',
 'min',
 'max',
 'std',
 'Size',
 'Dept',
 'Total_MarkDown',
 'Month',
 'Week',
 'Temperature',
 'Fuel_Price']

# Random Forest

In [54]:
from sklearn.ensemble import RandomForestRegressor

# Dates (as ISO strings) that mark promotional weeks; matching rows get a
# weight-like flag of 5, every other row gets 1.
promotional_weeks = ['2019-02-12', '2020-02-11', '2021-02-10', '2019-09-10', '2020-09-09',
                     '2021-09-07', '2019-11-26', '2020-11-25', '2019-12-31', '2020-12-30']
# NOTE(review): `Week` shows up in the correlation ranking, so it is numeric and
# can never equal one of these date strings -- the flag is then 1 for every row.
# The lookup probably belongs on the date column; confirm the column name.
# NOTE(review): `Promotion_Week` is not part of `features`, so the model below
# never sees it.
train['Promotion_Week'] = train['Week'].isin(promotional_weeks).map({True: 5, False: 1})

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

X_train = train_data[features]
y_train = train_data['Weekly_Sales']
X_val = val_data[features]
y_val = val_data['Weekly_Sales']

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
# The score is computed on the validation split, despite the label's wording.
print('Test set RMSE:', rmse)



Test set RMSE: 2394.86118759177


# Evaluation

In [55]:
# Score the held-out test file with the fitted model and write the submission.
test = pd.read_csv('test.csv')

# Same promotional-week flag as on the training frame: 5 on a listed week, else 1.
is_promo = test['Week'].isin(promotional_weeks)
test['Promotion_Week'] = is_promo.map({True: 5, False: 1})

# Missing values are replaced by 0 before the features are handed to the model.
test = test.fillna(0)

X_test = test[features]

# Despite its name, `y_test` holds the model's predictions, not ground truth.
y_test = model.predict(X_test)

test['Weekly_Sales'] = y_test
test.to_csv('sample.csv', columns=['id', 'Weekly_Sales'], index=False)
