### A Random Forest Regressor is trained on sampled data 
### Evaluated on a validation set 
### Used to predict sales for the test data 
### Finally, predictions are saved in a submission file for the competition

# Importing Required Libraries

In [89]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# Step 1: Data Loading


In [90]:
data = pd.read_csv("/Users/sandeepsrinivasdwaram/Downloads/store-sales-time-series-forecasting/train.csv")
test_data = pd.read_csv("/Users/sandeepsrinivasdwaram/Downloads/store-sales-time-series-forecasting/test.csv")
store_info = pd.read_csv("/Users/sandeepsrinivasdwaram/Downloads/store-sales-time-series-forecasting/stores.csv")
oil_prices = pd.read_csv("/Users/sandeepsrinivasdwaram/Downloads/store-sales-time-series-forecasting/oil.csv")
holidays = pd.read_csv("/Users/sandeepsrinivasdwaram/Downloads/store-sales-time-series-forecasting/holidays_events.csv")

In [91]:
data.head(5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [92]:
test_data.head(5)

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


# Step 2: Data Merging

In [93]:
data = data.merge(store_info, on="store_nbr", how="left")
data = data.merge(oil_prices, on="date", how="left")
data = data.merge(holidays, on="date", how="left")
test_data = test_data.merge(store_info, on="store_nbr", how="left")
test_data = test_data.merge(oil_prices, on="date", how="left")
test_data = test_data.merge(holidays, on="date", how="left")

In [94]:
data.shape

(3054348, 16)

In [95]:
test_data.shape

(28512, 15)

# Step 3: Handling Missing Values

In [96]:
data["onpromotion"].fillna(0, inplace=True)
data["dcoilwtico"].fillna(method="ffill", inplace=True)
data["dcoilwtico"].fillna(method="bfill", inplace=True)
test_data["onpromotion"].fillna(0, inplace=True)
test_data["dcoilwtico"].fillna(method="ffill", inplace=True)
test_data["dcoilwtico"].fillna(method="bfill", inplace=True)

# Step 4: Features

In [97]:
data["day"] = pd.to_datetime(data["date"]).dt.day
data["month"] = pd.to_datetime(data["date"]).dt.month
data["year"] = pd.to_datetime(data["date"]).dt.year
data["day_of_week"] = pd.to_datetime(data["date"]).dt.dayofweek
test_data["day"] = pd.to_datetime(test_data["date"]).dt.day
test_data["month"] = pd.to_datetime(test_data["date"]).dt.month
test_data["year"] = pd.to_datetime(test_data["date"]).dt.year
test_data["day_of_week"] = pd.to_datetime(test_data["date"]).dt.dayofweek

In [98]:
# Define categorical columns
categorical_columns = ["family", "city", "state", "type", "cluster"]

In [99]:
available_columns_data = [col for col in categorical_columns if col in data.columns]
available_columns_test_data = [col for col in categorical_columns if col in test_data.columns]

In [100]:
data = pd.get_dummies(data, columns=available_columns_data, drop_first=True)
test_data = pd.get_dummies(test_data, columns=available_columns_test_data, drop_first=True)

In [101]:
test_data = test_data.reindex(columns=data.columns, fill_value=0)

In [102]:
test_data = test_data.drop(columns=["sales"], errors="ignore")

# Step 5: Define Features and Target

In [103]:
X = data.drop(columns=["id", "date", "sales"])
y = data["sales"]
X_test = test_data.drop(columns=["id", "date"], errors="ignore")

In [104]:
categorical_columns = ['family', 'city', 'state', 'type', 'cluster']
for col in categorical_columns:
    if col in data.columns:
        data[col] = pd.Categorical(data[col]).codes
        test_data[col] = pd.Categorical(test_data[col]).codes
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in categorical_columns:
    if col in data.columns:
        data[col] = le.fit_transform(data[col].astype(str))
        test_data[col] = le.transform(test_data[col].astype(str))

In [105]:
X.shape

(3054348, 97)

In [106]:
y.shape

(3054348,)

In [107]:
X_test.shape

(28512, 97)

# Step 6: Train-Test Split

In [108]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Encoding for Non-Numeric Data

In [109]:
le = LabelEncoder()
for col in X_train.select_dtypes(include=['object']).columns:
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

In [110]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (2443478, 97)
y_train shape: (2443478,)


In [111]:
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

In [112]:
non_numeric_columns = X_val.select_dtypes(include=['object']).columns
print(non_numeric_columns)
print(X_val[non_numeric_columns].head())

Index(['type_x', 'type_y', 'locale', 'locale_name', 'description',
       'transferred'],
      dtype='object')
        type_x      type_y locale locale_name           description  \
668753       D         NaN    NaN         NaN                   NaN   
1267004      D  Additional  Local       Quito  Fundacion de Quito-1   
997442       A         NaN    NaN         NaN                   NaN   
873596       B         NaN    NaN         NaN                   NaN   
756145       D         NaN    NaN         NaN                   NaN   

        transferred  
668753          NaN  
1267004       False  
997442          NaN  
873596          NaN  
756145          NaN  


In [113]:
X_val[['type_x', 'type_y', 'locale', 'locale_name', 'description']] = \
    X_val[['type_x', 'type_y', 'locale', 'locale_name', 'description']].fillna('Unknown')

In [114]:
X_val['transferred'] = X_val['transferred'].fillna(False)

In [115]:
X_val = pd.get_dummies(X_val, columns=['type_x', 'type_y', 'locale', 'locale_name', 'description'], drop_first=True)

In [116]:
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

In [117]:
print(X_val.dtypes)
assert X_val.select_dtypes(include=['object']).empty, "Non-numeric columns still exist!"

store_nbr        int64
onpromotion      int64
type_x           int64
dcoilwtico     float64
type_y           int64
                ...   
cluster_13        bool
cluster_14        bool
cluster_15        bool
cluster_16        bool
cluster_17        bool
Length: 97, dtype: object


# Step 8: Model Training

In [118]:
X_train_sample = X_train.sample(n=1000, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

In [119]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_sample, y_train_sample)

# Step 9: Validation

In [120]:
y_pred = model.predict(X_val)
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))
print(f"Validation RMSLE: {rmsle}")

Validation RMSLE: 2.5990431686995312


# Step 10: Test Predictions

In [121]:
test_data["sales"] = model.predict(X_test)
test_data["sales"] = np.clip(test_data["sales"], 0, None)

# Step 11: Submission File

In [122]:
submission = test_data[["id", "sales"]]
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [123]:
submission

Unnamed: 0,id,sales
0,3000888,8.059590
1,3000889,7.939590
2,3000890,118.364619
3,3000891,4446.170000
4,3000892,7.909590
...,...,...
28507,3029395,280.657042
28508,3029396,93.266980
28509,3029397,1188.462862
28510,3029398,342.903810
