In [None]:
import os
import matplotlib
import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

# Global plot appearance: seaborn dark grid, larger font, wide figures and a
# fully transparent figure background (alpha channel of the colour is 00).
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

In [None]:
od.download('https://www.kaggle.com/competitions/rossmann-store-sales/overview')

In [None]:
os.listdir('rossmann-store-sales')

In [None]:
ross_df = pd.read_csv('./rossmann-store-sales/train.csv', low_memory=False)

In [None]:
ross_df

In [None]:
store_df = pd.read_csv('./rossmann-store-sales/store.csv')

In [None]:
store_df

In [None]:
merged_df = ross_df.merge(store_df, how='left', on='Store')
merged_df

In [None]:
merged_df.shape

In [None]:
test_df = pd.read_csv('rossmann-store-sales/test.csv')

In [None]:
merged_test_df = test_df.merge(store_df, how='left', on='Store') 

In [None]:
merged_test_df

In [None]:
test_df

In [None]:
merged_df.info()

In [None]:
round(merged_df.describe().T,2)

In [None]:
 merged_df.duplicated().sum()

In [None]:
merged_df['Date'] = pd.to_datetime(merged_df.Date)

In [None]:
merged_test_df['Date'] = pd.to_datetime(merged_test_df.Date)

In [None]:
merged_test_df.Date.min(), merged_test_df.Date.max()

In [None]:
sns.histplot(data=merged_df, x='Sales')

In [None]:
merged_df.Open.value_counts()

Rows recorded on days when a store was closed are of no use for training the model, so we remove them and keep only the rows where the store was open.

In [None]:
merged_df = merged_df[merged_df.Open==1].copy()

In [None]:
sns.histplot(data=merged_df, x='Sales')

In [None]:
plt.figure(figsize=(18,8))
temp_df = merged_df.sample(40000)
sns.scatterplot(x=temp_df.Sales, y=temp_df.Customers, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title("Sales Vs Customers")
plt.show()

In [None]:
plt.figure(figsize=(18,8))
temp_df = merged_df.sample(40000)
sns.scatterplot(x=temp_df.Store, y=temp_df.Sales, hue=temp_df.Date.dt.year, alpha=0.8)
plt.title("Stores Vs Sales")
plt.show()

In [None]:
sns.barplot(data=merged_df, x='DayOfWeek', y='Sales')

In [None]:
sns.barplot(data=merged_df, x='Promo', y='Sales')

In [None]:
numeric_df = merged_df.select_dtypes(include=['number'])
numeric_df.corr()['Sales'].sort_values(ascending=False)

Now we will look at feature engineering.
Feature engineering is the process of creating new features by transforming existing features or by incorporating data from external sources.

In [None]:
merged_df

In [None]:
merged_df['Day'] = merged_df.Date.dt.day
merged_df['Month'] = merged_df.Date.dt.month
merged_df['Year'] = merged_df.Date.dt.year

In [None]:
merged_test_df['Day'] = merged_test_df.Date.dt.day
merged_test_df['Month'] = merged_test_df.Date.dt.month
merged_test_df['Year'] = merged_test_df.Date.dt.year

In [None]:
sns.barplot(data=merged_df, x='Year', y='Sales')

In [None]:
sns.barplot(data=merged_df, x='Month', y='Sales')

In [None]:
len(merged_df)

In [None]:
train_size = int(.75 * len(merged_df))
train_size

In [None]:
sorted_df = merged_df.sort_values('Date')
train_df, val_df = sorted_df[:train_size], sorted_df[train_size:]

In [None]:
len(train_df), len(val_df)

In [None]:
train_df

In [None]:
train_df.Date.min(), train_df.Date.max()

In [None]:
# Date range covered by the validation split (first and last date).
val_df.Date.min(), val_df.Date.max()

In [None]:
train_df.columns

In [None]:
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'StoreType',
              'Assortment', 'Day', 'Month', 'Year']

In [None]:
target_col = 'Sales'

In [None]:
merged_df[input_cols].nunique()

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [None]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [None]:
numeric_cols = ['Store', 'Day', 'Month', 'Year']
categorical_cols = ['DayOfWeek', 'Promo', 'StateHoliday', 'StoreType', 'Assortment']

In [None]:
test_inputs = merged_test_df[input_cols].copy()

In [None]:
#For handling Missing Data
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean').fit(train_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler().fit(train_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(train_inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [None]:
train_inputs

In [None]:
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [None]:
X_train

In [None]:
def return_mean(inputs, targets=None):
    """Baseline "model": predict the same mean target value for every row.

    Args:
        inputs: Any sized collection; only ``len(inputs)`` is used.
        targets: Values whose mean becomes the constant prediction. Defaults
            to the notebook-level ``train_targets`` so that the baseline is
            computed from training rows only and validation sales do not
            leak into it.

    Returns:
        NumPy array of length ``len(inputs)`` filled with the mean of
        ``targets``.
    """
    if targets is None:
        targets = train_targets
    return np.full(len(inputs), np.mean(targets))

In [None]:
train_preds = return_mean(X_train)

In [None]:
train_preds

In [None]:
from sklearn.metrics import root_mean_squared_error
root_mean_squared_error(return_mean(X_val), val_targets)

In [None]:
root_mean_squared_error(train_preds, train_targets)

In [None]:
def guess_random(inputs, targets=None, seed=None):
    """Baseline "model": predict a uniformly random value for every row.

    Args:
        inputs: Any sized collection; only ``len(inputs)`` is used.
        targets: Values whose minimum and maximum bound the random guesses.
            Defaults to the notebook-level ``train_targets`` so the range is
            taken from training rows only.
        seed: Optional seed for the random generator. ``None`` (the default)
            gives a different draw on every call; pass an int for
            reproducible output.

    Returns:
        NumPy array of length ``len(inputs)`` with values drawn uniformly
        from the range spanned by ``targets``.
    """
    if targets is None:
        targets = train_targets
    lo, hi = np.min(targets), np.max(targets)
    rng = np.random.default_rng(seed)
    return rng.random(len(inputs)) * (hi - lo) + lo

In [None]:
train_preds = guess_random(X_train)
train_preds

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linreg = LinearRegression()

In [None]:
linreg.fit(X_train, train_targets)

In [None]:
train_preds = linreg.predict(X_train)
train_preds

In [None]:
root_mean_squared_error(train_preds, train_targets)

In [None]:
val_preds = linreg.predict(X_val)
val_preds

In [None]:
root_mean_squared_error(val_preds, val_targets)

In [None]:
def try_model(model):
    """Fit ``model`` on the training split and score it on both splits.

    Uses the notebook-level ``X_train`` / ``train_targets`` for fitting and
    ``X_val`` / ``val_targets`` for validation.

    Returns:
        Tuple ``(train_rmse, val_rmse)``.
    """
    model.fit(X_train, train_targets)

    train_rmse = root_mean_squared_error(train_targets, model.predict(X_train))
    val_rmse = root_mean_squared_error(val_targets, model.predict(X_val))
    return train_rmse, val_rmse

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor

In [None]:
try_model(LinearRegression())

In [None]:
try_model(Ridge())

In [None]:
try_model(Lasso())

In [None]:
try_model(SGDRegressor())

In [None]:
#TREE based model
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [None]:
tree = DecisionTreeRegressor(random_state=42)
try_model(tree)

In [None]:
plt.figure(figsize=(40, 20))
plot_tree(tree, max_depth=3, filled=True, feature_names=numeric_cols+encoded_cols);

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
%%time
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
try_model(rf)

In [None]:
rf.feature_importances_

In [None]:
importance_df = pd.DataFrame({ 
    'feature': numeric_cols+encoded_cols, 
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
importance_df.head(10)

In [None]:
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

In [None]:
def predict_input(model, single_input):
    """Predict sales for a single record given as a dict.

    The record goes through the same steps as the training data: date parts
    are extracted, then the notebook-level ``imputer``, ``scaler`` and
    ``encoder`` are applied before calling ``model.predict``.

    Args:
        model: Fitted estimator exposing ``predict``.
        single_input: Mapping with at least ``'Open'``, ``'Date'`` and the
            columns listed in ``numeric_cols`` / ``categorical_cols``
            (``Day``, ``Month`` and ``Year`` are derived here from ``Date``).

    Returns:
        ``0.`` when the record has ``Open == 0``; otherwise the model's
        prediction for the record.
    """
    # A closed store sells nothing; skip the model entirely.
    if single_input['Open'] == 0:
        return 0.0

    row = pd.DataFrame([single_input])
    row['Date'] = pd.to_datetime(row['Date'])
    row['Day'], row['Month'], row['Year'] = (
        row['Date'].dt.day,
        row['Date'].dt.month,
        row['Date'].dt.year,
    )

    # Same preprocessing order as for the training data: impute, scale, encode.
    row[numeric_cols] = imputer.transform(row[numeric_cols])
    row[numeric_cols] = scaler.transform(row[numeric_cols])
    row[encoded_cols] = encoder.transform(row[categorical_cols])

    features = row[numeric_cols + encoded_cols]
    return model.predict(features)[0]

In [None]:
# One hand-written record with the same fields as a merged test row.
sample_input = dict(
    Id=1,
    Store=1,
    DayOfWeek=4,
    Date='2015-09-17 00:00:00',
    Open=1.0,
    Promo=1,
    StateHoliday='0',
    SchoolHoliday=0,
    StoreType='c',
    Assortment='a',
    CompetitionDistance=1270.0,
    CompetitionOpenSinceMonth=9.0,
    CompetitionOpenSinceYear=2008.0,
    Promo2=0,
    Promo2SinceWeek=np.nan,
    Promo2SinceYear=np.nan,
    PromoInterval=np.nan,
)

sample_input

In [None]:
# Predict sales for the sample record with the fitted random forest.
predict_input(rf, sample_input)