In [None]:
!pip install jovian opendatasets xgboost graphviz lightgbm scikit_learn xgboost lightgbm --upgrade

In [None]:
'''
Rossmann operates over 3,000 drug stores in 7 European countries. Currently, 
Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance.
Store sales are influenced by many factors, including promotions, competition, school and state holidays,
seasonality, and locality. With thousands of individual managers predicting sales based on their unique circumstances,
the accuracy of results can be quite varied.

In their first Kaggle competition, Rossmann is challenging you to predict 6 weeks of daily sales for 1,115 stores
located across Germany. Reliable sales forecasts enable store managers to create effective staff schedules
that increase productivity and motivation. By helping Rossmann create a robust prediction model, 
you will help store managers stay focused on what’s most important to them: their customers and their teams! 
'''

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import os
import opendatasets as od
import pandas as pd
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 120)

In [None]:
# Folder holding the competition CSV files. It defaults to the location this
# notebook was originally run from; set the ROSSMANN_DATA_DIR environment
# variable to point at another copy instead of editing every path.
DATA_DIR = os.environ.get("ROSSMANN_DATA_DIR", r"C:\Users\DELL\Downloads\rossmann-store-sales")

# low_memory=False makes pandas infer column dtypes from the whole file rather
# than chunk by chunk.
ross_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), low_memory=False)
store_df = pd.read_csv(os.path.join(DATA_DIR, "store.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
submission_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

In [None]:
ross_df

In [None]:
store_df

In [None]:
test_df

In [None]:
submission_df

In [None]:
merged_df = ross_df.merge(store_df,how='left',on='Store')
merged_test_df = test_df.merge(store_df,how='left', on='Store')

In [None]:
merged_df

In [None]:
merged_df.info()

In [None]:
def split_date(df):
    """Parse the ``Date`` column and add calendar-part columns, in place.

    ``df['Date']`` is converted with ``pd.to_datetime``; the columns
    ``month``, ``year``, ``day`` and ``WeekOfYear`` (ISO calendar week) are
    then added to ``df``. The frame is modified in place and nothing is
    returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    parts = df['Date'].dt
    # Same column order as before: month, year, day, then the ISO week.
    for part_name in ('month', 'year', 'day'):
        df[part_name] = getattr(parts, part_name)
    iso_calendar = parts.isocalendar()
    df['WeekOfYear'] = iso_calendar.week

In [None]:
split_date(merged_df)
split_date(merged_test_df)

In [None]:
merged_df

In [None]:
#store open/close

In [None]:
merged_df[merged_df.Open == 0].Sales.value_counts()

In [None]:
merged_df = merged_df[merged_df.Open==1].copy()

In [None]:
merged_df

In [None]:
# competition

In [None]:
def comp_months(df):
    """Add ``CompetitionOpen``: months since the competitor opened, in place.

    The value is ``12 * (year - CompetitionOpenSinceYear) +
    (month - CompetitionOpenSinceMonth)``. Negative results (the competitor
    opens after the row's date) are floored at 0, and rows where the opening
    date is missing (NaN) are set to 0. ``df`` is modified in place and
    nothing is returned.
    """
    months_open = (
        12 * (df['year'] - df['CompetitionOpenSinceYear'])
        + (df['month'] - df['CompetitionOpenSinceMonth'])
    )
    # Vectorised floor at zero (replaces a per-element Python lambda); NaN
    # passes through clip untouched and is then filled with 0.
    df['CompetitionOpen'] = months_open.clip(lower=0).fillna(0)

In [None]:
comp_months(merged_df)
comp_months(merged_test_df)

In [None]:
merged_df

In [None]:
# Month number -> abbreviation used to match against PromoInterval entries
# (note the four-letter 'Sept'; presumably this mirrors the spelling in the
# data -- verify against store.csv). Built once instead of on every row.
_PROMO_MONTH_ABBR = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                     7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}


def check_promo_month(row):
    """Return 1 if the row's month is listed in its ``PromoInterval``, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide ``'PromoInterval'`` (comma-separated month abbreviations),
        ``'Promo2Open'`` and ``'month'`` (1-12).

    Returns
    -------
    int
        1 when ``Promo2Open`` is truthy and the abbreviation of ``month``
        appears in ``PromoInterval``; 0 otherwise, including when the interval
        is missing, a key is absent, or the month is not in 1-12.
    """
    try:
        interval = row['PromoInterval']
        # A missing interval arrives as NaN/None, not a string. NaN is truthy,
        # so an `or ''` fallback does not catch it; test the type explicitly.
        if not isinstance(interval, str):
            return 0
        if not row['Promo2Open']:
            return 0
        month_name = _PROMO_MONTH_ABBR.get(row['month'])
        return 1 if month_name in interval.split(',') else 0
    except (KeyError, TypeError, ValueError):
        # Best-effort, as before: a row lacking a field or holding an unusable
        # value counts as "not a promo month". Other errors now propagate.
        return 0

def promo_cols(df):
    """Add the Promo2-derived columns ``Promo2Open`` and ``IsPromo2Month`` in place.

    * ``Promo2SinceYear``, ``Promo2SinceWeek`` and ``Promo2`` have missing
      values replaced by 0 and are cast to int.
    * ``Promo2Open`` is the approximate number of months since Promo2 began
      (week difference converted with 7/30.5), floored at 0 and multiplied by
      ``Promo2`` so it is 0 where that flag is 0.
    * ``IsPromo2Month`` is ``check_promo_month`` applied to each row, again
      multiplied by ``Promo2``.

    ``df`` is modified in place; nothing is returned.
    """
    # Missing values become 0 so the columns can be held as integers.
    for col in ('Promo2SinceYear', 'Promo2SinceWeek', 'Promo2'):
        df[col] = df[col].fillna(0).astype(int)

    # Months since Promo2 was open: whole years plus the week offset in months.
    months_from_years = 12 * (df['year'] - df['Promo2SinceYear'])
    months_from_weeks = (df['WeekOfYear'] - df['Promo2SinceWeek']) * 7 / 30.5
    df['Promo2Open'] = months_from_years + months_from_weeks
    non_negative = df['Promo2Open'].map(lambda months: 0 if months < 0 else months)
    df['Promo2Open'] = non_negative.fillna(0) * df['Promo2']

    # Whether a new round of promotions was started in the current month.
    promo_month_flags = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = promo_month_flags * df['Promo2']

In [None]:
promo_cols(merged_df)
promo_cols(merged_test_df)

In [None]:
merged_df[['Date', 'Promo2', 'Promo2SinceYear', 'Promo2SinceWeek', 'PromoInterval', 'Promo2Open', 'IsPromo2Month']].sample(20)

In [None]:
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 
              'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen', 
              'day', 'month', 'year', 'WeekOfYear',  'Promo2', 
              'Promo2Open', 'IsPromo2Month']
target_col = 'Sales'

In [None]:
inputs = merged_df[input_cols].copy()
targets = merged_df[target_col].copy()

In [None]:
inputs

In [None]:
test_inputs = merged_test_df[input_cols].copy()

In [None]:
test_inputs

In [None]:
numeric_cols = ['Store', 'Promo', 'SchoolHoliday', 
              'CompetitionDistance', 'CompetitionOpen', 'Promo2', 'Promo2Open', 'IsPromo2Month',
              'day', 'month', 'year', 'WeekOfYear',  ]
categorical_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

In [None]:
inputs[numeric_cols].isna().sum()

In [None]:
test_inputs[numeric_cols].isna().sum()

In [None]:
max_distance = inputs.CompetitionDistance.max()
max_distance

In [None]:
# Fill missing competition distances with the largest distance seen in the
# training inputs. The result is assigned back to the column: calling
# `fillna(..., inplace=True)` on a column selection is chained assignment,
# which pandas warns about and which leaves the frame unchanged under
# Copy-on-Write.
inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(max_distance)
test_inputs['CompetitionDistance'] = test_inputs['CompetitionDistance'].fillna(max_distance)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler().fit(inputs[numeric_cols])

In [None]:
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [None]:
X = inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]

In [None]:
#training

In [None]:
from xgboost import XGBRegressor

In [None]:
model = XGBRegressor(random_state=42,n_jobs = -1, n_estimators=20, max_depth=4)

In [None]:
model.fit(X,targets)

In [None]:
preds = model.predict(X)

In [None]:
preds

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def rmse(a,b):
    """Return the root-mean-squared error between two array-likes.

    Both arguments are converted with ``np.array`` before being passed to
    ``mean_squared_error``; the square root of that result is returned.
    """
    mse = mean_squared_error(np.array(a), np.array(b))
    return np.sqrt(mse)

In [None]:
rmse(preds,targets)

In [None]:
merged_df.Sales.min(), merged_df.Sales.max()

In [None]:
plt.hist(merged_df.Sales.sample(10000))

In [None]:
from xgboost import plot_tree
from matplotlib.pylab import rcParams
%matplotlib inline

rcParams['figure.figsize'] = 30,30

In [None]:
!pip uninstall graphviz

In [None]:
!pip install graphviz

In [None]:
plot_tree(model,rankdir='LR', num_trees=0);

In [None]:
plot_tree(model,rankdir='LR',num_trees=1);

In [None]:
plot_tree(model,rankdir='LR',num_trees=19);

In [None]:
trees = model.get_booster().get_dump()

In [None]:
len(trees)