In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
pd.set_option('display.max_columns', None)
from rohlik_forecasting.utils import main_utils as mu


In [None]:
os.chdir('../')
%pwd

In [3]:
# Read the train and test CSV files from the data-ingestion artifacts folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './artifacts/data_ingestion/train.csv',
        './artifacts/data_ingestion/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)


In [None]:
# Show the column labels present in the test frame.
df_test.columns

In [None]:
# Keep only the eight columns listed below and preview the result.
df_train = df_train.loc[
    :,
    [
        'warehouse', 'orders', 'date', 'holiday', 'shops_closed',
        'winter_school_holidays', 'school_holidays', 'id',
    ],
]
df_train.head(n=5)

#### Types of Features

In [None]:
# Any column whose dtype is not object ('O') is counted as numerical.
num_features = [
    col for col in df_train.columns
    if df_train[col].dtype != 'O'
]
print('Total numerical features: ', len(num_features))

In [None]:
# Columns stored with the object ('O') dtype are counted as categorical.
cat_features = [
    col for col in df_train.columns
    if df_train[col].dtype == 'O'
]
print('Total categorical features: ', len(cat_features))

In [None]:
# Numerical columns with fewer than 25 distinct values (NaN counts as a value)
# are treated as discrete.
discrete_features = [
    col for col in num_features
    if df_train[col].nunique(dropna=False) < 25
]
print('Total discrete features: ', len(discrete_features))

In [None]:
# Whatever numerical column is not flagged as discrete is treated as continuous.
continuous_features = [
    col for col in num_features
    if col not in discrete_features
]
print('Total continuous features: ', len(continuous_features))

In [11]:
# Separate the target column (`orders`) from the remaining predictor columns.
y = df_train['orders']
X = df_train.drop(columns='orders')

In [None]:
# Histogram with a KDE overlay of `orders`, stacked per warehouse.
# Only rows with orders < 12501 are plotted.
fig, ax = plt.subplots(figsize=(15, 5))
fig.suptitle('Distribution of Orders By warehouse', fontsize=20, fontweight='bold', ha='center')
sns.histplot(
    data=df_train[df_train['orders'] < 12501],
    x='orders',
    # Without `hue` the plot is not split by warehouse and `multiple='stack'`
    # has nothing to stack, which contradicts the figure title.
    hue='warehouse',
    bins=30,
    kde=True,
    multiple='stack',
    ax=ax,
)
fig.tight_layout()
# Leave head-room for the suptitle after tight_layout has packed the axes.
fig.subplots_adjust(top=0.90)
plt.xticks(rotation=90);

In [18]:
# Predictor frame: everything except the target and the `id` / `date` columns.
X = df_train.drop(columns=['orders', 'id', 'date'])

In [14]:
# Target vector: the `orders` column of the training frame.
y = df_train.loc[:, 'orders']

In [None]:
# Preview the predictor frame instead of dumping every row into the output.
X.head()

### Preprocessing using ColumnTransformer

In [None]:
# Partition the predictor columns by dtype: object columns are categorical,
# every other dtype is numerical.
cat_features = X.select_dtypes(include=["object"]).columns
num_features = X.select_dtypes(exclude=["object"]).columns
print('Numerical features: ', num_features)
print('Categorical features: ', cat_features)

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode the categorical columns and standardise the numerical ones.
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Each entry is (step name, transformer, columns it is applied to).
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


In [21]:
# Fit the preprocessor and build the model matrix.
# The input frame is rebuilt from `df_train` (same columns as the predictor
# frame defined earlier) instead of being read from `X`: `X = f(X)` breaks on a
# second run because `X` is by then the transformed matrix, not a DataFrame.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split performed later, so the scaler statistics include hold-out rows.
X = preprocessor.fit_transform(df_train.drop(columns=['orders', 'id', 'date']))

In [None]:
# Display the transformed feature matrix returned by the preprocessor.
X

#### Creating evaluation functions

In [1]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
#from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [25]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values, aligned with ``true``.

    Returns:
        Tuple ``(mse, mae, rmse, r2)``, where ``rmse`` is the square root
        of ``mse``.
    """
    squared_error = mean_squared_error(true, predicted)
    return (
        squared_error,
        mean_absolute_error(true, predicted),
        np.sqrt(squared_error),
        r2_score(true, predicted),
    )

In [2]:
# Candidate regressors keyed by the name printed in the comparison report.
# Estimators with a random component get a fixed seed (the same value used for
# the train/test split) so their scores are repeatable across runs.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    #"CatBoostingRegressor": CatBoostRegressor(verbose=False),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=42),
}

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parallel lists: model name and its R2 score on the test split.
model_list = []
r2_list = []

# Fit every candidate model and report its metrics on both splits.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on both splits so train and test scores can be compared.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    mse_train, mae_train, rmse_train, r2_train = evaluate_model(y_train, y_train_pred)
    mse_test, mae_test, rmse_test, r2_test = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)
    # Previously `r2_list` was created but never filled.
    r2_list.append(r2_test)

    print('Model performance for Training set')
    print("- Mean Squared Error: {:.4f}".format(mse_train))
    print("- Root Mean Squared Error: {:.4f}".format(rmse_train))
    print("- Mean Absolute Error: {:.4f}".format(mae_train))
    print("- R2 Score: {:.4f}".format(r2_train))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Mean Squared Error: {:.4f}".format(mse_test))
    print("- Root Mean Squared Error: {:.4f}".format(rmse_test))
    print("- Mean Absolute Error: {:.4f}".format(mae_test))
    print("- R2 Score: {:.4f}".format(r2_test))

    print('='*35)
    print('\n')
    
    


    
