In [4]:
import os
import sys
from IPython.display import Markdown, display, Image
import numpy as np
import pandas as pd
import random
import math
import dvc.api
sys.path.append(os.path.abspath(os.path.join('../scripts')))


In [5]:
import warnings
# Suppress every warning for the rest of the kernel session: no category or module
# filter is passed, so the 'ignore' action applies to all warnings.
# NOTE(review): consider narrowing this filter so deprecation notices stay visible.
warnings.filterwarnings('ignore')

In [9]:

import scipy.stats as scs

from app_logger import App_Logger

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler
import pickle
import dvc.api
from app_logger import App_Logger


# Module-level logger; Helper.get_data logs through this name rather than through
# an instance's self.logger.
app_logger = App_Logger("helper.log").get_app_logger()


class Helper:
    """Helpers for model persistence, CSV I/O, missing-value statistics and scaling.

    Each instance holds a logger obtained from ``App_Logger("helper.log")``.
    """

    def __init__(self):
        self.logger = App_Logger("helper.log").get_app_logger()

    def read_model(self, file_name):
        """Load and return the object pickled at ``../models/<file_name>.pkl``.

        SECURITY: ``pickle.load`` can execute arbitrary code while loading;
        only use this on model files that come from a trusted source.
        """
        with open(f"../models/{file_name}.pkl", "rb") as f:
            model = pickle.load(f)
        # Log only after the load succeeded so the log never reports a load that failed.
        self.logger.info(f"Model loaded from {file_name}.pkl")
        return model

    def write_model(self, file_name, model):
        """Pickle ``model`` to ``../models/<file_name>.pkl``."""
        with open(f"../models/{file_name}.pkl", "wb") as f:
            pickle.dump(model, f)
        # Log only after the dump succeeded.
        self.logger.info(f"Model dumped to {file_name}.pkl")

    def read_csv(self, csv_path, missing_values=None):
        """Read ``csv_path`` into a DataFrame.

        Args:
            csv_path: path of the CSV file to read.
            missing_values: extra strings to treat as NaN (forwarded to
                ``pd.read_csv`` as ``na_values``); ``None`` means no extras.

        Returns:
            The loaded DataFrame, or ``None`` when the file does not exist.
        """
        # A None sentinel avoids sharing one mutable default list between calls.
        na_values = [] if missing_values is None else missing_values
        try:
            df = pd.read_csv(csv_path, na_values=na_values)
            print("file read as csv")
            self.logger.info(f"file read as csv from {csv_path}")
            return df
        except FileNotFoundError:
            print("file not found")
            self.logger.error(f"file not found, path:{csv_path}")
            return None

    def save_csv(self, df, csv_path):
        """Write ``df`` to ``csv_path`` without the index and return ``df``.

        Failures are reported (printed and logged) rather than raised, so the
        caller always gets ``df`` back.
        """
        try:
            df.to_csv(csv_path, index=False)
            print('File Successfully Saved.!!!')
            self.logger.info(f"File Successfully Saved to {csv_path}")

        except Exception as exc:
            print("Save failed...")
            # Record where and why the save failed instead of a bare message.
            self.logger.error(f"saving failed, path:{csv_path}, error:{exc}")

        return df

    @staticmethod
    def get_data(tag, path='data/data.csv', repo='https://github.com/Desmondonam/sales-prediction'):
        """Read a DVC-tracked CSV at revision ``tag`` into a DataFrame.

        Declared static because the function takes no ``self``: without the
        decorator, calling it on an instance would bind the instance to ``tag``.

        Args:
            tag: revision passed to ``dvc.api.get_url`` as ``rev``.
            path: path of the tracked file inside the repository.
            repo: repository location passed to ``dvc.api.get_url``.
        """
        data_url = dvc.api.get_url(path=path, repo=repo, rev=tag)
        df = pd.read_csv(data_url)
        app_logger.info(f"Read data from {path}, version {tag}")

        return df

    def percent_missing(self, df: pd.DataFrame) -> float:
        """Return the percentage of null cells in ``df``, rounded to 2 decimals.

        An empty frame (zero cells) yields ``0.0``.
        """
        # DataFrame.size is rows * columns; np.product was removed in NumPy 2.0.
        total_cells = df.size
        if total_cells <= 0:
            return 0.0
        total_missing = df.isnull().sum().sum()
        return round((total_missing / total_cells) * 100, 2)

    def percent_missing_for_col(self, df: pd.DataFrame, col_name: str) -> float:
        """Return the percentage of null values in ``df[col_name]``, rounded to 2 decimals.

        A column with no rows yields ``0.0``.
        """
        total_count = len(df[col_name])
        if total_count <= 0:
            return 0.0
        missing_count = df[col_name].isnull().sum()

        return round((missing_count / total_count) * 100, 2)

    def normalizer(self, df, columns):
        """Fit a ``Normalizer`` on ``df`` and return the result as a DataFrame labelled with ``columns``."""
        norm = Normalizer()
        return pd.DataFrame(norm.fit_transform(df), columns=columns)

    def scaler(self, df, columns, mode="minmax"):
        """Scale ``df`` and return the result as a DataFrame labelled with ``columns``.

        Args:
            df: data to scale.
            columns: column labels for the returned frame.
            mode: ``"minmax"`` for ``MinMaxScaler`` or ``"standard"`` for
                ``StandardScaler``.

        Raises:
            ValueError: if ``mode`` is neither ``"minmax"`` nor ``"standard"``.
        """
        if mode == "minmax":
            fitted = MinMaxScaler().fit_transform(df)
        elif mode == "standard":
            fitted = StandardScaler().fit_transform(df)
        else:
            # Previously an unknown mode fell through and returned None.
            raise ValueError(
                f"Unknown scaler mode {mode!r}; expected 'minmax' or 'standard'"
            )
        return pd.DataFrame(fitted, columns=columns)

    def scale_and_normalize(self, df, columns, sclaer_mode="minmax"):
        """Scale ``df`` with ``scaler`` (using ``sclaer_mode``) and then apply ``normalizer``.

        The parameter name keeps its original spelling so keyword callers keep working.
        """
        return self.normalizer(self.scaler(df, columns, sclaer_mode), columns)

In [12]:
# Helper instance used by the following cell to load the CSV inputs.
helper = Helper()

In [13]:
# Load the three input CSVs. Helper.read_csv prints "file not found" and yields None
# when a path does not exist, so a missing file leaves the matching name bound to None.
train_df = helper.read_csv("../data/train.csv")
store_df = helper.read_csv("../data/store.csv")
test_df = helper.read_csv("../data/test.csv")

file read as csv
file read as csv
file read as csv


In [14]:
class CleanAndTransformTrainData:
    """Cleaning and feature-engineering steps for the training DataFrame.

    ``drop_closed_stores`` returns a new, independent frame. The remaining step
    methods add or convert columns on the frame they receive and return it.
    ``get_cleaned`` chains the steps, starting with ``drop_closed_stores``.
    """

    def __init__(self):
        pass

    def drop_closed_stores(self, df):
        """Return a copy of ``df`` holding only the rows where ``Open == 1``.

        Raises:
            KeyError: if ``df`` has no ``Open`` column.
        """
        if "Open" not in df.columns:
            raise KeyError("Column 'Open' is required to drop closed stores")
        # Copy so later column assignments never write into a slice of the input.
        return df[df["Open"] == 1].copy()

    def convert_to_datatime(self, df):
        """Convert ``df['Date']`` to datetimes and return ``df``."""
        # Parse the frame that was passed in, not a notebook-level global.
        df['Date'] = pd.to_datetime(df['Date'])
        return df

    def sort_by_date(self, df):
        """Return ``df`` sorted by ``Date``, newest first."""
        return df.sort_values(by=["Date"], ascending=False)

    def to_str(self, df):
        """Cast ``df['StateHoliday']`` to ``str`` and return ``df``."""
        df['StateHoliday'] = df['StateHoliday'].astype(str)
        return df

    def transform_date(self, df):
        """Add ``Year``, ``Month``, ``Day`` and ``DayInMonth`` columns derived from ``Date``.

        ``Date`` is converted to datetimes first; ``DayInMonth`` is the
        ``to_month_category`` label of ``Day``. Returns ``df``.
        """
        df['Date'] = pd.to_datetime(df['Date'])
        date_parts = df['Date'].dt
        df['Year'] = date_parts.year
        df['Month'] = date_parts.month
        df['Day'] = date_parts.day
        df['DayInMonth'] = df['Day'].apply(self.to_month_category)
        return df

    def add_weekday_col(self, df):
        """Add a ``Weekends`` column: 1 where ``DayOfWeek`` is greater than 5, else 0."""
        df["Weekends"] = (df["DayOfWeek"] > 5).astype(int)
        return df

    def to_month_category(self, value):
        """Map a day-of-month number to ``"BegMonth"``, ``"MidMonth"`` or ``"EndMonth"``.

        Values in [1, 10) are ``"BegMonth"``, values in [10, 20) are
        ``"MidMonth"``, and every other number is ``"EndMonth"``. A value that
        cannot be compared with a number yields ``None``.
        """
        try:
            if 1 <= value < 10:
                return "BegMonth"
            if 10 <= value < 20:
                return "MidMonth"
            return "EndMonth"
        except TypeError:
            # Non-numeric input (e.g. None or a string) has no category.
            return None

    def get_cleaned(self, df):
        """Run the full pipeline on ``df`` and return the cleaned frame.

        Steps, in order: drop closed stores, cast ``StateHoliday`` to ``str``,
        parse ``Date``, derive the date-part columns, add ``Weekends``. The
        input frame is left unmodified because the first step returns a copy.
        """
        df = self.drop_closed_stores(df)
        df = self.to_str(df)
        df = self.convert_to_datatime(df)
        df = self.transform_date(df)
        df = self.add_weekday_col(df)
        return df

In [15]:
class cleanStoreDf:
    """Fill the missing values of the store DataFrame."""

    # Columns whose gaps are filled with that column's own maximum.
    _FILL_WITH_MAX = ('CompetitionDistance', 'Promo2SinceWeek', 'Promo2SinceYear')
    # Columns whose gaps are filled with that column's most frequent value.
    _FILL_WITH_MODE = ('PromoInterval', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth')

    def __init__(self):
        pass

    def handle_missing_value(self, df):
        """Fill missing values column by column and return ``df``.

        ``CompetitionDistance``, ``Promo2SinceWeek`` and ``Promo2SinceYear``
        are filled with their own column maximum. ``PromoInterval``,
        ``CompetitionOpenSinceYear`` and ``CompetitionOpenSinceMonth`` are
        filled with their own column mode. The columns are assigned on the
        frame that was passed in, so the caller's frame is modified.
        """
        for column in self._FILL_WITH_MAX:
            # Each column uses its own max; Promo2SinceYear previously received
            # the max of Promo2SinceWeek, i.e. a week number in a year column.
            df[column] = df[column].fillna(df[column].max())

        for column in self._FILL_WITH_MODE:
            modes = df[column].mode()
            # mode() is empty when the whole column is missing; nothing to fill with.
            if not modes.empty:
                df[column] = df[column].fillna(modes.iloc[0])

        return df

    def get_cleaned(self, df):
        """Return ``df`` after ``handle_missing_value`` has filled its gaps."""
        return self.handle_missing_value(df)

In [16]:
def preprocess_data(self):
    """Placeholder for a preprocessing step; does nothing and returns ``None``."""
    return None

In [17]:

# Run the train-set cleaning pipeline and the store-set missing-value fill.
# cleanStoreDf.handle_missing_value assigns into the frame it receives and returns it,
# so store_df itself is modified and clean_store_df refers to that same frame.
clean_train_df = CleanAndTransformTrainData().get_cleaned(train_df)
clean_store_df = cleanStoreDf().get_cleaned(store_df)