In [None]:
#importing the libraries
import numpy as np
import pandas as pd
from scipy import stats
import datetime 
import plotly.express as px
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
import plotly.graph_objects as go
import warnings
# Suppress every warning category for the rest of the session (this includes
# pandas deprecation notices).
# NOTE(review): consider narrowing this filter so genuine problems stay visible.
warnings.filterwarnings("ignore")

In [None]:
# Execute the data_cleaner script inside the notebook namespace so that the
# names it defines are available to the following cells.
# NOTE(review): a later cell also does `from data_cleaner import DataCleaner`,
# so one of the two mechanisms is probably redundant - confirm and drop one.
# Alternative kept from the original notebook (pastes the script into the cell):
# %load ('../scripts/data_cleaner.py')
%run ../scripts/data_cleaner.py

In [None]:
# Make the project's helper modules (kept in ../scripts/) importable from here.
import sys

# Guard the insertion so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if '../scripts/' not in sys.path:
    sys.path.insert(0, '../scripts/')

from data_preProcessing import data_preProcessing_script
from data_cleaner import DataCleaner

In [None]:
# Load the three raw tables (store attributes, training rows, test rows) with
# pandas' default parsing options.
store_data, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/store.csv', '../data/train.csv', '../data/test.csv')
)

In [None]:
# Preview the first five rows of the store table.
store_data.head()

In [None]:
# Preview the first five rows of the training table.
train_data.head()

In [None]:
# Preview the first five rows of the test table.
test_data.head()

In [None]:
# Wrap the store table in the project's preprocessing helper (imported from
# ../scripts/) and show its summary.
# NOTE(review): whether the helper keeps a reference to store_data or a copy is
# not visible from this notebook - later cells re-query it after editing
# store_data, so confirm it sees those edits.
preprocess_store = data_preProcessing_script(store_data)
preprocess_store.show_data_information()

In [None]:
# Wrap the test table in the project's preprocessing helper and show its summary.
# NOTE(review): presumably the helper references test_data rather than copying
# it - confirm, since later cells edit test_data and then re-query this object.
preprocess_test = data_preProcessing_script(test_data)
preprocess_test.show_data_information()

In [None]:
# Wrap the training table in the project's preprocessing helper and show its summary.
# NOTE(review): presumably the helper references train_data rather than copying
# it - confirm, since later cells edit train_data and then re-query this object.
preprocess_train = data_preProcessing_script(train_data)
preprocess_train.show_data_information()

# Missing Value Manipulation

In [None]:
# Presumably lists the store-table columns that contain missing values
# (project helper; exact output format not visible here).
preprocess_store.colums_WithMissingValue()

In [None]:
# Presumably lists the training-table columns that contain missing values
# (project helper; exact output format not visible here).
preprocess_train.colums_WithMissingValue()

In [None]:
# Presumably lists the test-table columns that contain missing values
# (project helper; exact output format not visible here).
preprocess_test.colums_WithMissingValue()

## Fixing missing values

In [None]:
# Presumably reports the share of missing values per column of the test table,
# before any imputation (project helper - confirm its exact output).
preprocess_test.get_column_based_missing_percentage()

In [None]:
# Impute the missing 'Open' values of the test table with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that form is chained assignment,
# which pandas deprecates and which leaves test_data untouched under
# copy-on-write. Assigning the column keeps the same DataFrame object, so
# objects created from test_data earlier are not left pointing at a stale frame.
test_data['Open'] = test_data['Open'].fillna(test_data['Open'].median())

In [None]:
# Presumably reports the share of missing values per column of the training
# table (project helper - confirm its exact output).
preprocess_train.get_column_based_missing_percentage()

In [None]:
# Presumably reports the share of missing values per column of the store table,
# before any imputation (project helper - confirm its exact output).
preprocess_store.get_column_based_missing_percentage()

In [None]:
# Impute the missing values of the store table.
# Every fill is assigned back to its column instead of using
# fillna(inplace=True) on a selected Series: that form is chained assignment,
# which pandas deprecates and which leaves store_data untouched under
# copy-on-write. Column assignment keeps the same DataFrame object alive.

# Distance to the nearest competitor: use the column median.
store_data['CompetitionDistance'] = store_data['CompetitionDistance'].fillna(
    store_data['CompetitionDistance'].median()
)

# Remaining columns: missing entries become 0 (presumably meaning
# "not applicable / unknown" - confirm with the data dictionary).
# NOTE(review): PromoInterval is filled with the integer 0 like the others; if
# it holds text values this yields a mixed-type column - confirm that is intended.
for column in (
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'PromoInterval',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
):
    store_data[column] = store_data[column].fillna(0)

In [None]:
# Re-check the store table after imputation.
# NOTE(review): this only reflects the fills if the helper reads the live
# store_data frame rather than a copy taken at construction - confirm.
preprocess_store.get_column_based_missing_percentage()

In [None]:
# Re-check the test table after imputation.
# NOTE(review): this only reflects the fill if the helper reads the live
# test_data frame rather than a copy taken at construction - confirm.
preprocess_test.get_column_based_missing_percentage()

In [None]:
# Re-check the training table; no training column was imputed in the cells above.
preprocess_train.get_column_based_missing_percentage()

# Data Wrangling

In [None]:
# Presumably displays the dtype of each store-table column (project helper).
preprocess_store.show_datatypes()

In [None]:
# Presumably displays the dtype of each training-table column, before the
# 'Date' column is converted (project helper).
preprocess_train.show_datatypes()

In [None]:
# Presumably displays the dtype of each test-table column, before the 'Date'
# column is converted (project helper).
preprocess_test.show_datatypes()

In [None]:
# Convert the 'Date' column of both daily tables to pandas datetimes.
# The column is replaced on the existing DataFrame objects, so anything that
# already holds a reference to them keeps pointing at the same frames.
for frame in (train_data, test_data):
    frame["Date"] = pd.to_datetime(frame["Date"])

In [None]:
# Re-display the training dtypes to confirm the 'Date' conversion
# (assumes the helper reads the live train_data frame - confirm).
preprocess_train.show_datatypes()

In [None]:
# Re-display the test dtypes to confirm the 'Date' conversion
# (assumes the helper reads the live test_data frame - confirm).
preprocess_test.show_datatypes()

In [None]:
# Create one DataCleaner (project class from ../scripts/data_cleaner.py) per table.
# NOTE(review): what the constructor does with the frame it receives (keeps a
# reference, copies it, or ignores it) is not visible here - confirm.
train_data_cleaner = DataCleaner(train_data)
test_data_cleaner = DataCleaner(test_data)
store_data_cleaner = DataCleaner(store_data)

In [None]:
# Presumably splits 'Date' into separate date-part columns for the training and
# test tables (project helper).
# NOTE(review): the return values are discarded, so this only has an effect if
# separate_date_column modifies the underlying frame in place - confirm.
train_data_cleaner.separate_date_column(date_column='Date')
test_data_cleaner.separate_date_column(date_column='Date')

In [None]:
# Report the (rows, columns) shape of each of the three tables.
print(
    'Shapes of our datasets',
    '-----------------------',
    f'Training dataset:{train_data.shape}',
    f'Testing dataset:{test_data.shape}',
    f'Store dataset:{store_data.shape}',
    sep='\n',
)

# Outliers (test data)

In [None]:
# Plotting helpers from the project's scripts folder. This import resolves only
# because an earlier cell put '../scripts/' on sys.path.
from data_exploration import exploration

In [None]:
# Presumably draws a box plot of test_data['Store'] with 'Outliers' as its title
# (project helper - confirm the argument meaning).
# NOTE(review): 'Store' is the key used to join with the store table, so its
# distribution says little about outliers.
exploration.plot_box(test_data, 'Store', 'Outliers')

In [None]:
# Presumably draws a box plot of test_data['DayOfWeek'] with 'Outliers' as its
# title (project helper - confirm the argument meaning).
exploration.plot_box(test_data, 'DayOfWeek', 'Outliers')

# Outliers (train data)

In [None]:
# Presumably draws a box plot of train_data['Store'] with 'Outliers' as its title
# (project helper - confirm the argument meaning).
# NOTE(review): 'Store' is the key used to join with the store table, so its
# distribution says little about outliers.
exploration.plot_box(train_data, 'Store', 'Outliers')

In [None]:
# Presumably draws a box plot of train_data['DayOfWeek'] with 'Outliers' as its
# title (project helper - confirm the argument meaning).
exploration.plot_box(train_data, 'DayOfWeek', 'Outliers')

In [None]:
# Presumably draws a box plot of train_data['Sales'] with 'Outliers' as its title
# (project helper). 'Sales' is the column whose outliers are replaced further down.
exploration.plot_box(train_data, 'Sales', 'Outliers')

# Outliers (store data)

In [None]:
# Presumably draws a box plot of store_data['Store'] with 'Outliers' as its title
# (project helper - confirm the argument meaning).
# NOTE(review): 'Store' is the join key of this table, so its distribution says
# little about outliers.
exploration.plot_box(store_data, 'Store', 'Outliers')

In [None]:
# Presumably draws a box plot of store_data['CompetitionDistance'] with
# 'Outliers' as its title (project helper). This column's outliers are replaced
# further down.
exploration.plot_box(store_data, 'CompetitionDistance', 'Outliers')

In [None]:
# Presumably draws a box plot of store_data['Promo2'] with 'Outliers' as its
# title (project helper - confirm the argument meaning).
exploration.plot_box(store_data, 'Promo2', 'Outliers')

In [None]:
# Presumably draws a box plot of store_data['CompetitionOpenSinceMonth'] with
# 'Outliers' as its title (project helper). Missing values in this column were
# filled with 0 earlier, so those zeros are part of the plotted distribution.
exploration.plot_box(store_data, 'CompetitionOpenSinceMonth', 'Outliers')

# Fixing Outliers

In [None]:
# Presumably replaces outlying 'Sales' values in train_data with the column
# median (project helper; the outlier rule is not visible here).
# NOTE(review): the return value is discarded, so this only has an effect if the
# helper modifies train_data in place - confirm.
train_data_cleaner.replace_outlier_with_median(train_data, 'Sales')

In [None]:
# Presumably replaces outlying values of the two store-table columns with their
# column median (project helper; the outlier rule is not visible here).
# The calls go through store_data_cleaner, the DataCleaner built from
# store_data, rather than the training-table cleaner, so each table is handled
# by its own cleaner instance.
# NOTE(review): the return values are discarded, so this only has an effect if
# the helper modifies store_data in place - confirm.
store_data_cleaner.replace_outlier_with_median(store_data, 'CompetitionDistance')
store_data_cleaner.replace_outlier_with_median(store_data, 'CompetitionOpenSinceMonth')

# Merging and Saving the Cleaned Data

In [None]:
# Attach the per-store attributes to every training row via the 'Store' key.
# validate='many_to_one' makes pandas raise a MergeError if a 'Store' value is
# duplicated in store_data, instead of silently multiplying training rows.
# With how='inner', training rows whose 'Store' is absent from store_data are dropped.
train_store_data = pd.merge(
    train_data, store_data, on='Store', how='inner', validate='many_to_one'
)

In [None]:
# Attach the per-store attributes to every test row via the 'Store' key.
# validate='many_to_one' makes pandas raise a MergeError if a 'Store' value is
# duplicated in store_data, instead of silently multiplying test rows.
# With how='inner', test rows whose 'Store' is absent from store_data are dropped.
test_store_data = pd.merge(
    test_data, store_data, on='Store', how='inner', validate='many_to_one'
)

In [None]:
# Write both merged tables next to the raw data, leaving out the index column.
for merged_frame, output_path in (
    (train_store_data, '../data/train_store.csv'),
    (test_store_data, '../data/test_store.csv'),
):
    merged_frame.to_csv(output_path, index=False)

# Confirmation banner, printed once both files are written.
print('''
#############################################
Clean Data Saved !
#############################################
''')