# Data Pre-Processing

In [1]:
import sys
import os
import pandas as pd
sys.path.append('../scripts')
import warnings
warnings.filterwarnings('ignore')
from data_loader import *
from data_cleaner import *

## Load Datasets

In [2]:
# path to the CSV file
filename1 = 'macro_economic.csv'
filename2 = 'events_holidays_data.csv'
filename3 = 'weather_data.xlsx'
filename4 = 'train.csv'

path1 = os.path.join('..', 'data/raw', filename1)
path2 = os.path.join('..', 'data/raw', filename2)
path3 = os.path.join('..', 'data/raw', filename3)
path4 = os.path.join('..', 'data/raw', filename4)
# Load dataset
macro_economic_df = load_data(path1)
events_holidays_df = load_data(path2)
weather_df = pd.read_excel(path3, sheet_name=None)
train_df = load_data(path4)

## Explore Datasets

### Macro Economic Data

In [3]:
macro_economic_df.head()

Unnamed: 0,Year-Month,Monthly Nominal GDP Index (inMillion$),Monthly Real GDP Index (inMillion$),CPI,PartyInPower,unemployment rate,CommercialBankInterestRateonCreditCardPlans,"Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan",Earnings or wages in dollars per hour,AdvertisingExpenses (in Thousand Dollars),Cotton Monthly Price - US cents per Pound(lbs),Change(in%),Average upland planted(million acres),Average upland harvested(million acres),yieldperharvested acre,Production (in 480-lb netweright in million bales),Mill use (in 480-lb netweright in million bales),Exports
0,2009 - Jan,14421.753,14407.053,233.402,Democrats,7.8,12.03,11.44,22.05,137,57.7,4.02,9.296,7.559,799,12.589,4.17,11.55
1,2009 - Feb,14389.2,14366.177,234.663,Democrats,8.3,12.97,11.05,22.22,200,55.21,-4.32,9.296,7.559,799,12.589,3.87,11.1
2,2009 - Mar,14340.702,14351.787,235.067,Democrats,8.7,12.97,11.05,22.22,?,51.5,-6.72,9.296,7.559,799,12.589,3.72,11.65
3,2009 - Apr,14326.816,14351.602,235.582,Democrats,9.0,12.97,11.05,22.13,214,56.78,10.25,9.296,7.559,787,12.4,3.62,12.225
4,2009 - May,14345.905,14368.124,235.975,Democrats,9.4,13.32,11.25,22.04,?,61.95,9.11,9.297,7.4,803,12.384,3.52,12.3


### Events Holidays Data

In [4]:
events_holidays_df.head()

Unnamed: 0,Year,MonthDate,Event,DayCategory
0,2009,Jan-01,New Year's Day,Federal Holiday
1,2009,Jan-19,Martin Luther King Jr. Day,Federal Holiday
2,2009,Feb-14,Valentine's Day,Event
3,2009,Feb-16,Presidents' Day,Federal Holiday
4,2009,Apr-12,Easter Sunday,Event


### Weather Data

In [5]:
weather_df = pd.concat(weather_df, ignore_index=True)

In [6]:
weather_df

Unnamed: 0,Year,Month,Day,Temp high (°C),Temp avg (°C),Temp low (°C),Dew Point high (°C),Dew Point avg (°C),Dew Point low (°C),Humidity (%) high,...,Sea Level Press. (hPa) avg,Sea Level Press. (hPa) low,Visibility (km) high,Visibility (km) avg,Visibility (km) low,Wind (km/h) low,Wind (km/h) avg,Wind (km/h) high,Precip. (mm) sum,WeatherEvent
0,2009,Jan,1,-3,-6,-9,-16,-17,-19,54,...,1023,1015,16,16,16,37,18,60,0,
1,2009,Jan,2,1,-2,-5,-3,-7,-17,78,...,1012,1007,16,13,2,27,10,48,T,Snow
2,2009,Jan,3,3,1,-2,-5,-9,-13,72,...,1015,1008,16,16,16,27,16,42,T,
3,2009,Jan,4,6,1,-4,-10,-12,-13,55,...,1017,1015,16,16,16,32,12,40,0,
4,2009,Jan,5,6,5,3,-1,-5,-16,62,...,1014,1013,16,16,16,23,11,34,T,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2916,2016,Dec,27,16,10,4,11,6,-3,89,...,1012,1008,16,16,14,32,12,53,0,
2917,2016,Dec,28,4,3,1,-3,-6,-7,64,...,1019,1014,16,16,16,23,8,34,0,
2918,2016,Dec,29,8,4,1,7,2,-7,96,...,1006,1000,16,10,2,24,9,40,9.91,Rain
2919,2016,Dec,30,4,3,1,-1,-4,-7,82,...,1006,1000,16,14,1,29,15,47,0.25,"Fog , Snow"


### Train Data

In [7]:
train_df.head()

Unnamed: 0,Year,Month,ProductCategory,Sales(In ThousandDollars)
0,2009,1,WomenClothing,1755.0
1,2009,1,MenClothing,524.0
2,2009,1,OtherClothing,936.0
3,2009,2,WomenClothing,1729.0
4,2009,2,MenClothing,496.0


### Pre-Processing

In [8]:
weather_df['WeatherEvent'].fillna(0, inplace=True)
weather_df['WeatherEvent'].replace(to_replace=r'^[A-Za-z]', value=1, regex=True, inplace=True)
weather_df['WeatherEvent'].astype('int64')
weather_df.columns = ['Year', 'Month', 'Day', 'Temp high (°C)', 'Temp avg (°C)',
       'Temp low (°C)', 'Dew Point high (°C)', 'Dew Point avg (°C)',
       'Dew Point low (°C)', 'Humidity (%) high', 'Humidity (%) avg',
       'Humidity (%) low', 'Sea Level Press. (hPa) high',
       'Sea Level Press. (hPa) avg', 'Sea Level Press. (hPa) low',
       'Visibility (km) high', 'Visibility (km) avg', 'Visibility (km) low',
       'Wind (km/h) low', 'Wind (km/h) avg', 'Wind (km/h) high',
       'Precip. (mm) sum', 'WeatherEvent']
month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

weather_df['Month'] = weather_df['Month'].map(month_mapping)
weather_df['date'] = pd.to_datetime(weather_df[['Year', 'Month']].assign(day=1))
event = weather_df.groupby(['date'])['WeatherEvent'].sum()
weather_df = weather_df[['date', 'Temp avg (°C)', 'Dew Point avg (°C)', 'Humidity (%) avg', 'Sea Level Press. (hPa) avg', 'Visibility (km) avg', 'Wind (km/h) avg']]
weather_df['Temp avg (°C)'] = weather_df['Temp avg (°C)'].apply(lambda x: float(x) if x!='-' else 0)
weather_df['Dew Point avg (°C)'] = weather_df['Dew Point avg (°C)'].apply(lambda x: float(x) if x!='-' else 0)
weather_df['Humidity (%) avg'] = weather_df['Humidity (%) avg'].apply(lambda x: float(x) if x!='-' else 0)
weather_df['Sea Level Press. (hPa) avg'] = weather_df['Sea Level Press. (hPa) avg'].apply(lambda x: float(x) if x!='-' else 0)
weather_df['Visibility (km) avg'] = weather_df['Visibility (km) avg'].apply(lambda x: float(x) if x!='-' else 0)
weather_df['Wind (km/h) avg'] = weather_df['Wind (km/h) avg'].apply(lambda x: float(x) if x!='-' else 0)
weather_final = weather_df.groupby(['date']).mean()
weather_final['WeatherEvent'] = event

In [9]:
weather_final 

Unnamed: 0_level_0,Temp avg (°C),Dew Point avg (°C),Humidity (%) avg,Sea Level Press. (hPa) avg,Visibility (km) avg,Wind (km/h) avg,WeatherEvent
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-01-01,-2.096774,-9.903226,57.354839,1015.967742,14.000000,12.000000,10
2009-02-01,2.785714,-6.392857,54.000000,1017.071429,15.178571,12.214286,6
2009-03-01,5.967742,-2.903226,57.064516,1021.064516,14.258065,11.387097,10
2009-04-01,12.733333,2.533333,55.700000,1014.533333,14.066667,11.366667,12
2009-05-01,17.000000,10.096774,68.516129,1017.161290,12.451613,7.483871,14
...,...,...,...,...,...,...,...
2016-08-01,26.354839,18.354839,64.709677,951.774194,13.193548,5.548387,10
2016-09-01,22.133333,14.566667,65.033333,1019.100000,14.066667,7.833333,7
2016-10-01,14.967742,8.516129,67.258065,1019.483871,13.838710,7.419355,6
2016-11-01,9.866667,1.600000,57.733333,1017.000000,14.366667,8.400000,8


In [10]:
find_missing_values(weather_final )

Temp avg (°C)                 0
Dew Point avg (°C)            0
Humidity (%) avg              0
Sea Level Press. (hPa) avg    0
Visibility (km) avg           0
Wind (km/h) avg               0
WeatherEvent                  0
dtype: int64

In [11]:
find_missing_values(events_holidays_df)

Year           0
MonthDate      0
Event          0
DayCategory    0
dtype: int64

In [12]:
find_missing_values(macro_economic_df)

Year-Month                                                           0
Monthly Nominal GDP Index (inMillion$)                               0
Monthly Real GDP Index (inMillion$)                                  0
CPI                                                                  0
PartyInPower                                                         0
unemployment rate                                                    0
CommercialBankInterestRateonCreditCardPlans                          0
Finance Rate on Personal Loans at Commercial Banks, 24 Month Loan    0
Earnings or wages  in dollars per hour                               0
AdvertisingExpenses (in Thousand Dollars)                            0
Cotton Monthly Price - US cents per Pound(lbs)                       0
Change(in%)                                                          0
Average upland planted(million acres)                                0
Average upland harvested(million acres)                              0
yieldp

In [13]:
# Define output folder and file name
output_folder = os.path.join('..', 'data', 'processed')
filename1 = 'weather_final.parquet'

output_path = save_data(weather_final, output_folder, filename1)

filename2 = 'events_holidays.parquet'

output_path = save_data(events_holidays_df, output_folder, filename2)

filename3 = 'macro_economic.parquet'

output_path = save_data(macro_economic_df, output_folder, filename3)

filename4 = 'train.parquet'

output_path = save_data(train_df, output_folder, filename4)

Dataset saved to ..\data\processed\weather_final.parquet
Dataset saved to ..\data\processed\events_holidays.parquet
Dataset saved to ..\data\processed\macro_economic.parquet
Dataset saved to ..\data\processed\train.parquet
