In [2]:
import json
import pandas as pd
import numpy as np
import re
import psycopg2
from config import db_password
from sqlalchemy import create_engine
import time
from pathlib import Path
import datetime

In [None]:
# Load the raw weekly sales CSV (path is relative to the notebook's working
# directory) into a DataFrame and preview the first rows.
file_path = Path('./Resources/Walmart_Store_sales.csv')
Wkly_Sales_df = pd.read_csv(file_path)
Wkly_Sales_df.head()

In [None]:
# Summary statistics for the raw data, as a quick sanity check before cleaning.
Wkly_Sales_df.describe()

In [None]:
# Column dtypes and non-null counts of the raw data.
Wkly_Sales_df.info()

# ETL Function

In [None]:
def clean_sales(Wkly_Sales_df, dayfirst=False):
    """Clean the raw weekly sales frame and derive calendar columns.

    Steps, in order:
      1. Drop columns in which 90% or more of the values are null.
      2. Parse ``Date`` to datetime and mirror it into ``Rev_Date``.
      3. Build ``index_id`` as ``"<Store>-<Date>"`` and keep only the first
         row for each id.
      4. Add ``Month``, ``Year`` and ISO ``Week`` columns derived from ``Date``.
      5. Drop every row that still contains a null value.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    Wkly_Sales_df : pandas.DataFrame
        Raw sales data. Must contain ``Store`` and ``Date`` columns.
    dayfirst : bool, default False
        Forwarded to ``pd.to_datetime``. The default keeps the previous
        parsing behaviour.
        NOTE(review): if the CSV stores dates as DD-MM-YYYY, pass
        ``dayfirst=True`` -- confirm against the source file.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the extra columns ``Rev_Date``, ``index_id``,
        ``Month``, ``Year`` and ``Week`` appended (in that order).

    Raises
    ------
    KeyError
        If ``Store`` or ``Date`` is absent, or was removed in step 1 for
        being mostly null (this includes an input with zero rows).
    ValueError
        If the ``Date`` column cannot be parsed as dates.
    """
    # Step 1: keep only columns whose null count is below 90% of the row count.
    null_counts = Wkly_Sales_df.isnull().sum()
    keep_mask = (null_counts < len(Wkly_Sales_df) * 0.9).to_numpy()
    # .copy() so nothing below can write through to the caller's frame.
    Rev_wkly_sales_df = Wkly_Sales_df.loc[:, keep_mask].copy()

    # Fail early with a clear message instead of a confusing error further down.
    missing = [name for name in ('Store', 'Date') if name not in Rev_wkly_sales_df.columns]
    if missing:
        raise KeyError(
            f"clean_sales: required column(s) missing or mostly null: {missing}"
        )

    # Step 2: convert the date column from object to datetime.
    try:
        parsed_dates = pd.to_datetime(Rev_wkly_sales_df['Date'], dayfirst=dayfirst)
    except (ValueError, TypeError) as exc:
        raise ValueError(
            f"clean_sales: could not parse the 'Date' column: {exc}"
        ) from exc

    # Steps 2-4: add all derived columns in one non-mutating call.
    # Series.dt.isocalendar().week replaces DatetimeIndex.weekofyear, which was
    # removed in pandas 2.0.
    calendar = parsed_dates.dt
    Rev_wkly_sales_df = Rev_wkly_sales_df.assign(
        Date=parsed_dates,
        Rev_Date=parsed_dates,
        index_id=Rev_wkly_sales_df['Store'].astype(str) + '-' + parsed_dates.astype(str),
        Month=calendar.month,
        Year=calendar.year,
        Week=calendar.isocalendar().week,
    )

    # Step 3 (dedupe on the store/date key) and step 5 (drop rows with nulls).
    Rev_wkly_sales_df = (
        Rev_wkly_sales_df
        .drop_duplicates(subset='index_id')
        .dropna()
    )

    # After dropna() no missing values remain, so the calendar columns can be
    # stored as plain integers (isocalendar() yields a nullable UInt32).
    return Rev_wkly_sales_df.astype({'Month': 'int64', 'Year': 'int64', 'Week': 'int64'})

In [None]:
# Run the cleaning function on the raw frame and preview the cleaned result.
weekly_sales_df=clean_sales(Wkly_Sales_df)
weekly_sales_df.head()

In [None]:
# Column dtypes and non-null counts after cleaning.
weekly_sales_df.info()

In [None]:
# Feature table: the store/date key plus the explanatory variables, selected
# from the cleaned sales frame in the order listed.
features_df = weekly_sales_df.reindex(
    columns=[
        'index_id',
        'Store',
        'Date',
        'Temperature',
        'Fuel_Price',
        'CPI',
        'Unemployment',
    ]
)
features_df.head()

In [None]:
# Holiday table: keep the date and flag columns, retain only rows whose flag
# is non-zero, then keep the first row for each date.
# Written as one chain without inplace=True so no filtered slice is mutated
# (mutating one is what triggers pandas' SettingWithCopyWarning).
Holidays_df = (
    pd.DataFrame(data=weekly_sales_df, columns=['Date', 'Holiday_Flag'])
    .loc[lambda frame: frame['Holiday_Flag'] != 0]
    .drop_duplicates(subset='Date')
)
Holidays_df.head()

In [None]:
# Connection URL for the local PostgreSQL database. The dialect name must be
# "postgresql": SQLAlchemy 1.4+ no longer accepts the legacy "postgres" alias
# and raises NoSuchModuleError for it.
# NOTE: db_password is interpolated verbatim, so any special characters in it
# (e.g. "@", ":", "/") must already be URL-encoded.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/WMT_Sales_Data"
engine = create_engine(db_string)
# if_exists='append' adds rows to an existing table, so re-running this cell
# inserts the same rows again.
weekly_sales_df.to_sql(name='Weekly_Sales', con=engine, if_exists='append', index=False)

In [None]:
# Append the feature rows to the 'Features' table through the existing engine.
# if_exists='append' means re-running this cell inserts the same rows again.
features_df.to_sql(name='Features', con=engine, if_exists='append',index=False)

In [None]:
# Append the holiday rows to the 'Holidays' table through the existing engine.
# if_exists='append' means re-running this cell inserts the same rows again.
Holidays_df.to_sql(name='Holidays', con=engine, if_exists='append',index=False)

In [None]:
# Export the cleaned data as CSV. A path relative to the notebook's working
# directory keeps the notebook runnable on any machine; the previous absolute
# path pointed into one user's Desktop folder.
# NOTE(review): this assumes the notebook runs from the project folder --
# adjust output_path if the file must land elsewhere.
output_path = Path('WMT_Weekly_Sales_rev.csv')
weekly_sales_df.to_csv(output_path, index=False, header=True)