#### Exploratory Data Analysis (EDA) and Data Modelling in Python - The following code shows the process of EDA and data modelling process: 
- Loading data: Imported the datasets using pandas
- Initial Exploration: checked the structure of the dataframes and the uniqueness of keys before joining the two dataframes
- Column Standardization: Renamed columns for consistency and better readability
- Data modelling: Created the customer dimension table and added a surrogate key, created the product dimension table and added a surrogate key, created the date dimension table and added a surrogate key, merge the five tables to create the orders fact table and added a surrogate key
- Saving the tables created: Saved the orders fact table, the customer dimension, the product dimension table, and the date dimension table as CSVs
- Connecting to SQL Server and pushing the tables: Connected to a databased created in SQL Server and pushed the orders fact table, the customer dimension, the product dimension table, and the date dimension table to the database


In [7]:
# Import libararies
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import urllib
from urllib.parse import quote_plus
import pypyodbc as odbc  

In [8]:
# Load data
Orders_df = pd.read_csv(r'Online Sales Data\Orders.csv')
Details_df = pd.read_csv(r'Online Sales Data\Details.csv')

In [9]:
Orders_df

Unnamed: 0,Order ID,Order Date,CustomerName,State,City
0,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura
1,B-25993,03-02-2018,Madhav,Delhi,Delhi
2,B-25973,24-01-2018,Madan Mohan,Uttar Pradesh,Mathura
3,B-25923,27-12-2018,Gopal,Maharashtra,Mumbai
4,B-25757,21-08-2018,Vishakha,Madhya Pradesh,Indore
...,...,...,...,...,...
495,B-25742,03-08-2018,Ashwin,Goa,Goa
496,B-26088,26-03-2018,Bhavna,Sikkim,Gangtok
497,B-25707,01-07-2018,Shivani,Maharashtra,Mumbai
498,B-25758,22-08-2018,Shubham,Himachal Pradesh,Simla


In [10]:
Orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Order ID      500 non-null    object
 1   Order Date    500 non-null    object
 2   CustomerName  500 non-null    object
 3   State         500 non-null    object
 4   City          500 non-null    object
dtypes: object(5)
memory usage: 19.7+ KB


In [11]:
Details_df

Unnamed: 0,Order ID,Amount,Profit,Quantity,Category,Sub-Category,PaymentMode
0,B-25681,1096,658,7,Electronics,Electronic Games,COD
1,B-26055,5729,64,14,Furniture,Chairs,EMI
2,B-25955,2927,146,8,Furniture,Bookcases,EMI
3,B-26093,2847,712,8,Electronics,Printers,Credit Card
4,B-25602,2617,1151,4,Electronics,Phones,Credit Card
...,...,...,...,...,...,...,...
1495,B-25700,7,-3,2,Clothing,Hankerchief,COD
1496,B-25757,3151,-35,7,Clothing,Trousers,EMI
1497,B-25973,4141,1698,13,Electronics,Printers,COD
1498,B-25698,7,-2,1,Clothing,Hankerchief,COD


In [12]:
Details_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Order ID      1500 non-null   object
 1   Amount        1500 non-null   int64 
 2   Profit        1500 non-null   int64 
 3   Quantity      1500 non-null   int64 
 4   Category      1500 non-null   object
 5   Sub-Category  1500 non-null   object
 6   PaymentMode   1500 non-null   object
dtypes: int64(3), object(4)
memory usage: 82.2+ KB


In [13]:
print(Orders_df['Order ID'].nunique())
print(Details_df['Order ID'].nunique())

500
500


In [14]:
merged_Orders_Details = Orders_df.merge(Details_df, on='Order ID', how='left')
merged_Orders_Details

Unnamed: 0,Order ID,Order Date,CustomerName,State,City,Amount,Profit,Quantity,Category,Sub-Category,PaymentMode
0,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,5729,64,14,Furniture,Chairs,EMI
1,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,671,114,9,Electronics,Phones,Credit Card
2,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,443,11,1,Clothing,Saree,COD
3,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,57,7,2,Clothing,Shirt,UPI
4,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,227,48,5,Clothing,Stole,COD
...,...,...,...,...,...,...,...,...,...,...,...
1495,B-25742,03-08-2018,Ashwin,Goa,Goa,11,-8,2,Clothing,Skirt,UPI
1496,B-26088,26-03-2018,Bhavna,Sikkim,Gangtok,11,5,2,Clothing,Hankerchief,UPI
1497,B-25707,01-07-2018,Shivani,Maharashtra,Mumbai,8,-6,1,Clothing,Stole,COD
1498,B-25758,22-08-2018,Shubham,Himachal Pradesh,Simla,8,-2,1,Clothing,Stole,COD


In [15]:
# Rename columns
merged_Orders_Details.rename(columns={'Order ID': 'OrderID', 'Order Date': 'OrderDate', 'Sub-Category': 'SubCategory'}, inplace=True)

merged_Orders_Details

Unnamed: 0,OrderID,OrderDate,CustomerName,State,City,Amount,Profit,Quantity,Category,SubCategory,PaymentMode
0,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,5729,64,14,Furniture,Chairs,EMI
1,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,671,114,9,Electronics,Phones,Credit Card
2,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,443,11,1,Clothing,Saree,COD
3,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,57,7,2,Clothing,Shirt,UPI
4,B-26055,10-03-2018,Harivansh,Uttar Pradesh,Mathura,227,48,5,Clothing,Stole,COD
...,...,...,...,...,...,...,...,...,...,...,...
1495,B-25742,03-08-2018,Ashwin,Goa,Goa,11,-8,2,Clothing,Skirt,UPI
1496,B-26088,26-03-2018,Bhavna,Sikkim,Gangtok,11,5,2,Clothing,Hankerchief,UPI
1497,B-25707,01-07-2018,Shivani,Maharashtra,Mumbai,8,-6,1,Clothing,Stole,COD
1498,B-25758,22-08-2018,Shubham,Himachal Pradesh,Simla,8,-2,1,Clothing,Stole,COD


In [16]:
# Create customer dimension table
customer_dim = (
    merged_Orders_Details[['CustomerName', 'State', 'City']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'CustomerKey'})
)

# Add a surrogate key starting from 1
customer_dim['CustomerKey'] += 1  

In [17]:
customer_dim

Unnamed: 0,CustomerKey,CustomerName,State,City
0,1,Harivansh,Uttar Pradesh,Mathura
1,2,Madhav,Delhi,Delhi
2,3,Madan Mohan,Uttar Pradesh,Mathura
3,4,Gopal,Maharashtra,Mumbai
4,5,Vishakha,Madhya Pradesh,Indore
...,...,...,...,...
400,401,Hemangi,Delhi,Delhi
401,402,Dinesh,Tamil Nadu,Chennai
402,403,Ashwin,Goa,Goa
403,404,Shivani,Maharashtra,Mumbai


In [18]:
# Create product dimension table
product_dim = (
    merged_Orders_Details[['Category', 'SubCategory']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'ProductKey'})
)

# Add a surrogate key starting from 1
product_dim['ProductKey'] += 1

In [19]:
product_dim

Unnamed: 0,ProductKey,Category,SubCategory
0,1,Furniture,Chairs
1,2,Electronics,Phones
2,3,Clothing,Saree
3,4,Clothing,Shirt
4,5,Clothing,Stole
5,6,Clothing,T-shirt
6,7,Electronics,Printers
7,8,Furniture,Bookcases
8,9,Furniture,Furnishings
9,10,Furniture,Tables


In [20]:
# Create date dimension table

# Create date range
start_date = merged_Orders_Details['OrderDate'].min()
end_date = merged_Orders_Details['OrderDate'].max()
dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Build date_dim dataframe
date_dim = pd.DataFrame(dates, columns=['Date'])
date_dim['Year'] = date_dim['Date'].dt.year
date_dim['Quarter'] = date_dim['Date'].dt.quarter
date_dim['Month'] = date_dim['Date'].dt.month
date_dim['Day'] = date_dim['Date'].dt.day
date_dim['DayOfWeek'] = date_dim['Date'].dt.dayofweek + 1   # Monday=1, Sunday=7
date_dim['DayName'] = date_dim['Date'].dt.day_name()
date_dim['IsWeekend'] = date_dim['DayOfWeek'].isin([6, 7]).astype(int)

# Add a surrogate key
date_dim['DateKey'] = date_dim['Date'].dt.strftime('%Y%m%d').astype(int)

# Change date format to dd/mm/yyyy
date_dim['Date'] = date_dim['Date'].dt.strftime('%d-%m-%Y')

# Reorganize columns
date_dim = date_dim[['DateKey', 'Date', 'Year', 'Quarter', 'Month', 'Day', 'DayOfWeek', 'DayName', 'IsWeekend']]


In [21]:
date_dim

Unnamed: 0,DateKey,Date,Year,Quarter,Month,Day,DayOfWeek,DayName,IsWeekend
0,20180101,01-01-2018,2018,1,1,1,1,Monday,0
1,20180102,02-01-2018,2018,1,1,2,2,Tuesday,0
2,20180103,03-01-2018,2018,1,1,3,3,Wednesday,0
3,20180104,04-01-2018,2018,1,1,4,4,Thursday,0
4,20180105,05-01-2018,2018,1,1,5,5,Friday,0
...,...,...,...,...,...,...,...,...,...
360,20181227,27-12-2018,2018,4,12,27,4,Thursday,0
361,20181228,28-12-2018,2018,4,12,28,5,Friday,0
362,20181229,29-12-2018,2018,4,12,29,6,Saturday,1
363,20181230,30-12-2018,2018,4,12,30,7,Sunday,1


In [22]:
# Show columns of the five tables: Orders, Details, customer_dim, product_dim and date_dim
print(f'merged_Orders_Details columns: {merged_Orders_Details.columns}')
print(f'customer_dim columns: {customer_dim.columns}')
print(f'product_dim columns: {product_dim.columns}')
print(f'date_dim columns: {date_dim.columns}')

merged_Orders_Details columns: Index(['OrderID', 'OrderDate', 'CustomerName', 'State', 'City', 'Amount',
       'Profit', 'Quantity', 'Category', 'SubCategory', 'PaymentMode'],
      dtype='object')
customer_dim columns: Index(['CustomerKey', 'CustomerName', 'State', 'City'], dtype='object')
product_dim columns: Index(['ProductKey', 'Category', 'SubCategory'], dtype='object')
date_dim columns: Index(['DateKey', 'Date', 'Year', 'Quarter', 'Month', 'Day', 'DayOfWeek',
       'DayName', 'IsWeekend'],
      dtype='object')


In [23]:
# Merge the five tables to create orders_fact table
merged_table = merged_Orders_Details.merge(
    customer_dim, on=['CustomerName', 'State', 'City'], how='left').merge(
        product_dim, on=['Category', 'SubCategory'], how='left').merge(
            date_dim[['Date', 'DateKey']], left_on='OrderDate', right_on='Date', how='left')

# Create a surrogate key starting from 1
merged_table = merged_table.reset_index(drop=True).reset_index().rename(columns={'index': 'OrdersKey'})
merged_table['OrdersKey'] += 1

# Select columns for orders_fact table
orders_fact = merged_table[[
    'OrdersKey', 'CustomerKey', 'ProductKey', 'DateKey', 'OrderID', 'OrderDate', 
    'Amount', 'Profit', 'Quantity', 'PaymentMode'
]]

In [25]:
orders_fact

Unnamed: 0,OrdersKey,CustomerKey,ProductKey,DateKey,OrderID,OrderDate,Amount,Profit,Quantity,PaymentMode
0,1,1,1,20180310,B-26055,10-03-2018,5729,64,14,EMI
1,2,1,2,20180310,B-26055,10-03-2018,671,114,9,Credit Card
2,3,1,3,20180310,B-26055,10-03-2018,443,11,1,COD
3,4,1,4,20180310,B-26055,10-03-2018,57,7,2,UPI
4,5,1,5,20180310,B-26055,10-03-2018,227,48,5,COD
...,...,...,...,...,...,...,...,...,...,...
1495,1496,403,15,20180803,B-25742,03-08-2018,11,-8,2,UPI
1496,1497,286,11,20180326,B-26088,26-03-2018,11,5,2,UPI
1497,1498,404,5,20180701,B-25707,01-07-2018,8,-6,1,COD
1498,1499,405,5,20180822,B-25758,22-08-2018,8,-2,1,COD


In [26]:
orders_fact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   OrdersKey    1500 non-null   int64 
 1   CustomerKey  1500 non-null   int64 
 2   ProductKey   1500 non-null   int64 
 3   DateKey      1500 non-null   int64 
 4   OrderID      1500 non-null   object
 5   OrderDate    1500 non-null   object
 6   Amount       1500 non-null   int64 
 7   Profit       1500 non-null   int64 
 8   Quantity     1500 non-null   int64 
 9   PaymentMode  1500 non-null   object
dtypes: int64(7), object(3)
memory usage: 117.3+ KB


In [27]:
# Save the tables as CSVs
orders_fact.to_csv(r'Online Sales Data\orders_fact.csv', index=False)
customer_dim.to_csv(r'Online Sales Data\customer_dim.csv', index=False)
product_dim.to_csv(r'Online Sales Data\product_dim.csv', index=False)
date_dim.to_csv(r'Online Sales Data\date_dim.csv', index=False)


In [None]:
# Connect to SQL Server
DRIVER_NAME = 'SQL SERVER'
SERVER_NAME = r'MSI\SQLEXPRESS'
DATABASE_NAME = 'OnlineSales'

params = urllib.parse.quote_plus(
    "DRIVER={ODBC Driver 17 for SQL Server};"
    f"SERVER={SERVER_NAME};DATABASE={DATABASE_NAME};UID=UserName;PWD=password"  # Remove/replace UserName (UID) and Password (PWD) if needed
)

engine = sqlalchemy.create_engine(f"mssql+pyodbc:///?odbc_connect={params}")

# Push the tables to SQL Server
orders_fact.to_sql('orders_fact', engine, """if_exists='replace'""", index=False)      # use "if_exists='replace' if needed                         
customer_dim.to_sql('customer_dim', engine, """if_exists='replace'""", index=False)    # use "if_exists='replace' if needed  
product_dim.to_sql('product_dim', engine, """if_exists='replace'""", index=False)      # use "if_exists='replace' if needed  
date_dim.to_sql('date_dim', engine, """if_exists='replace'""", index=False)            # use "if_exists='replace' if needed  

132