# **Customer Shopping Preference Dataset Transformation For Data Modelling (Kimball/Star Schema Approach)**

In [None]:
import pandas as pd

In [None]:
# Read the source CSV once and keep `df` untouched; every later step works on the
# `shop_df` copy.
df = pd.read_csv('shopping_trends_updated_2.csv')
shop_df = df.copy()
shop_df.head()

In [None]:
# Review column dtypes and non-null counts before reshaping the data.
shop_df.info()

In [None]:
# Replace spaces and parentheses in the column headers with underscore-separated
# names so they can be used directly as table column identifiers.
# The result is reassigned rather than renamed with `inplace=True`: rename() ignores
# labels that are already gone, so re-running this cell stays a no-op, and the
# frame is never mutated in place.
shop_df = shop_df.rename(columns={
    'Customer ID': 'Customer_ID',
    'Item Purchased': 'Item_Purchased',
    'Purchase Amount (USD)': 'Purchase_Amount_USD',
    'Review Rating': 'Review_Rating',
    'Subscription Status': 'Subscription_Status',
    'Shipping Type': 'Shipping_Type',
    'Discount Applied': 'Discount_Applied',
    'Promo Code Used': 'Promo_Code_Used',
    'Previous Purchases': 'Previous_Purchases',
    'Payment Method': 'Payment_Method',
    'Frequency of Purchases': 'Frequency_of_Purchases',
})

In [None]:
# Display the frame to confirm the renamed column headers.
shop_df

In [None]:
# Count fully duplicated rows (True) versus first-seen rows (False).
shop_df.duplicated().value_counts()

**Customers Dimension Table:**

In [None]:
# One row per distinct (Customer_ID, Age, Gender) combination.
# De-duplicating keeps the dimension free of repeated rows if a customer appears on
# more than one source row, and reset_index gives the new table its own clean index
# instead of carrying over row labels from shop_df.
dim_customers = (
    shop_df[['Customer_ID', 'Age', 'Gender']]
    .drop_duplicates()
    .reset_index(drop=True)
)
dim_customers

**Items Dimension Table:**

In [None]:
# Each distinct (item, category, size, colour) combination becomes one dimension
# row; Item_ID is a 1-based surrogate key numbered in order of first appearance.
dim_items = (
    shop_df[['Item_Purchased', 'Category', 'Size', 'Color']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(Item_ID=lambda items: items.index + 1)
    .loc[:, ['Item_ID', 'Item_Purchased', 'Category', 'Size', 'Color']]
)
dim_items

**The helper function `get_dim_data` below extracts the distinct values of a given column, assigns each value a sequential integer primary key (starting at 1, in order of first appearance), and returns the result as a dimension table:**

In [37]:
def get_dim_data(df,col,col_label):
    """Build a two-column dimension table from the distinct values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to extract.
    col : str
        Name of the column in ``df`` whose distinct values form the dimension.
    col_label : str
        Base name for the output columns: the key column is named
        ``f'{col_label}_ID'`` and the value column ``col_label``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, in order of first appearance in ``df[col]``,
        with integer keys running from 1 to the number of distinct values.
    """
    distinct_values = list(df[col].unique())
    surrogate_keys = list(range(1, len(distinct_values) + 1))

    return pd.DataFrame({
        f'{col_label}_ID': surrogate_keys,
        f'{col_label}': distinct_values,
    })

**Locations Dimension Table:**

In [None]:
# Location_ID values are assigned 1..n in the order each location first appears in shop_df.
dim_locations = get_dim_data(shop_df,'Location','Location')
dim_locations

**Payment Methods Dimension Table:**

In [None]:
# Payment_Method_ID values are assigned 1..n in the order each method first appears in shop_df.
dim_payment_methods = get_dim_data(shop_df,'Payment_Method','Payment_Method')
dim_payment_methods

**Shipping Types Dimension Table:**

In [None]:
# Shipping_Type_ID values are assigned 1..n in the order each shipping type first appears in shop_df.
dim_shipping_types = get_dim_data(shop_df,'Shipping_Type','Shipping_Type')
dim_shipping_types

**Transactions Fact Table:**

In [None]:
# Columns of the fact table after the descriptive attributes have been swapped for
# their dimension keys. Declared once so the projection is not repeated.
fact_columns = [
    'Customer_ID', 'Item_ID',
    'Location_ID', 'Payment_Method_ID', 'Shipping_Type_ID',
    'Purchase_Amount_USD', 'Review_Rating',
    'Discount_Applied', 'Promo_Code_Used',
    'Subscription_Status', 'Previous_Purchases',
    'Frequency_of_Purchases', 'Season',
]

# Look up each dimension key for every source row.
# how='left' keeps every source row (an inner join would drop unmatched rows
# silently); validate='many_to_one' raises a MergeError if a dimension has
# duplicate keys, which would otherwise multiply fact rows.
fct_transactions = (
    shop_df
    .merge(dim_items, on=['Item_Purchased', 'Category', 'Size', 'Color'],
           how='left', validate='many_to_one')
    .merge(dim_locations, on='Location', how='left', validate='many_to_one')
    .merge(dim_payment_methods, on='Payment_Method', how='left', validate='many_to_one')
    .merge(dim_shipping_types, on='Shipping_Type', how='left', validate='many_to_one')
    # merge() returns a fresh 0..n-1 index, so index + 1 is a 1-based row number.
    .assign(Transaction_ID=lambda fact: fact.index + 1)
    .loc[:, ['Transaction_ID'] + fact_columns]
)

# Sanity checks: the joins must neither add nor lose rows, and every row must have
# resolved to a key in each dimension.
assert len(fct_transactions) == len(shop_df), 'dimension joins changed the row count'
assert fct_transactions[
    ['Item_ID', 'Location_ID', 'Payment_Method_ID', 'Shipping_Type_ID']
].notna().all().all(), 'some rows did not match a dimension key'

fct_transactions

**Write fact and dimension tables to CSV files:**

In [None]:
# Export step, currently disabled: every line below is commented out, so running
# this cell writes nothing.
# NOTE(review): `path` is an absolute path on one Windows machine; point it at a
# directory that exists locally before re-enabling.
# NOTE(review): the loop variable `df` would rebind the raw `df` frame loaded at the
# top of the notebook; rename it if this cell is re-enabled.
# df_interested = [fct_transactions,dim_customers,dim_items,dim_locations,dim_payment_methods,dim_shipping_types]
# path = 'C:\\Users\\CORE i3\\Desktop\\Data Engineering-Analytics Project\\Subsets CSV\\'
# file_names = ['fct_transactions','dim_customers','dim_items','dim_locations','dim_payment_methods','dim_shipping_types']

# for df,name in zip(df_interested,file_names):
#     df.to_csv(path+name+'.csv',index=False)