### **Data Modeling Proccess**:
- Objective: This notebook presents the process of creating a dimensional model based on optimization, scalability, practicality, performance and, of course, data and its nature. 
---

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import os

In [101]:
#path to data
path_to_data = os.path.join(os.getcwd(), '..', 'data', 'credit_card_transactions_api_preprocessed.csv')

#loading data and checking the shape
df = pd.read_csv(path_to_data)
df.shape

(1052352, 26)

In [12]:
#sampling data
df_sample = df.sample(n=300000, random_state=42)
df_sample.to_csv(os.path.join(os.getcwd(), '..', 'data', 'sample_credit_card_transactions_api_preprocessed.csv'), index=False, encoding='utf-8',sep=",")

# Working with Sample Data

In [2]:
# Remove the column display limit to show all columns in the DataFrame
pd.set_option('display.max_columns', None)

In [3]:
#path to sample data
path_to_data_sample = os.path.join(os.getcwd(), '..', 'data', 'sample_credit_card_transactions_api_preprocessed.csv')

#loading the sample data and checking the first 7 rows
df_sample = pd.read_csv(path_to_data_sample)
df_sample.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,job,dob,trans_num,is_fraud,merch_zipcode,age,country,country_code,state_abbreviation,state_name,state_population
0,978871,2020-02-01 21:32:26,213124978348176,fraud_Dibbert-Green,entertainment,39.91,Steven,Arnold,M,079 Chelsea Rest,Belfast,NY,14711,42.32,-78.0943,Mechanical engineer,1962-06-04,8dab2f2957f0986174cb68f79d2049e7,False,14560.0,58,United States,USA,NY,New York,20202320
1,804521,2019-12-05 14:40:17,4079773899158,"fraud_Kling, Howe and Schneider",home,60.95,Eric,Preston,M,7020 Doyle Stream Apt. 951,Mesa,ID,83643,44.6255,-116.4493,Cartographer,1965-12-15,aa2caf00d82bd93f23a5d757e4b82362,False,83610.0,54,United States,USA,ID,Idaho,1839117
2,1219397,2020-05-25 02:03:34,6517217825320610,fraud_Welch Inc,misc_net,270.27,James,Reese,M,26975 Richardson Mills Apt. 402,Sontag,MS,39665,31.6453,-90.1801,"Librarian, academic",1958-06-11,d708409e98b2523f05dfe1970e51c309,False,70789.0,62,United States,USA,MS,Mississippi,2961306
3,1177393,2020-05-06 07:53:23,4562827002127,fraud_Gutmann Ltd,grocery_net,61.07,Christopher,Johnson,M,28711 Kristine Junction Suite 309,Greenville,OH,45331,40.0987,-84.6342,Media planner,1971-11-26,9c4863f5659592c18ea8cfe1e9281c42,False,46127.0,49,United States,USA,OH,Ohio,11799331
4,1240638,2020-06-01 12:39:05,6538441737335434,fraud_Douglas-White,entertainment,135.09,Gina,Grimes,F,444 Robert Mews,Clarks Mills,PA,16114,41.3851,-80.1752,Energy manager,1997-09-22,557b6c103dea289a471690febe822031,False,44454.0,23,United States,USA,PA,Pennsylvania,13002788


## Previous Transformations to split the data

In [4]:
"""
Changing the transaction date to an incremental integer
"""
# making sure the data type is datetime
df_sample['trans_date_trans_time'] = pd.to_datetime(df_sample['trans_date_trans_time'])

# converting the transaction date to a incremental integer
df_sample['trans_date_id'] = df_sample['trans_date_trans_time'].dt.strftime('%Y%m%d').astype(int)

In [5]:
"""
Retrieve FIPS code for states using 'us' library
"""
import us

def get_fips_from_usps(usps_code):
    """
    This function takes a two-letter USPS code (state abbreviation) and returns the corresponding FIPS code.
    Args:
        usps_code (str): The two-letter state abbreviation (e.g., 'CA', 'NY').
    Returns:
        str: The FIPS code corresponding to the state, or None if the USPS code is not found.
    """
    state = us.states.lookup(usps_code)
    if state:
        return state.fips
    else:
        return None  # Returns None if the USPS code is not found

# Apply the function to generate a new column 'state_id' with the FIPS codes
df_sample['state_id'] = df_sample['state_abbreviation'].apply(get_fips_from_usps)



In [6]:
"""
Creating a unique ID for each category
"""

# Create a dictionary that maps each unique 'category' value to an incremental number
category_mapping = {category: idx for idx, category in enumerate(df_sample['category'].unique(), start=0)}

# Use the dictionary to create the 'category_id' column
df_sample['category_id'] = df_sample['category'].map(category_mapping)


In [7]:
"""
Creating a unique ID for each job
"""

# Create a dictionary that maps each unique 'category' value to an incremental number
job_mapping = {job: idx for idx, job in enumerate(df_sample['job'].unique(), start=1010)}

# Use the dictionary to create the 'job_id' column
df_sample['job_id'] = df_sample['job'].map(job_mapping)


In [8]:
"""
Location ID generation by concatenating 'zip', 'state_id', and an incrementing number
"""
counter = 1
def generate_unique_id(row):
    global counter
    unique_id = str(row['state_id']) + str(row['zip']) + str(counter)
    counter += 1  # Increment the counter for the next row
    return unique_id

# Apply the function to generate the 'unique_id' column in df_sample
df_sample['location_id'] = df_sample.apply(generate_unique_id, axis=1)

In [9]:
# Count the number of duplicates in the 'unique_id' column
num_duplicates = df_sample['location_id'].duplicated().sum()
num_duplicates

np.int64(0)

In [10]:
# Function to split the 'trans_date_trans_time' column into multiple new columns
def split_datetime_column(df):
    # Create 'date' column (YYYY-MM-DD format)
    df['date'] = pd.to_datetime(df['trans_date_trans_time']).dt.date
    
    # Create 'year' column (YYYY)
    df['year'] = pd.to_datetime(df['trans_date_trans_time']).dt.year
    
    # Create 'month' column (MM)
    df['month'] = pd.to_datetime(df['trans_date_trans_time']).dt.month
    
    # Create 'quarter' column (Q)
    df['quarter'] = pd.to_datetime(df['trans_date_trans_time']).dt.quarter
    
    # Create 'hour' column (HH:MM:SS)
    df['hour'] = pd.to_datetime(df['trans_date_trans_time']).dt.time
    
    return df

df_sample = split_datetime_column(df_sample)

In [11]:
#convert 'merch_zipcode' to integer
df_sample['merch_zipcode'] = df_sample['merch_zipcode'].apply(lambda x: int(x))

In [12]:
# verifying the changes
df_sample.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,job,dob,trans_num,is_fraud,merch_zipcode,age,country,country_code,state_abbreviation,state_name,state_population,trans_date_id,state_id,category_id,job_id,location_id,date,year,month,quarter,hour
0,978871,2020-02-01 21:32:26,213124978348176,fraud_Dibbert-Green,entertainment,39.91,Steven,Arnold,M,079 Chelsea Rest,Belfast,NY,14711,42.32,-78.0943,Mechanical engineer,1962-06-04,8dab2f2957f0986174cb68f79d2049e7,False,14560,58,United States,USA,NY,New York,20202320,20200201,36,0,1010,36147111,2020-02-01,2020,2,1,21:32:26
1,804521,2019-12-05 14:40:17,4079773899158,"fraud_Kling, Howe and Schneider",home,60.95,Eric,Preston,M,7020 Doyle Stream Apt. 951,Mesa,ID,83643,44.6255,-116.4493,Cartographer,1965-12-15,aa2caf00d82bd93f23a5d757e4b82362,False,83610,54,United States,USA,ID,Idaho,1839117,20191205,16,1,1011,16836432,2019-12-05,2019,12,4,14:40:17
2,1219397,2020-05-25 02:03:34,6517217825320610,fraud_Welch Inc,misc_net,270.27,James,Reese,M,26975 Richardson Mills Apt. 402,Sontag,MS,39665,31.6453,-90.1801,"Librarian, academic",1958-06-11,d708409e98b2523f05dfe1970e51c309,False,70789,62,United States,USA,MS,Mississippi,2961306,20200525,28,2,1012,28396653,2020-05-25,2020,5,2,02:03:34
3,1177393,2020-05-06 07:53:23,4562827002127,fraud_Gutmann Ltd,grocery_net,61.07,Christopher,Johnson,M,28711 Kristine Junction Suite 309,Greenville,OH,45331,40.0987,-84.6342,Media planner,1971-11-26,9c4863f5659592c18ea8cfe1e9281c42,False,46127,49,United States,USA,OH,Ohio,11799331,20200506,39,3,1013,39453314,2020-05-06,2020,5,2,07:53:23
4,1240638,2020-06-01 12:39:05,6538441737335434,fraud_Douglas-White,entertainment,135.09,Gina,Grimes,F,444 Robert Mews,Clarks Mills,PA,16114,41.3851,-80.1752,Energy manager,1997-09-22,557b6c103dea289a471690febe822031,False,44454,23,United States,USA,PA,Pennsylvania,13002788,20200601,42,0,1014,42161145,2020-06-01,2020,6,2,12:39:05


# Dimension Creation

In [14]:
fact_T_transation_dim = df_sample[['trans_num','is_fraud','amt','hour','trans_date_id','trans_date_trans_time','cc_num','location_id','merch_zipcode']]
fact_T_transation_dim.head()

Unnamed: 0,trans_num,is_fraud,amt,hour,trans_date_id,trans_date_trans_time,cc_num,location_id,merch_zipcode
0,8dab2f2957f0986174cb68f79d2049e7,False,39.91,21:32:26,20200201,2020-02-01 21:32:26,213124978348176,36147111,14560
1,aa2caf00d82bd93f23a5d757e4b82362,False,60.95,14:40:17,20191205,2019-12-05 14:40:17,4079773899158,16836432,83610
2,d708409e98b2523f05dfe1970e51c309,False,270.27,02:03:34,20200525,2020-05-25 02:03:34,6517217825320610,28396653,70789
3,9c4863f5659592c18ea8cfe1e9281c42,False,61.07,07:53:23,20200506,2020-05-06 07:53:23,4562827002127,39453314,46127
4,557b6c103dea289a471690febe822031,False,135.09,12:39:05,20200601,2020-06-01 12:39:05,6538441737335434,42161145,44454


In [15]:
category_dim = df_sample[['category_id','category']]
category_dim.head()

Unnamed: 0,category_id,category
0,0,entertainment
1,1,home
2,2,misc_net
3,3,grocery_net
4,0,entertainment


In [16]:
merchant_dim = df_sample[['merch_zipcode','merchant','category_id']]
merchant_dim.head()

Unnamed: 0,merch_zipcode,merchant,category_id
0,14560,fraud_Dibbert-Green,0
1,83610,"fraud_Kling, Howe and Schneider",1
2,70789,fraud_Welch Inc,2
3,46127,fraud_Gutmann Ltd,3
4,44454,fraud_Douglas-White,0


In [17]:
client_dim = df_sample[['cc_num','first','last','gender','job_id','age']]
client_dim.head()

Unnamed: 0,cc_num,first,last,gender,job_id,age
0,213124978348176,Steven,Arnold,M,1010,58
1,4079773899158,Eric,Preston,M,1011,54
2,6517217825320610,James,Reese,M,1012,62
3,4562827002127,Christopher,Johnson,M,1013,49
4,6538441737335434,Gina,Grimes,F,1014,23


In [18]:
job_dim = df_sample[['job_id','job']]
job_dim.head()

Unnamed: 0,job_id,job
0,1010,Mechanical engineer
1,1011,Cartographer
2,1012,"Librarian, academic"
3,1013,Media planner
4,1014,Energy manager


In [19]:
date_dim = df_sample[['trans_date_id','date','month','year','quarter']]
date_dim.head()

Unnamed: 0,trans_date_id,date,month,year,quarter
0,20200201,2020-02-01,2,2020,1
1,20191205,2019-12-05,12,2019,4
2,20200525,2020-05-25,5,2020,2
3,20200506,2020-05-06,5,2020,2
4,20200601,2020-06-01,6,2020,2


In [20]:
location_dim = df_sample[['location_id','street','lat','long','zip','state_id']]
location_dim.head()

Unnamed: 0,location_id,street,lat,long,zip,state_id
0,36147111,079 Chelsea Rest,42.32,-78.0943,14711,36
1,16836432,7020 Doyle Stream Apt. 951,44.6255,-116.4493,83643,16
2,28396653,26975 Richardson Mills Apt. 402,31.6453,-90.1801,39665,28
3,39453314,28711 Kristine Junction Suite 309,40.0987,-84.6342,45331,39
4,42161145,444 Robert Mews,41.3851,-80.1752,16114,42


In [21]:
state_dim = df_sample[['state_id','state_abbreviation','state_name','state_population']]
state_dim.head()

Unnamed: 0,state_id,state_abbreviation,state_name,state_population
0,36,NY,New York,20202320
1,16,ID,Idaho,1839117
2,28,MS,Mississippi,2961306
3,39,OH,Ohio,11799331
4,42,PA,Pennsylvania,13002788


# Saving the dimensions like CSV files

In [None]:
"""
This function removes duplicate data
from a specific DataFrame using the specified column.
"""
def dim_drop_duplicates(df, col):
    df.drop_duplicates(subset=[col], inplace=True)
    return df

In [25]:
"""
Removes duplicate data for each dimension before saving the changes
to CSV files.
"""

def save_dfs_to_csv(dfs, columns, filenames):
    """
    Apply dim_drop_duplicates on each DataFrame and save as a CSV file in a specific directory.

    Args:
        dfs (list): List of DataFrames to process.
        columns (list): List of column names for dropping duplicates in each DataFrame.
        filenames (list): List of filenames to save the CSVs.
    """
    # Folder where CSV files will be saved
    path_to_data_folder = os.path.join(os.getcwd(), '..', 'data', 'dimensions')

    # Ensure the folder exists, create it if it doesn't
    if not os.path.exists(path_to_data_folder):
        os.makedirs(path_to_data_folder)

    for df, col, filename in zip(dfs, columns, filenames):
        # Apply dim_drop_duplicates
        df = dim_drop_duplicates(df, col)
        
        # Create the full path for the CSV file
        full_path = os.path.join(path_to_data_folder, filename)
        
        # Save to CSV
        df.to_csv(full_path, index=False)
        print(f"Saved {filename} to {full_path}")

# List of DataFrames
dfs = [category_dim, merchant_dim, client_dim, job_dim, date_dim, location_dim, state_dim]

# Corresponding columns to check for duplicates
columns = ['category_id', 'merch_zipcode', 'cc_num', 'job_id', 'trans_date_id', 'location_id', 'state_id']

# Corresponding filenames
filenames = ['category_dim.csv', 'merchant_dim.csv', 'client_dim.csv', 'job_dim.csv', 'date_dim.csv', 'location_dim.csv', 'state_dim.csv']

# Call the function to process and save each DataFrame
save_dfs_to_csv(dfs, columns, filenames)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=[col], inplace=True)


Saved category_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\category_dim.csv
Saved merchant_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\merchant_dim.csv
Saved client_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\client_dim.csv
Saved job_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\job_dim.csv
Saved date_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\date_dim.csv
Saved location_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\location_dim.csv
Saved state_dim.csv to d:\User\Documents\UAO\6_Semestre\ETL\Project\Credit_Card_ETL_Project\notebooks\..\data\dimensions\state_dim.csv
