### **Data Modeling Proccess**:
- Objective: This notebook presents the process of creating a dimensional model based on optimization, scalability, practicality, performance and, of course, data and its nature. 
---

In [99]:
#importing libraries
import numpy as np
import pandas as pd
import os

In [101]:
#path to data
path_to_data = os.path.join(os.getcwd(), '..', 'data', 'credit_card_transactions_api_preprocessed.csv')

#loading data and checking the shape
df = pd.read_csv(path_to_data)
df.shape

(1052352, 26)

In [12]:
#sampling data
df_sample = df.sample(n=300000, random_state=42)
df_sample.to_csv(os.path.join(os.getcwd(), '..', 'data', 'sample_credit_card_transactions_api_preprocessed.csv'), index=False, encoding='utf-8',sep=",")

# Working with Sample Data

In [102]:
# Remove the column display limit to show all columns in the DataFrame
pd.set_option('display.max_columns', None)

In [103]:
#path to sample data
path_to_data_sample = os.path.join(os.getcwd(), '..', 'data', 'sample_credit_card_transactions_api_preprocessed.csv')

#loading the sample data and checking the first 7 rows
df_sample = pd.read_csv(path_to_data_sample)
df_sample.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,job,dob,trans_num,is_fraud,merch_zipcode,age,country,country_code,state_abbreviation,state_name,state_population
0,978871,2020-02-01 21:32:26,213124978348176,fraud_Dibbert-Green,entertainment,39.91,Steven,Arnold,M,079 Chelsea Rest,Belfast,NY,14711,42.32,-78.0943,Mechanical engineer,1962-06-04,8dab2f2957f0986174cb68f79d2049e7,False,14560.0,58,United States,USA,NY,New York,20202320
1,804521,2019-12-05 14:40:17,4079773899158,"fraud_Kling, Howe and Schneider",home,60.95,Eric,Preston,M,7020 Doyle Stream Apt. 951,Mesa,ID,83643,44.6255,-116.4493,Cartographer,1965-12-15,aa2caf00d82bd93f23a5d757e4b82362,False,83610.0,54,United States,USA,ID,Idaho,1839117
2,1219397,2020-05-25 02:03:34,6517217825320610,fraud_Welch Inc,misc_net,270.27,James,Reese,M,26975 Richardson Mills Apt. 402,Sontag,MS,39665,31.6453,-90.1801,"Librarian, academic",1958-06-11,d708409e98b2523f05dfe1970e51c309,False,70789.0,62,United States,USA,MS,Mississippi,2961306
3,1177393,2020-05-06 07:53:23,4562827002127,fraud_Gutmann Ltd,grocery_net,61.07,Christopher,Johnson,M,28711 Kristine Junction Suite 309,Greenville,OH,45331,40.0987,-84.6342,Media planner,1971-11-26,9c4863f5659592c18ea8cfe1e9281c42,False,46127.0,49,United States,USA,OH,Ohio,11799331
4,1240638,2020-06-01 12:39:05,6538441737335434,fraud_Douglas-White,entertainment,135.09,Gina,Grimes,F,444 Robert Mews,Clarks Mills,PA,16114,41.3851,-80.1752,Energy manager,1997-09-22,557b6c103dea289a471690febe822031,False,44454.0,23,United States,USA,PA,Pennsylvania,13002788


## Previous Transformations to split the data

In [104]:
"""
Changing the transaction date to an incremental integer
"""
# making sure the data type is datetime
df_sample['trans_date_trans_time'] = pd.to_datetime(df_sample['trans_date_trans_time'])

# converting the transaction date to a incremental integer
df_sample['trans_date_trans_time'] = df_sample['trans_date_trans_time'].dt.strftime('%Y%m%d%H%M%S').astype(int)

In [105]:
"""
ID generation for each specific location.
""" 

#Create a dictionary to store the counter for each country, state, and city
country_counter = {}
state_counter = {}
city_counter = {}

# Function to generate the ID for each specific location
def generate_id(row):
    """
    This function generates a unique ID for each specific location based 
    on the country, state, and city. 
    country: USA, MEX, CAN, etc.
    state_name: California, New York, etc.
    city: Los Angeles, New York, etc.
    Example: 112 means country 1, state 1, city 2. -> USA, California, Los Angeles
    """
    # Counter for the country
    if row['country'] not in country_counter:
        country_counter[row['country']] = len(country_counter) + 1
    
    # counter for the state within the country
    country_id = country_counter[row['country']]
    if (row['country'], row['state_name']) not in state_counter:
        state_counter[(row['country'], row['state_name'])] = len([state for state in state_counter if state[0] == row['country']]) + 1
    
    # counter for the city within the state
    state_id = state_counter[(row['country'], row['state_name'])]
    if (row['country'], row['state_name'], row['city']) not in city_counter:
        city_counter[(row['country'], row['state_name'], row['city'])] = len([city for city in city_counter if city[0] == row['country'] and city[1] == row['state_name']]) + 1
    
    # generate the ID
    city_id = city_counter[(row['country'], row['state_name'], row['city'])]
    return f"{country_id}{state_id}{city_id}"

# apply the function to the DataFrame and create a new column
df_sample['PK_specific_location'] = df_sample.apply(generate_id, axis=1)

In [106]:
"""
Creating a unique ID for each category
"""

# Create a dictionary that maps each unique 'category' value to an incremental number
category_mapping = {category: idx for idx, category in enumerate(df_sample['category'].unique(), start=0)}

# Use the dictionary to create the 'category_id' column
df_sample['category_id'] = df_sample['category'].map(category_mapping)


In [107]:
import sys

"""
We calcculate the size of the 'category' column and compare it to the size of a list of integers from 0 to 14.
And depending on the ressult, we can determine if its better create a new column with the ID or not.
"""
# Calculate the number of unique categories
print(f"The number of unique categories is: {len(df_sample['category'].unique())}")

# Calculate the size of the 'category' column in bytes and convert to MB
size = sys.getsizeof(df_sample['category'])
size = size * 4  
size_mb = size / (1024 * 1024)  # Convert to MB
print(f"The size of the 'category' column is: {size_mb:.6f} MB")

# Create a list and calculate its size in bytes and convert to MB
my_list = [x for x in range(0, 14)]
size_list = sys.getsizeof(my_list)
size_list_mb = size_list / (1024 * 1024)  # Convert to MB
print(f"The size of the list is: {size_list_mb:.6f} MB")
print(f"Category column is {size_mb/size_list_mb} times larger than a list of integers from 0 to 14")



The number of unique categories is: 14
The size of the 'category' column is: 68.107155 MB
The size of the list is: 0.000175 MB
Category column is 388127.8695652174 times larger than a list of integers from 0 to 14


In [108]:
# verifying the changes
df_sample.head()

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,job,dob,trans_num,is_fraud,merch_zipcode,age,country,country_code,state_abbreviation,state_name,state_population,PK_specific_location,category_id
0,978871,20200201213226,213124978348176,fraud_Dibbert-Green,entertainment,39.91,Steven,Arnold,M,079 Chelsea Rest,Belfast,NY,14711,42.32,-78.0943,Mechanical engineer,1962-06-04,8dab2f2957f0986174cb68f79d2049e7,False,14560.0,58,United States,USA,NY,New York,20202320,111,0
1,804521,20191205144017,4079773899158,"fraud_Kling, Howe and Schneider",home,60.95,Eric,Preston,M,7020 Doyle Stream Apt. 951,Mesa,ID,83643,44.6255,-116.4493,Cartographer,1965-12-15,aa2caf00d82bd93f23a5d757e4b82362,False,83610.0,54,United States,USA,ID,Idaho,1839117,121,1
2,1219397,20200525020334,6517217825320610,fraud_Welch Inc,misc_net,270.27,James,Reese,M,26975 Richardson Mills Apt. 402,Sontag,MS,39665,31.6453,-90.1801,"Librarian, academic",1958-06-11,d708409e98b2523f05dfe1970e51c309,False,70789.0,62,United States,USA,MS,Mississippi,2961306,131,2
3,1177393,20200506075323,4562827002127,fraud_Gutmann Ltd,grocery_net,61.07,Christopher,Johnson,M,28711 Kristine Junction Suite 309,Greenville,OH,45331,40.0987,-84.6342,Media planner,1971-11-26,9c4863f5659592c18ea8cfe1e9281c42,False,46127.0,49,United States,USA,OH,Ohio,11799331,141,3
4,1240638,20200601123905,6538441737335434,fraud_Douglas-White,entertainment,135.09,Gina,Grimes,F,444 Robert Mews,Clarks Mills,PA,16114,41.3851,-80.1752,Energy manager,1997-09-22,557b6c103dea289a471690febe822031,False,44454.0,23,United States,USA,PA,Pennsylvania,13002788,151,0


In [109]:
fact_T_transation_dim = df_sample[['trans_num','is_fraud','amt','trans_date_trans_time','PK_specific_location','merch_zipcode','cc_num']]
fact_T_transation_dim.head()

Unnamed: 0,trans_num,is_fraud,amt,trans_date_trans_time,PK_specific_location,merch_zipcode,cc_num
0,8dab2f2957f0986174cb68f79d2049e7,False,39.91,20200201213226,111,14560.0,213124978348176
1,aa2caf00d82bd93f23a5d757e4b82362,False,60.95,20191205144017,121,83610.0,4079773899158
2,d708409e98b2523f05dfe1970e51c309,False,270.27,20200525020334,131,70789.0,6517217825320610
3,9c4863f5659592c18ea8cfe1e9281c42,False,61.07,20200506075323,141,46127.0,4562827002127
4,557b6c103dea289a471690febe822031,False,135.09,20200601123905,151,44454.0,6538441737335434


In [110]:
category_dim = df_sample[['category_id','category']]
category_dim.head()

Unnamed: 0,category_id,category
0,0,entertainment
1,1,home
2,2,misc_net
3,3,grocery_net
4,0,entertainment


In [111]:
merchant_dim = df_sample[['merch_zipcode','merchant','category_id']]
merchant_dim.head()

Unnamed: 0,merch_zipcode,merchant,category_id
0,14560.0,fraud_Dibbert-Green,0
1,83610.0,"fraud_Kling, Howe and Schneider",1
2,70789.0,fraud_Welch Inc,2
3,46127.0,fraud_Gutmann Ltd,3
4,44454.0,fraud_Douglas-White,0


In [112]:
"""
We do not store the 'age' feature in the client_dim table because, in this case,
age is not a feature that will be consulted repeatedly, so we can calculate it
on demand.
""" 
client_dim = df_sample[['cc_num','first','last','gender','job','dob']]
client_dim.head()

Unnamed: 0,cc_num,first,last,gender,job,dob
0,213124978348176,Steven,Arnold,M,Mechanical engineer,1962-06-04
1,4079773899158,Eric,Preston,M,Cartographer,1965-12-15
2,6517217825320610,James,Reese,M,"Librarian, academic",1958-06-11
3,4562827002127,Christopher,Johnson,M,Media planner,1971-11-26
4,6538441737335434,Gina,Grimes,F,Energy manager,1997-09-22


In [113]:
"""
This dimension and the next one share the same primary key 
because they are related to the same entity, but each contains
different information about it
"""
location_dim = df_sample[['PK_specific_location','street','lat','long','zip']]
location_dim.head()

Unnamed: 0,PK_specific_location,street,lat,long,zip
0,111,079 Chelsea Rest,42.32,-78.0943,14711
1,121,7020 Doyle Stream Apt. 951,44.6255,-116.4493,83643
2,131,26975 Richardson Mills Apt. 402,31.6453,-90.1801,39665
3,141,28711 Kristine Junction Suite 309,40.0987,-84.6342,45331
4,151,444 Robert Mews,41.3851,-80.1752,16114


In [114]:
country_info_dim = df_sample[['PK_specific_location','country','state_name','city','state_population']]
country_info_dim.head()

Unnamed: 0,PK_specific_location,country,state_name,city,state_population
0,111,United States,New York,Belfast,20202320
1,121,United States,Idaho,Mesa,1839117
2,131,United States,Mississippi,Sontag,2961306
3,141,United States,Ohio,Greenville,11799331
4,151,United States,Pennsylvania,Clarks Mills,13002788


In [115]:
country_info_dim.duplicated().sum()

np.int64(299107)

In [116]:
country_info_dim.drop_duplicates(subset=['PK_specific_location'], inplace=True)
country_info_dim.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_info_dim.drop_duplicates(subset=['PK_specific_location'], inplace=True)


(820, 5)