# Data Cleanup

**input**: 
- "../data/silver/df_edges_raw.csv"
- "../data/silver/df_nodes_user_raw.csv"
- "../data/silver/df_nodes_merchant_raw.csv"

**output**: 
- "../data/gold/df_edges.csv"
- "../data/gold/df_nodes_user.csv"
- "../data/gold/df_nodes_merchant.csv"

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
df_edges = pd.read_csv("../data/silver/u100_df_edges_raw.csv") # delete "sample" part for final run
df_nodes_user = pd.read_csv("../data/silver/u100_df_nodes_user_raw.csv") # delete "sample" part for final run
df_nodes_merchant = pd.read_csv("../data/silver/u100_df_nodes_merchant_raw.csv")# delete "sample" part for final run

In [3]:
display(df_edges)
df_edges.info()

Unnamed: 0,user_id,Card,Year,Month,Day,Time,Amount,Use Chip,Errors?,Is Fraud?,...,Card Type,Card Number,Expires,CVV,Has Chip,Cards Issued,Credit Limit,Acct Open Date,Year PIN last Changed,Card on Dark Web
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,,No,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,,No,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,,No,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,,No,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,,No,...,Debit,4344676511950444,12/2022,623,YES,2,$24295,09/2002,2008,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1258228,99,1,2020,2,26,15:53,$28.02,Chip Transaction,,No,...,Debit,5073514573038606,02/2023,802,YES,1,$3106,02/2020,2020,No
1258229,99,1,2020,2,27,12:44,$15.25,Chip Transaction,,No,...,Debit,5073514573038606,02/2023,802,YES,1,$3106,02/2020,2020,No
1258230,99,1,2020,2,27,20:04,$40.00,Chip Transaction,,No,...,Debit,5073514573038606,02/2023,802,YES,1,$3106,02/2020,2020,No
1258231,99,1,2020,2,28,06:33,$146.31,Chip Transaction,,No,...,Debit,5073514573038606,02/2023,802,YES,1,$3106,02/2020,2020,No


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258233 entries, 0 to 1258232
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   user_id                1258233 non-null  int64 
 1   Card                   1258233 non-null  int64 
 2   Year                   1258233 non-null  int64 
 3   Month                  1258233 non-null  int64 
 4   Day                    1258233 non-null  int64 
 5   Time                   1258233 non-null  object
 6   Amount                 1258233 non-null  object
 7   Use Chip               1258233 non-null  object
 8   Errors?                20054 non-null    object
 9   Is Fraud?              1258233 non-null  object
 10  merchant_id            1258233 non-null  int64 
 11  CARD INDEX             1258233 non-null  int64 
 12  Card Brand             1258233 non-null  object
 13  Card Type              1258233 non-null  object
 14  Card Number            1258233 non

## Merchant Nodes cleanup

In [4]:
def clean_merchant_nodes(df_nodes_merchant):
    """
    Clean the merchant nodes DataFrame.
    """
    df = df_nodes_merchant.copy()
    
    # 1. Basic cleaning
    df_clean = pd.DataFrame()
    df_clean['merchant_id'] = df['merchant_id']
    df_clean['merchant_name'] = df['Merchant Name']
    df_clean['city'] = df['Merchant City'].str.strip()
    df_clean['state'] = df['Merchant State'].str.strip()
    df_clean['zipcode'] = pd.to_numeric(df['Zip'], errors='coerce')
    df_clean['mcc'] = pd.to_numeric(df['MCC'], errors='coerce')  # Merchant Category Code
    
    # 2. Handle missing values
    df_clean['zipcode'].fillna(0, inplace=True)  # 0 for unknown zip
    
    if df_clean['mcc'].isna().any():
        df_clean['mcc'].fillna(df_clean['mcc'].median(), inplace=True)
    
    # Fill missing location data
    df_clean['city'].fillna('Unknown', inplace=True)
    df_clean['state'].fillna('Unknown', inplace=True)
    df_clean['merchant_name'].fillna('Unknown', inplace=True)
    
    return df_clean


## Edges cleanup

In [5]:
def clean_edges_dataframe(df_edges):
    """
    Clean the edges DataFrame (transactions).
    Returns cleaned DataFrame with proper types and handles missing values.
    """
    df = df_edges.copy()
    
    # 1. Parse Amount (remove $ and convert to float)
    df['amount'] = df['Amount'].str.replace('$', '').str.replace(',', '').astype(float)
    
    # 2. Handle date/time columns - create proper datetime
    df['transaction_datetime'] = pd.to_datetime(
        df['Year'].astype(str) + '-' + 
        df['Month'].astype(str).str.zfill(2) + '-' + 
        df['Day'].astype(str).str.zfill(2) + ' ' + 
        df['Time'],
        format='%Y-%m-%d %H:%M',
        errors='coerce'
    )
    
    # 3. Clean categorical columns
    # Standardize Use Chip column
    df['use_chip'] = df['Use Chip'].str.strip()
    
    # Handle Errors column (NaN means no errors)
    df['has_error'] = df['Errors?'].notna().astype(int)
    
    # Clean Is Fraud? column - convert to binary
    df['is_fraud'] = (df['Is Fraud?'].str.strip().str.lower() == 'yes').astype(int)
    
    # 4. Card information
    df['card_brand'] = df['Card Brand'].str.strip()
    df['card_type'] = df['Card Type'].str.strip()
    
    # Parse expiration date
    df['card_expires'] = pd.to_datetime(df['Expires'], format='%m/%Y', errors='coerce')
    
    # Has Chip - convert to binary
    df['has_chip'] = (df['Has Chip'].str.strip().str.upper() == 'YES').astype(int)
    
    # 5. Clean numerical columns
    # Clean Credit Limit (remove $ and convert)
    df['credit_limit'] = df['Credit Limit'].str.replace('$', '').str.replace(',', '').astype(float)
    
    # Parse Account Open Date
    df['acct_open_date'] = pd.to_datetime(df['Acct Open Date'], format='%m/%Y', errors='coerce')
    
    # Card on Dark Web - convert to binary
    df['card_on_dark_web'] = (df['Card on Dark Web'].str.strip().str.lower() == 'yes').astype(int)
    
    # 6. Create useful features for GNN
    # Transaction hour
    df['transaction_hour'] = df['transaction_datetime'].dt.hour
    
    # Transaction day of week (0=Monday, 6=Sunday)
    df['transaction_dow'] = df['transaction_datetime'].dt.dayofweek
    
    # Account age in days
    df['account_age_days'] = (df['transaction_datetime'] - df['acct_open_date']).dt.days
    
    # Time since PIN was changed
    df['years_since_pin_change'] = df['Year'] - df['Year PIN last Changed']
    
    # Card age (months until expiry)
    df['months_until_expiry'] = ((df['card_expires'] - df['transaction_datetime']).dt.days / 30).round()
    
    # 7. Select final columns for GNN
    df_clean = pd.DataFrame({
        # IDs for graph structure
        'user_id': df['user_id'],
        'card_id': df['Card'],
        'merchant_id': df['merchant_id'],
        'card_index': df['CARD INDEX'],
        
        # Target variable
        'label': df['is_fraud'],
        
        # Transaction features
        'amount': df['amount'],
        'timestamp': df['transaction_datetime'],
        'transaction_hour': df['transaction_hour'],
        'day_of_week': df['transaction_dow'],
        
        # Card usage features
        'use_chip': df['use_chip'],
        'has_error': df['has_error'],
        
        # Card properties
        'card_brand': df['card_brand'],
        'card_type': df['card_type'],
        'card_number': df['Card Number'],
        'cvv': df['CVV'],
        'has_chip': df['has_chip'],
        'cards_issued': df['Cards Issued'],
        'credit_limit': df['credit_limit'],
        'card_on_dark_web': df['card_on_dark_web'],
        
        # Derived temporal features
        'account_age_days': df['account_age_days'],
        'years_since_pin_change': df['years_since_pin_change'],
        'months_until_expiry': df['months_until_expiry']
    })
    
    # 8. Handle missing values
    # Fill numerical missing values with median
    num_cols = ['amount', 'credit_limit', 'account_age_days', 'cards_issued', 
                'months_until_expiry', 'transaction_hour', 'day_of_week']
    for col in num_cols:
        if df_clean[col].isna().any():
            df_clean[col].fillna(df_clean[col].median(), inplace=True)
    
    # Fill years_since_pin_change with 0 if negative or NaN
    df_clean['years_since_pin_change'] = df_clean['years_since_pin_change'].clip(lower=0)
    df_clean['years_since_pin_change'].fillna(0, inplace=True)
    
    # Fill categorical with mode or 'unknown'
    if df_clean['use_chip'].isna().any():
        df_clean['use_chip'].fillna('Unknown', inplace=True)
    
    if df_clean['card_brand'].isna().any():
        df_clean['card_brand'].fillna('Unknown', inplace=True)
    
    return df_clean

# User nodes Cleanup


In [6]:

def clean_user_nodes(df_nodes_user):
    """
    Clean the user nodes DataFrame.
    """
    df = df_nodes_user.copy()
    
    # 1. Rename and select relevant columns
    df_clean = pd.DataFrame()
    df_clean['user_id'] = df['user_id']
    df_clean['name'] = df['Person']
    df_clean['current_age'] = pd.to_numeric(df['Current Age'], errors='coerce')
    df_clean['retirement_age'] = pd.to_numeric(df['Retirement Age'], errors='coerce')
    df_clean['birth_year'] = pd.to_numeric(df['Birth Year'], errors='coerce')
    df_clean['birth_month'] = pd.to_numeric(df['Birth Month'], errors='coerce')
    df_clean['gender'] = df['Gender'].str.strip()
    
    # 2. Location features
    df_clean['address'] = df['Address'].str.strip()
    df_clean['apartment'] = df['Apartment']
    df_clean['city'] = df['City'].str.strip()
    df_clean['state'] = df['State'].str.strip()
    df_clean['zipcode'] = df['Zipcode'].astype(str).str.strip()
    df_clean['latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
    df_clean['longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')
    
    # 3. Financial features - clean currency columns
    df_clean['per_capita_income_zipcode'] = (
        df['Per Capita Income - Zipcode']
        .str.replace('$', '').str.replace(',', '')
        .astype(float)
    )
    df_clean['yearly_income_person'] = (
        df['Yearly Income - Person']
        .str.replace('$', '').str.replace(',', '')
        .astype(float)
    )
    df_clean['total_debt'] = (
        df['Total Debt']
        .str.replace('$', '').str.replace(',', '')
        .astype(float)
    )
    
    df_clean['fico_score'] = pd.to_numeric(df['FICO Score'], errors='coerce')
    df_clean['num_credit_cards'] = pd.to_numeric(df['Num Credit Cards'], errors='coerce')
    
    # 4. Create derived features
    # Debt to income ratio
    df_clean['debt_to_income_ratio'] = df_clean['total_debt'] / df_clean['yearly_income_person']
    df_clean['debt_to_income_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Income vs area income
    df_clean['income_vs_area'] = df_clean['yearly_income_person'] / df_clean['per_capita_income_zipcode']
    df_clean['income_vs_area'].replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Years to retirement
    df_clean['years_to_retirement'] = df_clean['retirement_age'] - df_clean['current_age']
    
    # 5. Handle missing values
    # Fill numerical with median
    num_cols = ['current_age', 'latitude', 'longitude', 'per_capita_income_zipcode', 
                'yearly_income_person', 'total_debt', 'fico_score', 'num_credit_cards',
                'retirement_age', 'birth_year', 'birth_month']
    for col in num_cols:
        if col in df_clean.columns and df_clean[col].isna().any():
            df_clean[col].fillna(df_clean[col].median(), inplace=True)
    
    # Fill derived features with median
    df_clean['debt_to_income_ratio'].fillna(df_clean['debt_to_income_ratio'].median(), inplace=True)
    df_clean['income_vs_area'].fillna(df_clean['income_vs_area'].median(), inplace=True)
    df_clean['years_to_retirement'].fillna(df_clean['years_to_retirement'].median(), inplace=True)
    
    # Categorical
    df_clean['gender'].fillna('Unknown', inplace=True)
    df_clean['city'].fillna('Unknown', inplace=True)
    df_clean['state'].fillna('Unknown', inplace=True)
    
    return df_clean


# Process mapping 

# Main execution 

In [7]:
def validate_graph_structure(df_edges_clean, df_users_clean, df_merchants_clean):
    """
    Validate that the graph structure is consistent.
    """
    print("\n=== Graph Structure Validation ===")
    
    # Check user IDs
    unique_users_in_edges = df_edges_clean['user_id'].nunique()
    unique_users_in_nodes = df_users_clean['user_id'].nunique()
    users_in_edges_not_in_nodes = set(df_edges_clean['user_id'].unique()) - set(df_users_clean['user_id'].unique())
    
    print(f"Unique users in edges: {unique_users_in_edges}")
    print(f"Unique users in user nodes: {unique_users_in_nodes}")
    print(f"Users in edges but not in nodes: {len(users_in_edges_not_in_nodes)}")
    
    # Check merchant IDs
    unique_merchants_in_edges = df_edges_clean['merchant_id'].nunique()
    unique_merchants_in_nodes = df_merchants_clean['merchant_id'].nunique()
    merchants_in_edges_not_in_nodes = set(df_edges_clean['merchant_id'].unique()) - set(df_merchants_clean['merchant_id'].unique())
    
    print(f"\nUnique merchants in edges: {unique_merchants_in_edges}")
    print(f"Unique merchants in merchant nodes: {unique_merchants_in_nodes}")
    print(f"Merchants in edges but not in nodes: {len(merchants_in_edges_not_in_nodes)}")
    
    return len(users_in_edges_not_in_nodes) == 0 and len(merchants_in_edges_not_in_nodes) == 0


# Main execution function
def clean_all_dataframes(df_edges, df_nodes_user, df_nodes_merchant):
    """
    Clean all three DataFrames and ensure consistency.
    Returns: (df_edges_clean, df_users_clean, df_merchants_clean)
    """
    print("Cleaning edges (transactions)...")
    df_edges_clean = clean_edges_dataframe(df_edges)
    
    print("Cleaning user nodes...")
    df_users_clean = clean_user_nodes(df_nodes_user)
    
    print("Cleaning merchant nodes...")
    df_merchants_clean = clean_merchant_nodes(df_nodes_merchant)
    
    # Validation
    is_valid = validate_graph_structure(df_edges_clean, df_users_clean, df_merchants_clean)
    
    # Final summary
    print("\n=== Cleaning Summary ===")
    print(f"Edges shape: {df_edges_clean.shape}")
    print(f"User nodes shape: {df_users_clean.shape}")
    print(f"Merchant nodes shape: {df_merchants_clean.shape}")
    print(f"\nFraud cases in edges: {df_edges_clean['label'].sum()}")
    print(f"Fraud rate: {df_edges_clean['label'].mean():.2%}")
    
    # Check for missing values
    print(f"\nMissing values in edges:")
    missing_edges = df_edges_clean.isnull().sum()
    if missing_edges.sum() > 0:
        print(missing_edges[missing_edges > 0])
    else:
        print("None!")
    
    print(f"\nMissing values in user nodes:")
    missing_users = df_users_clean.isnull().sum()
    if missing_users.sum() > 0:
        print(missing_users[missing_users > 0])
    else:
        print("None!")
    
    print(f"\nMissing values in merchant nodes:")
    missing_merchants = df_merchants_clean.isnull().sum()
    if missing_merchants.sum() > 0:
        print(missing_merchants[missing_merchants > 0])
    else:
        print("None!")
    
    print(f"\nGraph structure valid: {is_valid}")
    
    return df_edges_clean, df_users_clean, df_merchants_clean

df_edges_clean, df_users_clean, df_merchants_clean = clean_all_dataframes(
    df_edges, df_nodes_user, df_nodes_merchant
)

Cleaning edges (transactions)...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['years_since_pin_change'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['debt_to_income_ratio'].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the int

Cleaning user nodes...
Cleaning merchant nodes...

=== Graph Structure Validation ===
Unique users in edges: 100
Unique users in user nodes: 100
Users in edges but not in nodes: 0

Unique merchants in edges: 35838
Unique merchants in merchant nodes: 35838
Merchants in edges but not in nodes: 0

=== Cleaning Summary ===
Edges shape: (1258233, 22)
User nodes shape: (100, 22)
Merchant nodes shape: (35838, 6)

Fraud cases in edges: 1310
Fraud rate: 0.10%

Missing values in edges:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['debt_to_income_ratio'].fillna(df_clean['debt_to_income_ratio'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['income_vs_area'].fillna(df_clean['income_vs_area'].median(), inplace=True)
The behavior will change in pandas 3.0. This inpla

None!

Missing values in user nodes:
apartment    68
dtype: int64

Missing values in merchant nodes:
None!

Graph structure valid: True


# Check after processing


In [8]:
display(df_edges_clean)
display(df_users_clean)
display(df_merchants_clean)

Unnamed: 0,user_id,card_id,merchant_id,card_index,label,amount,timestamp,transaction_hour,day_of_week,use_chip,...,card_type,card_number,cvv,has_chip,cards_issued,credit_limit,card_on_dark_web,account_age_days,years_since_pin_change,months_until_expiry
0,0,0,0,0,0,134.09,2002-09-01 06:21:00,6,6,Swipe Transaction,...,Debit,4344676511950444,623,1,2,24295.0,0,0,0,246.0
1,0,0,1,0,0,38.48,2002-09-01 06:42:00,6,6,Swipe Transaction,...,Debit,4344676511950444,623,1,2,24295.0,0,0,0,246.0
2,0,0,1,0,0,120.34,2002-09-02 06:22:00,6,0,Swipe Transaction,...,Debit,4344676511950444,623,1,2,24295.0,0,1,0,246.0
3,0,0,3,0,0,128.95,2002-09-02 17:45:00,17,0,Swipe Transaction,...,Debit,4344676511950444,623,1,2,24295.0,0,1,0,246.0
4,0,0,4,0,0,104.71,2002-09-03 06:23:00,6,1,Swipe Transaction,...,Debit,4344676511950444,623,1,2,24295.0,0,2,0,246.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1258228,99,1,1258163,1,0,28.02,2020-02-26 15:53:00,15,2,Chip Transaction,...,Debit,5073514573038606,802,1,1,3106.0,0,25,0,36.0
1258229,99,1,1258120,1,0,15.25,2020-02-27 12:44:00,12,3,Chip Transaction,...,Debit,5073514573038606,802,1,1,3106.0,0,26,0,36.0
1258230,99,1,245250,1,0,40.00,2020-02-27 20:04:00,20,3,Chip Transaction,...,Debit,5073514573038606,802,1,1,3106.0,0,26,0,36.0
1258231,99,1,962927,1,0,146.31,2020-02-28 06:33:00,6,4,Chip Transaction,...,Debit,5073514573038606,802,1,1,3106.0,0,27,0,36.0


Unnamed: 0,user_id,name,current_age,retirement_age,birth_year,birth_month,gender,address,apartment,city,...,latitude,longitude,per_capita_income_zipcode,yearly_income_person,total_debt,fico_score,num_credit_cards,debt_to_income_ratio,income_vs_area,years_to_retirement
0,0,Hazel Robinson,53,66,1966,11,Female,462 Rose Lane,,La Verne,...,34.15,-117.76,29278.0,59696.0,127613.0,787,5,2.137714,2.038937,13
1,1,Sasha Sadr,53,68,1966,12,Female,3606 Federal Boulevard,,Little Neck,...,40.76,-73.74,37891.0,77254.0,191349.0,701,5,2.476881,2.038848,15
2,2,Saanvi Lee,81,67,1938,11,Female,766 Third Drive,,West Covina,...,34.02,-117.89,22681.0,33483.0,196.0,698,5,0.005854,1.476258,-14
3,3,Everlee Clark,63,63,1957,1,Female,3 Madison Street,,New York,...,40.71,-73.99,163145.0,249925.0,202328.0,722,4,0.809555,1.531919,0
4,4,Kyle Peterson,43,70,1976,9,Male,9620 Valley Stream Drive,,San Francisco,...,37.76,-122.44,53797.0,109687.0,183855.0,675,1,1.676179,2.038906,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,Carla James,23,64,1996,5,Female,2418 Mill Drive,3999.0,San Jose,...,37.30,-122.00,41750.0,85128.0,108678.0,641,1,1.276642,2.038994,41
96,96,Kase Nguyen,54,69,1965,9,Male,3683 Fifth Street,7760.0,Mount Holly,...,35.30,-81.03,22487.0,45852.0,114746.0,615,2,2.502530,2.039045,15
97,97,Annalee Sanders,33,67,1986,7,Female,0 Elm Street,,Glendale,...,33.53,-112.18,13024.0,26556.0,43042.0,840,2,1.620801,2.039005,34
98,98,Journee Collins,63,67,1956,11,Female,2302 Catherine Lane,,Hollywood,...,26.02,-80.16,19712.0,40194.0,107765.0,738,6,2.681122,2.039062,4


Unnamed: 0,merchant_id,merchant_name,city,state,zipcode,mcc
0,0,3527213246127876953,La Verne,CA,91750.0,5300
1,1,-727612092139916043,Monterey Park,CA,91754.0,5411
2,3,3414527459579106770,Monterey Park,CA,91754.0,5651
3,4,5817218446178736267,La Verne,CA,91750.0,5912
4,5,-7146670748125200898,Monterey Park,CA,91755.0,5970
...,...,...,...,...,...,...
35833,1258191,-4191425613099110081,Littleton,CO,80163.0,7538
35834,1258196,-4710864390556265742,Littleton,CO,80162.0,5310
35835,1258213,-3265671264153192329,Garland,TX,75040.0,7832
35836,1258216,-4095516467351300576,Irving,TX,75038.0,5651


# Save

In [11]:
df_edges_clean.to_csv("../data/gold/u100_df_edges_clean.csv", index=False)
df_users_clean.to_csv("../data/gold/u100_df_nodes_user_clean.csv", index=False)
df_merchants_clean.to_csv("../data/gold/u100_df_nodes_merchant_clean.csv", index=False)