In [12]:
import pandas as pd
import json
import datetime

In [13]:
# Load the fraud-transaction training CSV from the user's home directory.
df = pd.read_csv(filepath_or_buffer='~/fraudTrain.csv')

In [14]:
# Remove the four columns that are left out of the serialized output, in one call.
df = df.drop(columns=['Unnamed: 0', 'trans_num', 'cc_num', 'unix_time'])

In [15]:
# Swap the numeric fraud flag for a readable label; values missing from the
# mapping become NaN.
mapping_dict = {
    1: 'Yes',
    0: 'No',
}
df = df.assign(is_fraud=df['is_fraud'].map(mapping_dict))

In [16]:
# Expand the single-letter gender codes; values missing from the mapping
# become NaN.
mapping_dict = {
    'M': 'Male',
    'F': 'Female',
}
df = df.assign(gender=df['gender'].map(mapping_dict))

## List Template Serialization

In [16]:
def list_template_serialization(row):
    """Convert one DataFrame row into a ``{column: value}`` dictionary.

    Args:
        row: A pandas Series whose index holds the column names.

    Returns:
        dict: One entry per index label, mapped to that label's value.
    """
    # Series.to_dict() hands back native Python scalars. Indexing the row
    # label by label yields numpy scalars when the row has a numeric dtype,
    # and json.dump cannot serialize numpy integers.
    return row.to_dict()

# Turn every row into a {column: value} dictionary.
serialized_data = df.apply(list_template_serialization, axis=1)

# Persist as JSON Lines: one dictionary per line.
with open('./serialized_dataset/list_template_serialized_data.jsonl', 'w') as file:
    for row in serialized_data:
        file.write(json.dumps(row) + '\n')


## Text Template Serialization

In [None]:
def text_template_serialization(row):
    """Describe a row as a run of "The <column> is <value>" sentences.

    Args:
        row: A pandas Series whose index holds the column names.

    Returns:
        str: The sentences joined by ". " and closed with a final period.
    """
    sentences = []
    for col in row.index:
        sentences.append(f'The {col} is {row[col]}')
    return '. '.join(sentences) + '.'


# Render every row as a single sentence-per-column string.
serialized_data = df.apply(text_template_serialization, axis=1)

# Persist as JSON Lines: one {"data": <text>} object per line.
with open('./serialized_dataset/text_template_serialized_data.jsonl', 'w') as file:
    for row in serialized_data:
        json.dump({"data": row}, file)
        file.write('\n')

## Manual Template Serialization

In [17]:
# Parse the transaction timestamp and the date of birth using their explicit
# formats.
df = df.assign(
    trans_date_trans_time=pd.to_datetime(df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S'),
    dob=pd.to_datetime(df['dob'], format='%Y-%m-%d'),
)

In [18]:
# Age in completed years as of the day the notebook is run.
# A plain year difference overstates the age by one for every customer whose
# birthday falls later in the current calendar year, so subtract one for them.
# NOTE(review): ages are relative to the run date, not the transaction date, so
# the values change between runs. Confirm this is intended.
current_date = datetime.datetime.now().date()
# Encode (month, day) as month * 100 + day so the two can be compared in one step.
birthday_pending = (
    (df['dob'].dt.month * 100 + df['dob'].dt.day)
    > (current_date.month * 100 + current_date.day)
)
df['age'] = current_date.year - df['dob'].dt.year - birthday_pending.astype(int)

In [19]:
# Down-sample to a balanced set: 500 rows labelled 'Yes' and 500 labelled 'No',
# each drawn with a fixed seed.

fraud_yes = df.loc[df['is_fraud'].eq('Yes')].sample(n=500, random_state=42)
fraud_no = df.loc[df['is_fraud'].eq('No')].sample(n=500, random_state=42)
# ignore_index renumbers the stacked rows 0..N-1 in the same step.
merged_df = pd.concat([fraud_yes, fraud_no], ignore_index=True)

In [20]:
# From here on, `df` is an independent copy of the balanced sample.
df = merged_df.copy(deep=True)

In [21]:
def map_df_to_template_and_store(df, file_path):
    """Render every row of ``df`` as a natural-language record and write JSON Lines.

    Each output line is a JSON object with two keys: ``"record"`` (the sentence
    template below, filled with the row's values) and ``"output"`` (the fraud
    question followed by the row's ``is_fraud`` value).

    Args:
        df: DataFrame providing every column referenced in the template.
        file_path: Destination path; the file is opened in write mode, so any
            existing content is replaced.
    """
    with open(file_path, 'w') as file:
        for _, row in df.iterrows():
            # A parsed date of birth is a timestamp whose string form carries a
            # meaningless "00:00:00"; keep only the calendar date. Values that
            # are not datetimes (e.g. raw strings) are written unchanged.
            dob = row['dob']
            if isinstance(dob, datetime.datetime):
                dob = dob.date()
            record = (
                # The first sentence ends with a period so that it does not
                # run into "The merchant is ...".
                f"The date and time of transaction is {row['trans_date_trans_time']}. "
                f"The merchant is {row['merchant']} "
                f"and transaction category is {row['category']}. "
                f"The transaction amount is {row['amt']}. "
                f"First name of customer is {row['first']} "
                f"and last name is {row['last']}, "
                f"gender of the customer is {row['gender']} "
                f"and lives in {row['street']}, {row['city']}, {row['state']} having zipcode {row['zip']}. "
                f"Latitude and Longitude of the location is {row['lat']} and {row['long']}. "
                f"The city population is {row['city_pop']}. "
                f"The customer is {row['job']} "
                f"and has date of birth {dob} and age is {row['age']}. "
                f"Latitude and Longitude of merchant location is {row['merch_lat']} and {row['merch_long']}."
            )
            output = f"Is there a fraud? {row['is_fraud']}"
            json_record = json.dumps({"record": record, "output": output})
            file.write(json_record + '\n')

In [22]:
# Write the manually templated records for the sampled frame as JSON Lines.
map_df_to_template_and_store(
    df=df,
    file_path='./serialized_dataset/manual_template_serialization.jsonl',
)