In [2]:
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [4]:
# Load the raw transactions CSV into the notebook-wide `df`.
df = pd.read_csv('../data/raw/fraud_data.csv')

In [46]:
class statistics:
    """Print quick diagnostics about a transactions DataFrame.

    The instance wraps one DataFrame and reports how many columns it has per
    dtype, plus which values of the ``is_fraud`` label column are not one of
    the expected strings ``'0'`` / ``'1'``.

    Args:
        df (pd.DataFrame): frame to inspect. It must contain an ``is_fraud``
            column for the label report.
    """

    def __init__(self, df):
        self.df = df

    def get_types(self):
        """Count the columns of the wrapped frame per dtype.

        Returns:
            pd.Series: dtype -> number of columns having that dtype.
        """
        # Read the frame held by this instance, never a notebook-global `df`:
        # otherwise the report silently describes whatever `df` happens to be
        # in the kernel (or raises NameError when there is none).
        return self.df.dtypes.value_counts()

    def get_label_statistics(self):
        """Print the distinct label values and the rows with unexpected ones.

        A value is "unexpected" when it is not exactly the string ``'0'`` or
        ``'1'``.
        """
        label = self.df['is_fraud']

        label_values = label.value_counts().index.to_list()
        print(f"Label values encountered: {label_values}")

        # Rows whose label is anything other than the two valid strings.
        not_str_vals = label[~label.isin(['0', '1'])]
        print(f"Not string values: {not_str_vals}")

        print(f"Not string values count: {not_str_vals.count()}")

    def consolidate(self):
        """Print the dtype report followed by the label report."""
        print("#####Types#####")
        print(self.get_types())

        print("#####Label#####")
        self.get_label_statistics()

In [47]:
# Print the dtype counts and the label diagnostics for the raw frame.
stats = statistics(df)
stats.consolidate()

#####Types#####
object     9
float64    5
int64      1
Name: count, dtype: int64
#####Label#####
Label values encountered: ['0', '1', '1"2020-12-24 16:56:24"', '0"2019-01-01 00:00:44"']
Not string values: 1781    1"2020-12-24 16:56:24"
7780    0"2019-01-01 00:00:44"
Name: is_fraud, dtype: object
Not string values count: 2


In [9]:
class prepare_data:
    """Prepare the fraud data of a single state for training and prediction.

    The pipeline (see ``run``) filters the raw frame to ``state``,
    label-encodes the ``job`` column and persists the fitted encoder, drops
    rows whose ``is_fraud`` label is not ``'0'``/``'1'``, casts the label to
    int, shuffles the rows and writes the result under ``../data/processed``.

    Args:
        state (str): value of the ``state`` column to keep.
        df (pd.DataFrame, optional): an already-loaded raw frame. When omitted
            the raw CSV is read from disk. Passing it avoids re-reading the
            file once per state when looping over many states.
    """
    def __init__(self, state, df=None):
        self.df = pd.read_csv('../data/raw/fraud_data.csv') if df is None else df
        self.state = state

    def job_encode(self)->pd.DataFrame:
        """Label-encode the ``job`` column for the rows of this state.

        The fitted ``LabelEncoder`` is saved to
        ``../model/artifacts/encoder_<state>.pkl``.

        Returns:
            pd.DataFrame: state-filtered frame whose ``job`` column holds the
            integer codes produced by the encoder.
        """
        # Copy after filtering so only this state's rows are duplicated and
        # the source frame is never modified.
        df_res = self.get_state_df(self.df).copy()

        encoder = LabelEncoder()
        df_res['job'] = encoder.fit_transform(df_res['job'])

        joblib.dump(encoder, f'../model/artifacts/encoder_{self.state}.pkl')
        print("Encoder Salvo!")

        return df_res

    def convert_to_int(self, value):
        """Convert a single value to int.

        Args:
            value (str, object): column value

        Returns:
            int | None: the converted value, or ``None`` when ``value`` cannot
            be parsed as an integer.
        """
        try:
            return int(value)  # try to convert to an integer
        except (ValueError, TypeError):
            return None  # invalid values are replaced by None

    def fix_data_types(self)->pd.DataFrame:
        """Drop rows with an invalid label and cast ``is_fraud`` to int.

        A label is valid only when it is exactly the string ``'0'`` or
        ``'1'``. The index labels of the invalid rows are printed.

        Returns:
            pd.DataFrame: DataFrame ready for modeling, with an integer
            ``is_fraud`` column.
        """
        df_res = self.job_encode()
        label = df_res['is_fraud']
        valid = label.isin(['0', '1'])
        print(f"Invalid: {label[~valid].index}")

        # Select with a boolean mask (independent of the index values) and
        # work on an explicit copy so the assignment below is unambiguous.
        df_res = df_res.loc[valid].copy()  # keep only valid data

        # Convert after filtering: converting the unfiltered column yields
        # None for the bad rows, which silently turns the label into floats.
        df_res['is_fraud'] = df_res['is_fraud'].map(self.convert_to_int).astype(int)
        return df_res

    def get_state_df(self, df):
        """Filter the DataFrame by state.

        Args:
            df (pd.DataFrame): DataFrame to filter

        Returns:
            pd.DataFrame: rows whose ``state`` equals ``self.state``, with a
            fresh 0..n-1 index.
        """
        return df[df['state'] == self.state].reset_index(drop=True)

    def run(self, random_state=None):
        """Execute all the steps to prepare the data. Save inside the processed folder.

        Args:
            random_state (int, optional): seed for the row shuffle. The
                default ``None`` gives a different order on every call; pass
                an int for a reproducible file.
        """
        cols_ignore = ['trans_date_trans_time', 'merchant', 'category', 'city', 'trans_num', 'dob']
        df_model = self.fix_data_types().sample(frac=1, random_state=random_state)
        df_model = df_model.drop(columns=cols_ignore)
        df_model.to_csv(f'../data/processed/train_{self.state}.csv', index=False)

    def simplified_version(self, random_state=None):
        """Save a simplified version of the DF
           FOR TEST ONLY

        Args:
            random_state (int, optional): seed for the row shuffle; ``None``
                keeps the shuffle unseeded.
        """
        simple_cols = ['amt', 'city_pop', 'lat', 'long', 'is_fraud', 'state']
        # fix_data_types() already returns only this state's rows, so no
        # second state filter is needed here.
        df_to_save = self.fix_data_types()[simple_cols].sample(frac=1, random_state=random_state)
        df_to_save.to_csv(f'../data/processed/train_{self.state}_simp.csv', index=False)

    def get_json_format(self, df):
        """Convert from DataFrame to JSON format.
           FOR TEST ONLY

        Args:
            df (pd.DataFrame): frame to serialise.

        Returns:
            str: JSON array with one object per row (``orient='records'``).
        """
        return df.to_json(orient='records')


In [10]:
# Run the whole preparation pipeline for the 'AK' state only.
prep = prepare_data(state='AK')
prep.run()

Encoder Salvo!
Invalid: Index([], dtype='int64')


In [85]:
# Shape of a one-row slice: the second element is the number of columns.
df.head(1).shape

(1, 15)

In [60]:
# Reload the raw data so the one-hot experiment starts from an untouched frame.
df = pd.read_csv('../data/raw/fraud_data.csv')

# Columns excluded from the categorical feature list.
cols_ignore = ['trans_date_trans_time', 'state', 'trans_num', 'dob', 'is_fraud']

# Non-object columns are treated as numeric features as they are.
numerical_columns = df.select_dtypes(exclude=['object']).columns

# Object columns, minus the ignored ones, are the categorical features.
categorical_columns = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col not in cols_ignore
]

In [70]:
# Show which object columns remain after removing the ignored ones.
categorical_columns

['merchant', 'category', 'city', 'job']

In [63]:
# One-hot encode the categorical columns over the whole raw frame.
# sparse_output=False returns a dense array; handle_unknown='ignore' makes a
# category unseen during fit encode as all zeros at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_columns = encoder.fit_transform(df[categorical_columns])

In [64]:
# Wrap the encoded array in a DataFrame whose column names come from the
# fitted encoder. No index is passed, so it gets a default 0..n-1 index.
encoded_column_names = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_columns, columns=encoded_column_names)

In [66]:
# Place the numeric columns and the one-hot columns side by side (aligned on
# the index). The columns in `cols_ignore`, including the `is_fraud` label,
# are in neither part and are therefore absent from `final_df`.
final_df = pd.concat([df[numerical_columns], encoded_df], axis=1)

In [11]:
# Prepare one training file per state.
# `states` is derived from the data here so the cell does not depend on a
# variable left over in the kernel. NOTE(review): this assumes every distinct
# value of the `state` column should be processed -- confirm.
states = df['state'].dropna().unique()
for state in states:
    prep = prepare_data(state)
    prep.run()

Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([401], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([168], dtype='int64')
Invalid: Index([], dtype='int64')


In [51]:
# Preview the JSON "records" layout on a few rows only: serialising the whole
# frame floods the notebook with every raw transaction.
print(df.head(3).to_json(orient='records'))

[{"trans_date_trans_time":"04-01-2019 00:58","merchant":"\"Stokes, Christiansen and Sipes\"","category":"grocery_net","amt":14.37,"city":"Wales","state":"AK","lat":64.7556,"long":-165.6723,"city_pop":145,"job":"\"Administrator, education\"","dob":"09-11-1939","trans_num":"a3806e984cec6ac0096d8184c64ad3a1","merch_lat":65.654142,"merch_long":-164.722603,"is_fraud":"1"},{"trans_date_trans_time":"04-01-2019 15:06","merchant":"Predovic Inc","category":"shopping_net","amt":966.11,"city":"Wales","state":"AK","lat":64.7556,"long":-165.6723,"city_pop":145,"job":"\"Administrator, education\"","dob":"09-11-1939","trans_num":"a59185fe1b9ccf21323f581d7477573f","merch_lat":65.468863,"merch_long":-165.473127,"is_fraud":"1"},{"trans_date_trans_time":"04-01-2019 22:37","merchant":"Wisozk and Sons","category":"misc_pos","amt":49.61,"city":"Wales","state":"AK","lat":64.7556,"long":-165.6723,"city_pop":145,"job":"\"Administrator, education\"","dob":"09-11-1939","trans_num":"86ba3a888b42cd3925881fa34177b4e

In [86]:
# JSON for the first row only, as a one-element array of records.
df.head(1).to_json(orient='records')

'[{"trans_date_trans_time":"04-01-2019 00:58","merchant":"\\"Stokes, Christiansen and Sipes\\"","category":"grocery_net","amt":14.37,"city":"Wales","state":"AK","lat":64.7556,"long":-165.6723,"city_pop":145,"job":"\\"Administrator, education\\"","dob":"09-11-1939","trans_num":"a3806e984cec6ac0096d8184c64ad3a1","merch_lat":65.654142,"merch_long":-164.722603,"is_fraud":"1"}]'

In [54]:
# Display the first raw row to compare against its JSON form.
df.head(1)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,04-01-2019 00:58,"""Stokes, Christiansen and Sipes""",grocery_net,14.37,Wales,AK,64.7556,-165.6723,145,"""Administrator, education""",09-11-1939,a3806e984cec6ac0096d8184c64ad3a1,65.654142,-164.722603,1
