In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [15]:
# Load the raw fraud dataset from the CSV into the notebook-level frame `df`.
df = pd.read_csv('../data/raw/fraud_data.csv')

In [46]:
class statistics:
    """Quick exploratory summaries of a fraud DataFrame.

    Every method reads the frame given to the constructor (``self.df``), so the
    results never depend on which ``df`` variable happens to exist in the kernel.

    Args:
        df (pd.DataFrame): Frame to inspect. ``get_label_statistics`` requires
            an ``is_fraud`` column.
    """

    def __init__(self, df):
        self.df = df

    def get_types(self):
        """Count how many columns share each dtype.

        Returns:
            pd.Series: Number of columns per dtype.
        """
        # Read the instance's frame, not a notebook-global `df`.
        return self.df.dtypes.value_counts()

    def get_label_statistics(self):
        """Print the distinct ``is_fraud`` values and the rows that are not '0'/'1'.

        Rows whose label is anything other than the strings ``'0'`` or ``'1'``
        are printed together with their count.
        """
        labels = self.df['is_fraud']
        label_values = labels.value_counts().index.to_list()
        print(f"Label values encountered: {label_values}")

        # Anything outside the two expected string labels is reported here.
        not_str_vals = labels[~labels.isin(['0', '1'])]
        print(f"Not string values: {not_str_vals}")

        print(f"Not string values count: {not_str_vals.count()}")

    def consolidate(self):
        """Print the dtype summary followed by the label summary."""
        print("#####Types#####")
        print(self.get_types())

        print("#####Label#####")
        self.get_label_statistics()

In [47]:
# Print the dtype counts and the `is_fraud` label check for the loaded frame.
stats = statistics(df)
stats.consolidate()

#####Types#####
object     9
float64    5
int64      1
Name: count, dtype: int64
#####Label#####
Label values encountered: ['0', '1', '1"2020-12-24 16:56:24"', '0"2019-01-01 00:00:44"']
Not string values: 1781    1"2020-12-24 16:56:24"
7780    0"2019-01-01 00:00:44"
Name: is_fraud, dtype: object
Not string values count: 2


In [10]:
class prepare_data:
    """Build a per-state modelling dataset from the raw fraud data.

    Steps: keep the rows of one state, one-hot encode the categorical columns,
    drop rows whose ``is_fraud`` label is malformed, cast the label to int,
    shuffle, and write the result under ``../data/processed``.

    Args:
        state: Value of the ``state`` column to keep.
        df (pd.DataFrame, optional): Already-loaded raw frame. When omitted the
            raw CSV at ``RAW_PATH`` is read, which is the original behaviour.
            Passing a frame avoids re-reading the file for every state; the
            frame is never modified.
        random_state (int, optional): Seed forwarded to ``DataFrame.sample`` so
            the shuffle can be reproduced. ``None`` keeps it unseeded.
    """

    RAW_PATH = '../data/raw/fraud_data.csv'
    # Object columns that are not one-hot encoded.
    COLS_IGNORE = ['trans_date_trans_time', 'state', 'trans_num', 'dob', 'is_fraud']
    # The only label values accepted as valid.
    VALID_LABELS = ['0', '1']

    def __init__(self, state, df=None, random_state=None):
        self.df = pd.read_csv(self.RAW_PATH) if df is None else df
        self.state = state
        self.random_state = random_state

    def one_hot_encode(self) -> pd.DataFrame:
        """Perform one hot encoding on the categorical columns.

        Object-dtype columns not listed in ``COLS_IGNORE`` are expanded with
        ``pd.get_dummies``. The ignored columns other than ``is_fraud`` are
        dropped and ``state`` is re-attached as the last column.

        Returns:
            pd.DataFrame: State-filtered frame with the new encoded columns.
        """
        # Boolean filtering already returns a new frame, so self.df is untouched.
        df_res = self.get_state_df(self.df)
        object_columns = df_res.select_dtypes(include=['object']).columns
        categorical_columns = [c for c in object_columns if c not in self.COLS_IGNORE]
        encoded = pd.get_dummies(df_res, columns=categorical_columns, dtype='int')

        # Drop every ignored column except the label.
        encoded = encoded.drop(columns=[c for c in self.COLS_IGNORE if c != 'is_fraud'])
        # Re-attach the state last so the output column order is unchanged.
        encoded['state'] = df_res['state']
        return encoded

    def convert_to_int(self, value):
        """Convert str values to int.

        Args:
            value (str, object): column value

        Returns:
            int | None: converted value, or None when it cannot be converted
        """
        try:
            return int(value)
        except (ValueError, TypeError):
            # Invalid values become None instead of raising.
            return None

    def fix_data_types(self) -> pd.DataFrame:
        """Fix the data type on the label column (is_fraud).

        Rows whose label is not ``'0'`` or ``'1'`` are reported and dropped; the
        remaining labels are cast to ``int64``.

        Returns:
            pd.DataFrame: DataFrame ready for modeling.
        """
        df_res = self.one_hot_encode()
        label = df_res['is_fraud']
        # Compare as text so labels already parsed as integers are accepted too.
        valid = label.astype(str).isin(self.VALID_LABELS)
        print(f"Invalid: {label[~valid].index}")

        # Select by mask and copy: no reliance on positional index, no slice assignment.
        df_res = df_res.loc[valid].copy()
        # Convert only the surviving labels so the column is int for every state.
        df_res['is_fraud'] = label[valid].apply(self.convert_to_int).astype('int64')
        return df_res

    def get_state_df(self, df):
        """Filter the DataFrame by state.

        Args:
            df (pd.DataFrame): DataFrame to filter

        Returns:
            pd.DataFrame: Rows whose ``state`` equals ``self.state``, re-indexed from 0.
        """
        return df[df['state'] == self.state].reset_index(drop=True)

    def run(self):
        """Execute all the steps to prepare the data. Save inside the processed folder."""
        df_model = self.fix_data_types().sample(frac=1, random_state=self.random_state)
        df_model.to_csv(f'../data/processed/train_{self.state}.csv', index=False)

    def simplified_version(self):
        """Save a simplified version of the DF.

        FOR TEST ONLY
        """
        simple_cols = ['amt', 'city_pop', 'lat', 'long', 'is_fraud', 'state']
        # fix_data_types() already returns rows of self.state only.
        df_to_save = self.fix_data_types()[simple_cols].sample(
            frac=1, random_state=self.random_state
        )
        df_to_save.to_csv(f'../data/processed/train_{self.state}_simp.csv', index=False)

    def get_json_format(self, df):
        """Convert from DataFrame to JSON format.

        FOR TEST ONLY

        Args:
            df (pd.DataFrame): Frame to serialise.

        Returns:
            str: JSON string with one object per row (``orient='records'``).
        """
        return df.to_json(orient='records')


In [7]:
# Read the raw CSV and collect the distinct values of its `state` column.
df = pd.read_csv('../data/raw/fraud_data.csv')
states = df['state'].unique()

In [11]:
# Prepare and save one processed dataset per state. Each run prints an
# "Invalid: ..." line listing the rows whose label was rejected.
for state in states:
    prep = prepare_data(state)
    prep.run()

Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([401], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([], dtype='int64')
Invalid: Index([168], dtype='int64')
Invalid: Index([], dtype='int64')
