In [8]:
import pandas as pd
import joblib
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [38]:
df = pd.read_csv('../data/raw/fraud_data.csv')

In [46]:
class statistics:
    """_summary_
    """

    def __init__(self, df):
        self.df = df
        
    def get_types(self):
        return df.dtypes.value_counts()
    
    def get_label_statistics(self):
        label_values = self.df['is_fraud'].value_counts().index.to_list()
        print(f"Label values encountered: {label_values}")

        not_str_vals = df['is_fraud'][~df['is_fraud'].isin(['0', '1']).values]
        print(f"Not string values: {not_str_vals}")

        print(f"Not string values count: {not_str_vals.count()}")

    def consolidate(self):
        print("#####Types#####")
        print(self.get_types())

        print("#####Label#####")
        self.get_label_statistics()

In [47]:
stats = statistics(df)
stats.consolidate()

#####Types#####
object     9
float64    5
int64      1
Name: count, dtype: int64
#####Label#####
Label values encountered: ['0', '1', '1"2020-12-24 16:56:24"', '0"2019-01-01 00:00:44"']
Not string values: 1781    1"2020-12-24 16:56:24"
7780    0"2019-01-01 00:00:44"
Name: is_fraud, dtype: object
Not string values count: 2


In [53]:
def get_test_sample(df):
    """Get a sample of the data for testing.
    
    Args:
        df (pd.DataFrame): DataFrame with the data to be sampled
        state (str): State to be sampled
    
    Returns:
        pd.DataFrame: DataFrame with the sampled data
    """
    df_to_save = df

    random_samples = df_to_save.groupby('state').apply(lambda x: x.sample(2))
    indices_to_remove = random_samples.index.get_level_values(1)
    df_to_save = df_to_save.drop(indices_to_remove)
    df_to_save.to_csv(f'../data/processed/fraud_data_train.csv', index=False)
    return random_samples
test =  get_test_sample(df)

  random_samples = df_to_save.groupby('state').apply(lambda x: x.sample(2))


In [48]:
random_samples = df.groupby('state').apply(lambda x: x.sample(2))
indices_to_remove = random_samples.index.get_level_values(1)
df.drop(indices_to_remove).shape

  random_samples = df.groupby('state').apply(lambda x: x.sample(2))


(14420, 15)

In [9]:
class prepare_data:
    """Class responsible for preparing the data for training and prediction.
    """
    def __init__(self, state):
        self.df = pd.read_csv('../data/raw/fraud_data.csv')
        self.state = state

    def job_encode(self)->pd.DataFrame:
        """Perform one hot encoding on the categorical columns

        Returns:
            pd.DataFrame: Data Frame with new encoded columns
        """
        df_res = self.get_state_df(self.df.copy())

        encoder = LabelEncoder()
        encoded_column = encoder.fit_transform(df_res['job'])

        df_res['job'] = encoded_column

        joblib.dump(encoder, f'../model/artifacts/encoder_{self.state}.pkl')
        print("Encoder Salvo!")


        return df_res
    
    def convert_to_int(self, value):
        """Convert str values to int.

        Args:
            value (str, object): column value

        Returns:
            int: converted value
        """
        try:
            return int(value)  # Tentar converter para inteiro
        except (ValueError, TypeError):
            return None  # Substituir valores inválidos por None
        
    def get_age_trans_time(self, year, bd):
        return year - bd
        
    def get_age(self, df):
        df['year_trans'] = df['trans_date_trans_time'].apply(lambda t: int(t[6:10]))
        df['bt_year'] = df['dob'].apply(lambda bd: int(bd[-4:]))

        df['age_trans_time'] = df.apply(lambda x: self.get_age_trans_time(x.year_trans, x.bt_year), axis=1)
        df['dob'].apply(lambda bd: datetime.now().year - int(bd[-4:]))

        return df
    
    def age_cat(self, x):
        if x < 18:
            return '0-18'
        elif x < 35:
            return '18-35'
        elif x < 60:
            return '35-60'
        else:
            return '60+'
        
    def get_age_categorical(self, df):
        df['age_cat'] = df.apply(lambda x: self.age_cat(x.age_trans_time), axis=1)
    
    def fix_data_types(self)->pd.DataFrame:
        """Fix the data type on the label column(is_fraud).

        Returns:
            pd.DataFrame: DataFrame ready for modeling.
        """
        df_res = self.job_encode()
        label = df_res['is_fraud']
        print(f"Invalid: {label[~label.isin(['0', '1'])].index}")

        df_res = df_res.iloc[label[label.isin(['0', '1'])].index] ## Keep only valid data
        label = label.apply(self.convert_to_int)
        df_res['is_fraud'] = label
        return df_res
    
    def get_state_df(self, df):
        """Filter the DataFrame by state.

        Args:
            df (pd.DataFrame): DataFrame to filter

        Returns:
            pd.DataFrame: Filtered DataFrame
        """
        df_to_save = df
        return df_to_save[df_to_save['state'] == self.state].reset_index(drop=True)

    def run(self):
        """Execute all the steps to prepare the data. Save inside the processed folder.
        """
        cols_ignore = ['trans_date_trans_time', 'merchant', 'category', 'city', 'trans_num', 'dob']
        df_model = self.fix_data_types().sample(frac=1)
        df_model = df_model.drop(columns=cols_ignore, axis=1)
        df_model.to_csv(f'../data/processed/train_{self.state}.csv', index=False)

    def simplified_version(self):
        """Save a simplified version of the DF
           FOR TEST ONLY
        """
        df_to_save = self.fix_data_types()
        df_to_save = df_to_save[['amt', 'city_pop', 'lat', 'long', 'is_fraud', 'state']].sample(frac=1)
        df_to_save[df_to_save['state'] == self.state].to_csv(f'../data/processed/train_{self.state}_simp.csv', index=False)

    def get_json_format(self, df):
        """Convert from DataFrame to JSON format. 
           FOR TEST ONLY
        """

        df_json = df.to_json(orient='records')
        return df_json


In [10]:
prep = prepare_data(state='AK')
prep.run()

Encoder Salvo!
Invalid: Index([], dtype='int64')


In [56]:
test[test['state'] == 'AK'].to_json(orient='records')

'[{"trans_date_trans_time":"23-03-2019 01:09","merchant":"\\"Greenholt, Jacobi and Gleason\\"","category":"gas_transport","amt":9.94,"city":"Kaktovik","state":"AK","lat":66.6933,"long":-153.994,"city_pop":239,"job":"Careers information officer","dob":"01-04-1996","trans_num":"da81318af6e1918b067de24bbd9744d5","merch_lat":66.252098,"merch_long":-154.718147,"is_fraud":"1"},{"trans_date_trans_time":"08-01-2019 22:49","merchant":"Abshire PLC","category":"entertainment","amt":17.85,"city":"Wales","state":"AK","lat":64.7556,"long":-165.6723,"city_pop":145,"job":"\\"Administrator, education\\"","dob":"09-11-1939","trans_num":"5c9634262e76f3e5df1feff31d570c88","merch_lat":64.859572,"merch_long":-166.34388,"is_fraud":"0"}]'

In [57]:
test[test['state'] == 'CA'].to_json(orient='records')

'[{"trans_date_trans_time":"19-01-2019 16:14","merchant":"Kerluke-Abshire","category":"shopping_net","amt":52.41,"city":"Glendale","state":"CA","lat":34.1556,"long":-118.2322,"city_pop":172817,"job":"Advertising account planner","dob":"30-07-1982","trans_num":"8856ef90738aa96672fef64982e487ce","merch_lat":34.60708,"merch_long":-118.297407,"is_fraud":"0"},{"trans_date_trans_time":"03-07-2019 22:46","merchant":"\\"Schmeler, Bashirian and Price\\"","category":"shopping_net","amt":921.46,"city":"Grenada","state":"CA","lat":41.6125,"long":-122.5258,"city_pop":589,"job":"Systems analyst","dob":"21-12-1945","trans_num":"1f9e8c49fffd6f4127ef70bb9229c574","merch_lat":40.704872,"merch_long":-122.724311,"is_fraud":"1"}]'