In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import neighbors
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
TEST_SIZE = 0.2
RANDOM_SEED = 42
# read csv
url = "https://raw.githubusercontent.com/Fefevs09/Transaction-Fraud-ML/main/data/treino.csv"
df = pd.read_csv(url,sep='|', encoding="utf-8" )
print(df.info())

# information about data
df.select_dtypes(include="object").describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144352 entries, 0 to 144351
Data columns (total 26 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ssn         144352 non-null  object 
 1   cc_num      144352 non-null  int64  
 2   first       144352 non-null  object 
 3   last        144352 non-null  object 
 4   gender      144352 non-null  object 
 5   street      144352 non-null  object 
 6   city        144352 non-null  object 
 7   state       144352 non-null  object 
 8   zip         1468 non-null    float64
 9   lat         144352 non-null  float64
 10  long        144352 non-null  float64
 11  city_pop    144352 non-null  int64  
 12  job         144352 non-null  object 
 13  dob         144352 non-null  object 
 14  acct_num    144352 non-null  int64  
 15  profile     144352 non-null  object 
 16  trans_num   144194 non-null  object 
 17  trans_date  144194 non-null  object 
 18  trans_time  144194 non-null  object 
 19  un

Unnamed: 0,ssn,first,last,gender,street,city,state,job,dob,profile,trans_num,trans_date,trans_time,category,merchant
count,144352,144352,144352,144352,144352,144352,144352,144352,144352,144352,144194,144194,144194,144194,144194
unique,200,139,152,2,200,178,39,171,200,10,144194,367,68749,14,693
top,371-10-9244,Michael,Williams,F,72867 Angelica Spring Suite 064,Miami,CA,"Social research officer, government",1992-12-12,adults_2550_male_urban.json,556df8e74106df46d235d77659fb435f,2023-12-30,16:37:46,shopping_pos,fraud_Kilback LLC
freq,1791,6814,4434,73702,1791,3841,14224,3200,1791,43760,1,1039,10,14032,452


In [3]:
# drop collums don't usefull
df = df.dropna(subset="is_fraud")
df_train = df.drop(['ssn', 'cc_num', 'first', 'last', 'street', 'zip', 'unix_time', 'lat', 'long', 'acct_num', 'trans_num', 'merchant', 'merch_lat', 'merch_long', 'is_fraud'], axis=1)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144194 entries, 0 to 144351
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   gender      144194 non-null  object 
 1   city        144194 non-null  object 
 2   state       144194 non-null  object 
 3   city_pop    144194 non-null  int64  
 4   job         144194 non-null  object 
 5   dob         144194 non-null  object 
 6   profile     144194 non-null  object 
 7   trans_date  144194 non-null  object 
 8   trans_time  144194 non-null  object 
 9   category    144194 non-null  object 
 10  amt         144194 non-null  float64
dtypes: float64(1), int64(1), object(9)
memory usage: 13.2+ MB


In [4]:
# function to separate data datetime
def cleaning_data_datetime(data):
    # transform to datetime
    time_transaction = pd.to_datetime(data['trans_time'],format='%H:%M:%S')
    date_transaction = pd.to_datetime(data['trans_date'], format='%Y-%m-%d')

    # apply collumns
    data['hour'] = time_transaction.dt.hour
    data['minute'] = time_transaction.dt.minute
    data['seconds'] = time_transaction.dt.second
    data['year'] = date_transaction.dt.year
    data['month'] = date_transaction.dt.month
    data['day'] = date_transaction.dt.day
    data.drop(['trans_time', 'trans_date'], axis=1, inplace=True)

cleaning_data_datetime(df_train)
# function to get age of user
def get_user_age(data):
    year_user = pd.to_datetime(data['dob'], format="%Y-%m-%d")
    data['age'] = (data['year'] - year_user.dt.year)
    data.drop(['dob'], axis=1, inplace=True)
get_user_age(df_train)
# df_train.info()

# set street names to num
def set_street_num(data):
    data['street'] = data['street'].str.split().str[0]
    data['street'] = data['street'].astype(int)

# set_street_num(df_train)

In [5]:
# transform category columns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Supondo que 'df_train' seja o seu DataFrame original

# Extrair colunas categóricas
categorical_data = ['gender', 'job', 'profile', 'category', 'city', 'state']

# Inicializar LabelEncoder e OneHotEncoder
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder()

# Aplicar LabelEncoder para cada coluna categórica
for column in categorical_data:
    df_train[column] = label_encoder.fit_transform(df_train[column])

# # Aplicar OneHotEncoder para todas as colunas categóricas
# encoded_features = onehot_encoder.fit_transform(df_train[categorical_data])

# # Converter as features one-hot em DataFrame pandas
# encoded_df = pd.DataFrame(encoded_features.toarray(), columns=onehot_encoder.get_feature_names_out(categorical_data))

# # Concatenar o DataFrame one-hot com o DataFrame original
# df_encoded = pd.concat([df_train.drop(columns=categorical_data), encoded_df], axis=1)

# # Separar os dados em conjuntos de treino e teste
from sklearn.model_selection import train_test_split
y = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(df_train, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)


In [6]:
# cleaning null values
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

imputer = SimpleImputer()
X_train_transform = imputer.fit_transform(X_train)

lab = LabelEncoder()
y_train_transformed = lab.fit_transform(y_train)

X_test_transform = imputer.transform(X_test)
y_test_transform = lab.transform(y_test)

In [7]:
# training models
# algorithms machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# model = RandomForestClassifier()
# model.fit(X_train, y_train_transformed)

models = {
    "KNN": KNeighborsClassifier(),
    "Logist Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier()
}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    models_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        models_scores[name] = model.score(X_test, y_test)
    return models_scores


score = fit_and_score(models=models, X_train=X_train_transform, X_test=X_test_transform, y_train=y_train_transformed, y_test=y_test_transform)
print(score)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'KNN': 0.9918859877249558, 'Logist Regression': 0.9885918374423524, 'Random Forest': 0.9968445507819272}


In [8]:

# hyperparameters adjustements and cross validation
from sklearn.model_selection import RandomizedSearchCV

train_score = []
test_score = []

rf_grid = {
    "n_estimators": np.arange(10, 1000, 50),
    "max_depth": [None, 3,5,10],
    "min_samples_split": np.arange(2,20,2),
    "min_samples_leaf": np.arange(1, 20, 2)
}
rscv_rf = RandomizedSearchCV(RandomForestClassifier(),
                             param_distributions=rf_grid,
                             cv=5,
                             n_iter=20,
                             verbose=True)

rscv_rf.fit(X_train_transform, y_train_transformed)

print(rscv_rf.best_params_)
print(rscv_rf.score(X_test_transform, y_test_transform))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'n_estimators': 110, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None}
0.9966018239189985


In [9]:
# response
url_test = "https://raw.githubusercontent.com/Fefevs09/Transaction-Fraud-ML/main/data/teste.csv"
df = pd.read_csv(url_test, sep='|', encoding="utf-8")
df

Unnamed: 0,ssn,cc_num,first,last,gender,street,city,state,zip,lat,...,profile,trans_num,trans_date,trans_time,unix_time,category,amt,merchant,merch_lat,merch_long
0,359-72-3479,676334414486,Krista,Wang,F,556 Marilyn Fields,Saint Louis,MO,,38.63,...,adults_50up_female_urban.json,a4194096c6cc870b2e21b2b2f69a7706,2023-07-09,07:53:29,1.688900e+09,shopping_net,50.96,"fraud_Ruecker, Beer and Collier",,
1,145-16-0685,676296881433,Nicole,Berger,F,68222 Christina Glen Apt. 129,Celina,OH,,40.56,...,adults_2550_female_urban.json,f12645668a1a5a9f8192687f56095b5a,2023-09-16,15:33:04,1.694889e+09,entertainment,92.39,fraud_Schuppe LLC,,
2,802-90-3870,30280512927668,Cynthia,Alexander,F,95041 Gary Locks,Champaign,IL,,40.13,...,adults_2550_female_urban.json,7f8e03dcf31fbbb5a3547ed8c40fa54b,2023-07-06,16:39:59,1.688672e+09,shopping_net,5.32,"fraud_Little, Gutmann and Lynch",,
3,234-29-2150,4982150648900,Linda,Williams,F,001 Wallace Crossing,Calhan,CO,,38.96,...,adults_50up_female_urban.json,5f3cb674358918b82dbb5c12e60615ef,2023-04-10,02:45:30,1.681106e+09,gas_transport,119.82,"fraud_Reilly, Heaney and Cole",,
4,802-90-3870,30280512927668,Cynthia,Alexander,F,95041 Gary Locks,Champaign,IL,,40.13,...,adults_2550_female_urban.json,a6537f4cfe7b5831fa481d520621b72d,2023-03-16,09:08:48,1.678969e+09,gas_transport,41.59,fraud_Kling Inc,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36042,013-38-6780,6011839826224441,Johnny,Brown,M,6471 King Gateway Suite 993,Davis City,IA,,40.63,...,adults_50up_male_rural.json,9864161f542d03b0054fef53c3abb273,2023-06-29,03:02:23,1.688019e+09,misc_net,12.24,fraud_Corwin-Gorczany,,
36043,883-36-4279,4328309652074461,Jennifer,Doyle,F,3875 Bruce Ville,Walnut Grove,MN,,44.23,...,adults_2550_female_rural.json,f4e47acc6e0329a51606b04d630fe54b,2023-11-18,14:46:13,1.700330e+09,health_fitness,9.24,"fraud_Hyatt, Russel and Gleichner",,
36044,795-93-3171,4212281606484229407,Erin,Mckinney,F,25909 Henry Ports,Columbia,MD,,39.21,...,young_adults_female_urban.json,3ddceea33fc57d78ea462439271f8392,2023-12-02,18:36:32,1.701553e+09,health_fitness,186.21,fraud_Romaguera Ltd,,
36045,506-31-3554,3576719903849837,Anthony,Hernandez,M,20994 Knapp Causeway Apt. 622,Greensboro,NC,,36.18,...,adults_2550_male_urban.json,582e6c05bf038dedb8a301c092b3d09f,2023-06-04,02:48:45,1.685858e+09,grocery_pos,31.19,fraud_Murray-Smitham,,


In [11]:
df_test = df.drop(['ssn', 'cc_num', 'first', 'last', 'street', 'zip', 'unix_time', 'lat', 'long', 'acct_num', 'trans_num', 'merchant', 'merch_lat', 'merch_long'], axis=1)
df_test.info()
cleaning_data_datetime(df_test)
get_user_age(df_test)

# set_street_num(df_test)
for column in categorical_data:
    df_test[column] = label_encoder.fit_transform(df_test[column])

#
# y_sub = knn.predict(X_train)
y_sub = rscv_rf.predict(df_test)

# generate response dataframe
response = pd.DataFrame()
response['trans_num'] =  df['trans_num']
response['is_fraud'] = y_sub
# response['is_fraud'].value_counts()
response.to_csv('../data/submissao.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36047 entries, 0 to 36046
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   gender      36047 non-null  object 
 1   city        36047 non-null  object 
 2   state       36047 non-null  object 
 3   city_pop    36047 non-null  int64  
 4   job         36047 non-null  object 
 5   dob         36047 non-null  object 
 6   profile     36047 non-null  object 
 7   trans_date  36047 non-null  object 
 8   trans_time  36047 non-null  object 
 9   category    36047 non-null  object 
 10  amt         36047 non-null  float64
dtypes: float64(1), int64(1), object(9)
memory usage: 3.0+ MB


