In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import joblib
from scipy.stats import chi2_contingency
import numpy as np

In [40]:
# Read the transaction records from disk, then print the per-column
# non-null counts and dtypes of the resulting frame.
data = pd.read_csv(filepath_or_buffer='transaction_data_updated.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781207 entries, 0 to 781206
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                781207 non-null  int64  
 1   accountNumber             781207 non-null  int64  
 2   customerId                781207 non-null  int64  
 3   creditLimit               781207 non-null  float64
 4   availableMoney            781207 non-null  float64
 5   transactionDateTime       781207 non-null  object 
 6   transactionAmount         781207 non-null  float64
 7   merchantName              781207 non-null  object 
 8   acqCountry                781207 non-null  object 
 9   merchantCountryCode       781207 non-null  object 
 10  posEntryMode              781207 non-null  int64  
 11  posConditionCode          781207 non-null  int64  
 12  merchantCategoryCode      781207 non-null  object 
 13  currentExpDate            781207 non-null  o

In [41]:
# Discard every row containing at least one missing value; `data` is modified in place.
data.dropna(axis='index', how='any', inplace=True)

In [42]:
# Name of the label column that the feature-screening cells compare against.
target_column = "isFraud"

In [43]:
# Score every object-dtype column by its association with the target using
# Cramér's V:  V = sqrt((chi2 / n) / (min(n_rows, n_cols) - 1)),
# computed on the column-vs-target contingency table.
# Result: `associations` maps column name -> V.
# NOTE(review): V is known to be inflated for columns with very many distinct
# values (IDs, timestamps); confirm the threshold used later accounts for that.
categorical_columns = data.select_dtypes(include=['object']).columns
associations = {}
for column in categorical_columns:
    contingency_table = pd.crosstab(data[column], data[target_column])
    min_dim = min(contingency_table.shape) - 1
    if min_dim <= 0:
        # Degenerate table: the column (or the target) has at most one level,
        # so it carries no association. Without this guard the formula divides
        # by zero and yields NaN, and NaN is never "< threshold", so a constant
        # column would silently survive the filtering step.
        associations[column] = 0.0
        continue
    # Only the chi-square statistic is needed; index instead of unpacking so
    # the interactive `_` variable is not overwritten.
    chi2 = chi2_contingency(contingency_table)[0]
    n = contingency_table.sum().sum()  # total number of observations
    phi2 = chi2 / n
    associations[column] = np.sqrt(phi2 / min_dim)

In [44]:
# Categorical columns whose association score with the target falls below
# this cut-off are removed from `data`.
association_threshold = 0.1
columns_to_drop = [
    name for name in associations
    if associations[name] < association_threshold
]
data.drop(columns=columns_to_drop, inplace=True)

In [45]:
# Cut-off on the absolute correlation with the target
correlation_threshold = 5e-3

# Integer and float columns are the candidates for the correlation screen
numeric_columns = data.select_dtypes(include=['int', 'float']).columns

# Correlation of each numeric column with the target (corrwith's default method)
correlations = data.loc[:, numeric_columns].corrwith(data[target_column])

In [46]:
# Drop numeric columns whose absolute correlation with the target is below
# the threshold. A NaN correlation (e.g. from a zero-variance column) is
# treated as "no correlation": `abs(NaN) < threshold` is False, so without the
# explicit check such columns would never be removed. The target column itself
# is always kept, even if it appears among the correlated columns.
columns_to_drop = [
    col
    for col, correlation in correlations.items()
    if col != target_column
    and (pd.isna(correlation) or abs(correlation) < correlation_threshold)
]
data.drop(columns=columns_to_drop, inplace=True)

In [47]:
# Re-encode every boolean column as 64-bit integers (False -> 0, True -> 1).
boolean_columns = data.select_dtypes(include='bool').columns
data[boolean_columns] = data.loc[:, boolean_columns].astype('int64')

In [48]:
# Flag recording whether any two columns turn out to hold identical values,
# and the column labels that the pairwise comparison iterates over.
same_values_found = False
columns = data.columns

In [49]:
# Remove columns that are exact element-wise copies of an earlier column.
# Duplicates are only recorded inside the loops and dropped once afterwards:
# dropping from `data` while still iterating over the previously captured
# `columns` made any later lookup of an already-removed label raise KeyError.
duplicate_columns = []
for i in range(len(columns)):
    if columns[i] in duplicate_columns:
        # Already known to be a copy of an earlier column; nothing new to learn.
        continue
    for j in range(i + 1, len(columns)):
        if columns[j] in duplicate_columns:
            continue
        if (data[columns[i]] == data[columns[j]]).all():
            same_values_found = True
            duplicate_columns.append(columns[j])

# Keep the first column of every identical group, drop the rest in one call.
data.drop(columns=duplicate_columns, inplace=True)

In [50]:
# Print the columns, non-null counts and dtypes left after the filtering cells above.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781207 entries, 0 to 781206
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   transactionDateTime       781207 non-null  object 
 1   transactionAmount         781207 non-null  float64
 2   merchantName              781207 non-null  object 
 3   posEntryMode              781207 non-null  int64  
 4   posConditionCode          781207 non-null  int64  
 5   accountOpenDate           781207 non-null  object 
 6   dateOfLastAddressChange   781207 non-null  object 
 7   cardCVV                   781207 non-null  int64  
 8   enteredCVV                781207 non-null  int64  
 9   currentBalance            781207 non-null  float64
 10  cardPresent               781207 non-null  int64  
 11  expirationDateKeyInMatch  781207 non-null  int64  
 12  isFraud                   781207 non-null  int64  
dtypes: float64(2), int64(7), object(4)
memory us

In [51]:
# Save the partially processed frame as a checkpoint; the index is not written.
data.to_csv(path_or_buf='semi-treated_data.csv', index=False)

In [52]:
# Convert every object column whose values ALL parse with the ISO 8601 format
# below into integer Unix timestamps (whole seconds since 1970-01-01).
# Each column is parsed once and the result is assigned directly, so no
# temporary "<col>_unix" column is created; a temporary would overwrite and
# then delete a real column that happened to carry that name.
# NOTE(review): how strictly `format` is matched (e.g. date-only strings)
# differs between pandas versions; confirm against the installed version.
iso_format = "%Y-%m-%dT%H:%M:%S"
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')

iso_flags = {}
for col in data.select_dtypes(include=['object']).columns:
    # Unparseable values become NaT, which disqualifies the whole column.
    parsed = pd.to_datetime(data[col], format=iso_format, errors='coerce')
    iso_flags[col] = bool(parsed.notnull().all())
    if iso_flags[col]:
        data[col] = (parsed - unix_epoch) // one_second

# Boolean Series indexed by object-column name: True where the column was converted.
iso_columns = pd.Series(iso_flags, dtype=bool)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781207 entries, 0 to 781206
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   transactionDateTime       781207 non-null  int64  
 1   transactionAmount         781207 non-null  float64
 2   merchantName              781207 non-null  object 
 3   posEntryMode              781207 non-null  int64  
 4   posConditionCode          781207 non-null  int64  
 5   accountOpenDate           781207 non-null  int64  
 6   dateOfLastAddressChange   781207 non-null  int64  
 7   cardCVV                   781207 non-null  int64  
 8   enteredCVV                781207 non-null  int64  
 9   currentBalance            781207 non-null  float64
 10  cardPresent               781207 non-null  int64  
 11  expirationDateKeyInMatch  781207 non-null  int64  
 12  isFraud                   781207 non-null  int64  
dtypes: float64(2), int64(10), object(1)
memory u

In [53]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of merchant names, then replace each name by its integer code.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder().fit(data['merchantName'])
data['merchantName'] = label_encoder.transform(data['merchantName'])

In [54]:
# Display the first rows of the fully numeric frame as a sanity check.
data.head()

Unnamed: 0,transactionDateTime,transactionAmount,merchantName,posEntryMode,posConditionCode,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud
0,1471098452,98.55,2085,2,1,1426291200,1426291200,414,414,0.0,0,0,0
1,1476162354,74.51,27,9,1,1426291200,1426291200,486,486,0.0,1,0,0
2,1478596719,7.47,1304,9,1,1426291200,1426291200,486,486,0.0,0,0,0
3,1481336090,7.47,1304,9,1,1426291200,1426291200,486,486,0.0,0,0,0
4,1458853486,71.18,2083,2,1,1438819200,1438819200,885,885,0.0,1,0,0


In [55]:
# Write the final, fully encoded frame to disk; the index is not written.
data.to_csv(path_or_buf='fully-treated_data.csv', index=False)