In [1]:
import pandas as pd
import datetime
import numpy as np
import gym
from gym import spaces

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Building the Traditional Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv('Fraud_Data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB


In [3]:
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [21]:
# feature engineering

df['signup_date'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])
df['time_Since_signup'] = (df['purchase_time'] - df['signup_date']).dt.total_seconds()
df['time_of_day'] =df['purchase_time'].dt.hour

In [5]:
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,signup_date,time_Since_signup,time_of_day
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,2015-02-24 22:55:49,4506682.0,2
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,2015-06-07 20:39:50,17944.0,1
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,2015-01-01 18:52:44,1.0,18
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,2015-04-28 21:13:25,492085.0,13
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,2015-07-21 07:09:52,4361461.0,18


In [22]:
df['source'].unique()

array(['SEO', 'Ads', 'Direct'], dtype=object)

In [23]:
#select features for state representation

features = ['source', 'time_Since_signup', 'time_of_day', 'age', 'sex', 'device_id']

X = df[features]
y = df['class']


ohe = OneHotEncoder()
scaler = RobustScaler()

cat_cols = ['sex', 'device_id', 'source']
num_col = ['age', 'time_Since_signup', 'time_of_day']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, num_col),
        ('cat', ohe, cat_cols)
    ]
)

X_preprocessed = preprocessor.fit_transform(X)

In [24]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer

# Select numerical features to discretize
numerical_features = ['age', 'time_Since_signup', 'time_of_day']
categorical_features = list(set(X.columns) - set(numerical_features))

# Apply OneHotEncoder to categorical features and KBinsDiscretizer to numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_features)
    ]
)

# Fit and transform the features
X_preprocessed = preprocessor.fit_transform(X)

# Address class imbalance with SMOTE (convert sparse to dense only during resampling)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed.toarray(), y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Convert the training and testing data into tuples for Q-learning
X_train_tuples = [tuple(state) for state in X_train]
X_test_tuples = [tuple(state) for state in X_test]


MemoryError: Unable to allocate 155. GiB for an array with shape (151112, 137964) and data type float64

In [26]:
from sklearn.preprocessing import KBinsDiscretizer

n_bins = 10
kbins = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
X_train_discrete = kbins.fit_transform(X_train)
X_test_discrete = kbins.transform(X_test)

X_train_tuples = [tuple(state) for state in X_train_discrete]
X_test_tuples = [tuple(state) for state in X_test_discrete]

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [19]:
import random
from collections import defaultdict
state_size = X_train.shape[1]

action_size = 2

Q = np.zeros((state_size * 10, action_size))

alpha = 0.1
gamma = 0.9
epsilon = 0.1
n_actions = 3

q_table = defaultdict(lambda: np.zeros(n_actions))
n_episodes = 10

for epsiode in range(n_episodes):
    state = random.

def reward(prediction, actual):
    if prediction == actual:
        return 1
    else:
        return -1
    

for epsiode in range(10):
    for i in range(len(X_train)):
        state = X_train[i]
        if random.uniform(0,1) < epsilon:
            action = random.randint(0, action_size - 1)
        else:
            action = np.argmax(Q[np.argmax(state)])

        prediction = action
        reward = reward(prediction, y_train[i])

        best_next_action = np.argmax(Q[np.argmax(state)])
        Q[np.argmax(state), action] += alpha * (reward + gamma * Q[np.argmax(state), best_next_action] - Q[np.argmax(state), action])

predictions = []

for i in range(len(X_test)):
    state = X_test[i]
    action = np.argmax(Q[np.argmax(state)])
    predictions.append(action)

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]