# Model to Predict **Is_fraudulent**

In [24]:
import pickle
import joblib
import numpy as np
import pandas as pd
import requests
from scipy.stats import skew
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (LabelEncoder, PowerTransformer,
                                   StandardScaler)

def _fetch_usd_rate(currency, day):
    """Return the factor that converts an amount in ``currency`` to USD on ``day``."""
    # Rates By Exchange Rate API at https://www.exchangerate-api.com
    # NOTE(review): the URL path names USD while the query string asks for
    # ``base={currency}``; confirm against the API documentation that the
    # ``base``, ``symbols`` and ``date`` query parameters are honoured here.
    response = requests.get(
        f'https://api.exchangerate-api.com/v4/latest/USD?base={currency}&symbols=USD&date={day}',
        timeout=10,  # never hang the whole preprocessing run on a stalled connection
    )
    # Fail loudly on an HTTP error instead of parsing an error payload.
    response.raise_for_status()
    return response.json()['rates']['USD']


def engineer_features(df):
    """Build the model features from raw transaction rows.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      * cast ``send_amount`` to float,
      * parse ``transaction_datetime`` and derive ``hour``, ``dayofweek``
        and ``month`` from it,
      * convert every non-USD amount to USD, using one exchange-rate lookup
        per (currency, calendar day) pair,
      * rename ``send_amount`` to ``usd_amount`` and drop
        ``transaction_datetime``.

    Args:
        df: DataFrame with at least the columns ``send_amount``,
            ``send_currency`` and ``transaction_datetime``.

    Returns:
        A new DataFrame with the engineered columns.

    Raises:
        requests.RequestException: if an exchange-rate request fails, times
            out, or returns an HTTP error status.
        KeyError: if the rate response has no ``rates``/``USD`` entry.
    """
    df = df.copy()

    # Convert send_amount to float. This is the place to add handling of
    # ',' and '.' separators in money strings if that becomes necessary.
    df['send_amount'] = df['send_amount'].astype(float)
    df['transaction_datetime'] = pd.to_datetime(df['transaction_datetime'])

    # Extract hour, day of week, and month
    timestamps = df['transaction_datetime']
    df['hour'] = timestamps.dt.hour
    df['dayofweek'] = timestamps.dt.dayofweek
    df['month'] = timestamps.dt.month

    # Calendar day of every row, computed once rather than once per group.
    row_day = timestamps.dt.normalize()

    # Group by currency and date
    groups = df.groupby(['send_currency', pd.Grouper(key='transaction_datetime', freq='D')])

    for (currency, day), group in groups:
        # USD needs no conversion; an empty group has nothing to convert.
        if currency == 'USD' or group.empty:
            continue

        exchange_rate = _fetch_usd_rate(currency, day.date())

        # Convert currency
        mask = (df['send_currency'] == currency) & (row_day == day)
        df.loc[mask, 'send_amount'] *= exchange_rate

    df = df.rename(columns={'send_amount': 'usd_amount'})
    return df.drop('transaction_datetime', axis=1)


## Data Load and Preprocessing

In [23]:
# Read the raw transactions, then run them through the feature engineering step.
raw_transactions = pd.read_csv('transactions.csv')
df = engineer_features(raw_transactions)
df

Unnamed: 0,usd_amount,send_currency,is_recent,is_valid,is_fraudulent_transaction,hour,dayofweek,month
0,55.158830,MXN,True,True,False,12,6,3
1,7492.170000,USD,False,True,False,9,5,2
2,21014.393400,EUR,False,False,True,17,2,2
3,77807.040000,USD,False,False,False,19,2,2
4,65899.716400,EUR,True,False,True,16,1,3
...,...,...,...,...,...,...,...,...
95,67282.520200,EUR,True,False,True,9,2,3
96,96530.938100,EUR,True,False,True,17,3,3
97,4076.964383,MXN,True,False,True,7,1,2
98,1140.280000,USD,False,True,True,12,4,2


## Preparing the data for model training

In [None]:
# Target: the fraud flag. Features: every other column of df.
y = df['is_fraudulent_transaction']
X = df.drop(columns='is_fraudulent_transaction')

## Column Transformations

In [None]:
# Replace each currency code with an integer label.
label_encoder = LabelEncoder()
X['send_currency'] = label_encoder.fit_transform(X['send_currency'])

# Only the floating-point columns are inspected and scaled below.
float_cols = X.select_dtypes(include=['float']).columns

# Report the float columns whose skewness exceeds 0.5 in absolute value.
skewness = X[float_cols].apply(lambda col: skew(col.dropna()))
skewed_features = skewness[skewness.abs() > 0.5].index
print("Skewed features:\n", skewed_features)

# Disabled experiment: Yeo-Johnson transformation to reduce skewness, with the
# fitted transformer pickled so it can be reloaded and reused later.
# pt = PowerTransformer(method='yeo-johnson', standardize=False)
# X[skewed_features] = pt.fit_transform(X[skewed_features])
# with open('pt.pkl', 'wb') as f:
#     pickle.dump(pt, f)
# with open('pt.pkl', 'rb') as f:
#     pt = pickle.load(f)
# new_data = [[10, 11, 12], [13, 14, 15]]
# transformed_data = pt.transform(new_data)

# Standardise the float columns.
# NOTE(review): the scaler is fitted on all of X, while the train/test split
# happens in a later cell, so test rows contribute to the stored mean/std.
# Consider fitting on the training split only.
scaler = StandardScaler()
X[float_cols] = scaler.fit_transform(X[float_cols])

# Store the scaler's standard deviation and mean so they can be used later.
std = np.sqrt(scaler.var_)
for path, values in (
    ('../backend/api/ai_model/std_scaler.npy', std),
    ('../backend/api/ai_model/mean_scaler.npy', scaler.mean_),
):
    np.save(path, values)

In [None]:
# Check the encoding order: the position of each class in this array is the
# integer that label_encoder wrote into X['send_currency'].
label_encoder.classes_

## Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Baseline Model


In [None]:
# Baseline: a seeded 100-tree random forest trained on the training split.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Accuracy on the held-out split first, then on the training split.
print(f'Test set accuracy: {rf.score(X_test, y_test):.4f}')
print(f'Train set accuracy: {rf.score(X_train, y_train):.4f}')

# Persist the fitted model, then read it back to exercise the round trip.
baseline_path = '../backend/api/ai_model/RFC_baseline.pkl'
with open(baseline_path, 'wb') as model_file:
    pickle.dump(rf, model_file)
with open(baseline_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# One hand-written sample, already scaled and label-encoded.
# NOTE(review): the values appear to follow the feature column order of X
# (amount, currency code, is_recent, is_valid, hour, dayofweek, month) - confirm.
data_point = [[0.168634, 0, False, False, 17, 2, 2]]
y_pred = loaded_model.predict(data_point)
y_pred

## Optimized Model

In [None]:
# Pipeline: univariate feature selection (ANOVA F-test) followed by a random
# forest. The forest is seeded like the split and the baseline model so the
# search result, and therefore the persisted model, is reproducible.
pipeline = Pipeline([
    ('feature_selection', SelectKBest(f_classif)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameters to search over: 3 * 3 * 3 * 3 * 3 * 2 = 486 candidates,
# each fitted on 5 folds below.
# NOTE(review): a max_depth of 1 restricts every tree to a single split -
# confirm this value is intended.
param_grid = {
    'feature_selection__k': [1, 5, 7],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, 1],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}

# Use grid search with 5-fold cross-validation to find the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Evaluate the best model on the test set, then on the training set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {accuracy:.4f}')
print(f'Train set accuracy: {grid_search.score(X_train, y_train):.4f}')

# Save the fitted search object (it predicts with the refitted best pipeline)
joblib.dump(grid_search, '../backend/api/ai_model/rfc_opt.pkl')


In [None]:
# Score one hand-written, already scaled and label-encoded sample with the tuned model.
data_point = [
    [-0.442300, 2, False, True, 9, 5, 2],
]
y_pred = grid_search.predict(data_point)
y_pred