## Imports

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

## Load data

In [20]:
train = pd.read_csv("training.csv")
test = pd.read_csv("test.csv")

train.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


## Preprocess data

In [21]:
def preprocess_data(df):
    # Remove any empty columns
    df = df.dropna(axis=1, how='all')
    
    # Feature engineering
    df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
    df['hour'] = df['TransactionStartTime'].dt.hour
    df['day'] = df['TransactionStartTime'].dt.day
    df['month'] = df['TransactionStartTime'].dt.month
    df['weekday'] = df['TransactionStartTime'].dt.weekday
    
    # Drop unnecessary columns
    df = df.drop(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'TransactionStartTime'], axis=1)
    
    # Label encode categorical variables
    le = LabelEncoder()
    for col in ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId']:
        df[col] = le.fit_transform(df[col])
    
    return df


train = preprocess_data(train)
test = preprocess_data(test)

train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])


Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult,hour,day,month,weekday
0,5,1,0,2,1000.0,1000,2,0,2,15,11,3
1,3,19,2,1,-20.0,20,2,0,2,15,11,3
2,5,0,0,2,500.0,500,2,0,2,15,11,3
3,0,11,8,2,20000.0,21800,2,0,3,15,11,3
4,3,19,2,1,-644.0,644,2,0,3,15,11,3


In [22]:
X = train.drop(['FraudResult'], axis=1)
y = train['FraudResult']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversampling using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling
sc = StandardScaler()
X_train_resampled = sc.fit_transform(X_train_resampled)
X_val = sc.transform(X_val)
test = sc.transform(test)

X_train.head()

Unnamed: 0,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,hour,day,month,weekday
59096,3,19,2,1,-100.0,100,2,8,13,1,6
35991,5,1,0,2,5000.0,5000,2,16,21,12,4
69457,5,16,0,2,1000.0,1000,2,18,23,1,2
58498,3,19,2,1,-70.0,70,2,11,12,1,5
88790,5,1,0,2,20000.0,20000,2,7,8,2,4


## Train the models

### Logistic Regression

In [23]:
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)
y_pred_log_reg = log_reg.predict(X_val)

# Evaluate the model
print("Logistic Regression Results:")
print(classification_report(y_val, y_pred_log_reg))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     19097
           1       0.14      1.00      0.25        36

    accuracy                           0.99     19133
   macro avg       0.57      0.99      0.62     19133
weighted avg       1.00      0.99      0.99     19133



### Random Forest

In [24]:
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train_resampled, y_train_resampled)
y_pred_val = random_forest.predict(X_val)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_val, y_pred_val))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.90      1.00      0.95        36

    accuracy                           1.00     19133
   macro avg       0.95      1.00      0.97     19133
weighted avg       1.00      1.00      1.00     19133



### XGBoost

In [25]:
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)
y_pred_xgb = xgb.predict(X_val)

# Evaluate the model
print("XGBoost Results:")
print(classification_report(y_val, y_pred_xgb))

XGBoost Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.82      1.00      0.90        36

    accuracy                           1.00     19133
   macro avg       0.91      1.00      0.95     19133
weighted avg       1.00      1.00      1.00     19133



In [26]:
# Make predictions on the test set using the Random Forest model
y_pred_test = random_forest.predict(test)

## Create a submission file

In [27]:
submission = pd.read_csv("test.csv")[['TransactionId']]
submission['FraudResult'] = y_pred_test

submission.to_csv("submission.csv", index=False)