<a href="https://colab.research.google.com/github/ChelseaGuan/COVID-19-Database-System/blob/main/Lab1_2021_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS4035 - Cyber Data Analytics
## Lab 1 - Fraud data

## Group Number : 8

## Student 1 
### Name : Nicolas Perez
### ID : 5081270

## Student 2
### Name : Chelsea Guan
### ID : 5695481

## 4. Classification task – 2 A4

In [1]:
# Mount Google Drive at /content/drive so the dataset paths used further down
# (which live under '/content/drive/My Drive/...') can be read.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data processing

In [2]:
import numpy as np
import pandas as pd

# Fill missing values and adjust currencies for both the training set and the test sets
# Note that part of this code was taken from the W1_Visuals notebook on Brightspace
def clean(data: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values, convert amounts to euros and add a simple feature.

    The input frame is not modified; a cleaned copy is returned.

    Steps:
      1. Missing ``cardverificationcodesupplied`` values become ``False`` and
         missing issuer/shopper country codes become the placeholder ``'ZZ'``.
      2. ``amount`` is divided by ``100 * rate`` where ``rate`` is the
         per-currency factor looked up from ``currencycode``.
      3. ``countries_equal`` is added: 1 when the shopper country equals the
         issuer country (after filling), 0 otherwise.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``cardverificationcodesupplied``,
        ``issuercountrycode``, ``shoppercountrycode``, ``amount`` and
        ``currencycode``.

    Returns
    -------
    pd.DataFrame
        Cleaned copy of ``data`` with the extra ``countries_equal`` column.

    Raises
    ------
    KeyError
        If ``currencycode`` contains a value (including NaN) that has no
        entry in the conversion table.
    """
    data = data.copy()

    # Filling missing values
    data.loc[data['cardverificationcodesupplied'].isna(), 'cardverificationcodesupplied'] = False
    data.loc[data['issuercountrycode'].isna(), 'issuercountrycode'] = 'ZZ'
    data.loc[data['shoppercountrycode'].isna(), 'shoppercountrycode'] = 'ZZ'

    # Standardize transaction amounts into euros.
    # Rates are units of the given currency per one euro; EUR itself is included
    # so that euro-denominated rows pass through instead of raising KeyError.
    currency_dict = {"EUR": 1.0,
                     "BGN": 1.9558, "NZD": 1.6805, "ILS": 4.0448, "RUB": 72.2099, "CAD": 1.5075, "USD": 1.1218,
                     "PHP": 58.125, "CHF": 1.1437, "ZAR": 16.0224, "AUD": 1.5911, "JPY": 124.93, "TRY": 6.6913,
                     "HKD": 8.8007, "MYR": 4.6314, "THB": 35.802, "HRK": 7.413, "NOK": 9.6678, "IDR": 15953.68,
                     "DKK": 7.4646, "CZK": 25.659, "HUF": 322.97, "GBP": 0.86248, "MXN": 21.2829, "KRW": 1308.01,
                     "ISK": 136.2, "SGD": 1.5263, "BRL": 4.405, "PLN": 4.2868, "INR": 78.0615, "RON": 4.7596,
                     "CNY": 7.5541, "SEK": 10.635}

    # Vectorized lookup instead of a row-wise apply; unmapped codes come back as NaN.
    rates = data['currencycode'].map(currency_dict)
    unknown = data.loc[rates.isna(), 'currencycode'].unique()
    if len(unknown) > 0:
        # Keep failing loudly on unknown currencies rather than silently
        # producing NaN amounts.
        raise KeyError(f"No conversion rate for currency code(s): {unknown.tolist()}")

    # The extra factor of 100 matches the original conversion
    # (presumably amounts are stored in minor units such as cents - TODO confirm).
    data['amount'] = data['amount'] / (rates * 100)

    # Simple feature engineering: 1 if shopper and issuer country match, else 0.
    # Casting the boolean mask directly gives a proper integer column.
    data['countries_equal'] = (data['shoppercountrycode'] == data['issuercountrycode']).astype(int)

    return data


In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from typing import Tuple

def encode(data: pd.DataFrame) -> pd.DataFrame:
    """Scale numerical columns and encode categorical columns.

    The input frame is not modified; an encoded copy is returned.

    Steps:
      1. ``amount`` is standardized with ``StandardScaler``.
      2. ``bin`` is replaced by integer labels from ``LabelEncoder``.
      3. ``issuercountrycode``, ``txvariantcode``, ``currencycode``,
         ``shoppercountrycode``, ``shopperinteraction`` and ``accountcode``
         are one-hot encoded; the original columns are dropped and the
         indicator columns are appended.

    NOTE(review): the scaler and encoders are fitted on whatever frame is
    passed in. The training cell calls this on the full dataset before the
    cross-validation split, so statistics from the test folds leak into the
    training folds - consider fitting on the training portion only.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns listed above.

    Returns
    -------
    pd.DataFrame
        Encoded copy of ``data`` with the same index.
    """
    data = data.copy()

    # Normalize numerical data.
    numerical_cols = ['amount']
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Label encode categorical data with too many unique values
    le = LabelEncoder()
    data['bin'] = le.fit_transform(data['bin'])

    # One hot encode other categorical data with fewer unique values
    categorical_cols = ['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode', 'shopperinteraction', 'accountcode']
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    # Fit once and transform in the same call; the feature names are available
    # after fitting, so a separate preliminary fit() is unnecessary.
    one_hot_encoded = ohe.fit_transform(data[categorical_cols])
    cat_feature_names = ohe.get_feature_names_out(categorical_cols)

    # Reuse the original index so the concat aligns row-for-row.
    one_hot_frame = pd.DataFrame(one_hot_encoded, columns=cat_feature_names, index=data.index)
    data = pd.concat([data.drop(categorical_cols, axis=1), one_hot_frame], axis=1)

    return data

## Training models (10 fold cross validation)
Takes about 15 minutes to run

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

# Load the training set, drop the identifier columns, then clean and encode it
train_path = '/content/drive/My Drive/Cyber Data Analytics/data/lab_1/train_data.csv'
data = encode(clean(pd.read_csv(train_path).drop(columns=['Id', 'mail_id', 'ip_id', 'card_id'])))

# Separate the feature matrix from the target column
X = data.drop('label', axis=1)
y = data['label']

# Forward-chaining splits: each fold trains on earlier rows and tests on later ones
tss = TimeSeriesSplit(n_splits=10)

# Hyper-parameters shared by the decision tree and the random forest
tree_params = dict(
    max_depth=100,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=10,
    random_state=42,
)

for i, (train_index, test_index) in enumerate(tss.split(X)):
    print(f'Fold {i}:')
    train_data = data.iloc[train_index]
    test_data = data.iloc[test_index]

    # Row-slice the pre-split features/labels for this fold
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Oversample the minority class in the training fold only
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    print('Done SMOTEing')

    # White-box model: a single decision tree
    wb = DecisionTreeClassifier(**tree_params)
    wb.fit(X_train_res, y_train_res)
    print('Done Fitting Classifier')
    y_pred_proba_wb = wb.predict_proba(X_test)[:, 1]
    auc_wb = roc_auc_score(y_test, y_pred_proba_wb)
    print('White-box AUC: ', auc_wb)

    # Black-box model: a random forest of 500 trees with the same tree settings
    bb = RandomForestClassifier(n_estimators=500, n_jobs=-1, **tree_params)
    bb.fit(X_train_res, y_train_res)
    y_pred_proba_bb = bb.predict_proba(X_test)[:, 1]
    auc_bb = roc_auc_score(y_test, y_pred_proba_bb)
    print('Black-box AUC: ', auc_bb, "\n")
    


Fold 0:
Done SMOTEing
Done Fitting Classifier
0.7675180044139854
0.9397970147520036 

Fold 1:
Done SMOTEing
Done Fitting Classifier
0.9596323674002037
0.9558536977779366 

Fold 2:
Done SMOTEing
Done Fitting Classifier
0.8575820600708766
0.9169592749665949 

Fold 3:
Done SMOTEing
Done Fitting Classifier
0.8755742524691819
0.9400869019269437 

Fold 4:
Done SMOTEing
Done Fitting Classifier
0.8351172914824069
0.9364209366533462 

Fold 5:
Done SMOTEing
Done Fitting Classifier
0.8024239450652791
0.9519019920507341 

Fold 6:
Done SMOTEing
Done Fitting Classifier
0.8058835770138559
0.8958235301837023 

Fold 7:
Done SMOTEing
Done Fitting Classifier
0.8274422131665512
0.9333491598834529 

Fold 8:
Done SMOTEing
Done Fitting Classifier
0.7256252112200068
0.9549256505576208 

Fold 9:
Done SMOTEing
Done Fitting Classifier
0.9092907005845654
0.9591184147017286 



## Improving AUC
This section is temporary and can be deleted later; for now, it is used to tune the hyperparameters of RandomForestClassifier and DecisionTreeClassifier.

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import roc_auc_score
# from imblearn.over_sampling import SMOTE

# data_path = '/content/drive/My Drive/Cyber Data Analytics/data/lab_1/train_data.csv'
# data = pd.read_csv(data_path)
# data = data.drop(['Id', 'mail_id', 'ip_id', 'card_id'], axis=1)

# data = clean(data)
# data = encode(data)

# # Split the data into features and labels
# X = data.drop('label', axis=1)
# y = data['label']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Apply SMOTE to the train set
# smote = SMOTE(random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# clf = DecisionTreeClassifier(
#                max_depth=100,
#                max_features='sqrt',
#                min_samples_leaf=4,
#                min_samples_split=10,
#                random_state=42)
# clf.fit(X_train_res, y_train_res)
# y_pred_proba = clf.predict_proba(X_test)[:,1]
# auc = roc_auc_score(y_test, y_pred_proba)
# print(auc, "\n")



### 4a. Print relevant plots and metrics with clear headings.

### 4b. Explain the applied data pre-processing steps, learning algorithms, and post-processing steps or ensemble methods. Compare the performance of the two algorithms, focusing on performance criteria that are relevant in practice, use 10-fold cross-validation.

## 5. Bonus task – 1 A4

The simplest aggregate feature is derived from txvariantcode, distinguishing debit cards from credit cards. Visa cards may warrant some extra features, as they come in many more variants.

Another aggregate feature is whether the shopper country is the same as the issuer country.

### 5a. Provide code and report below

## Kaggle

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import StratifiedKFold
# from imblearn.over_sampling import SMOTE

# train_path = '/content/drive/My Drive/Cyber Data Analytics/data/lab_1/train_data.csv'
# train_data = pd.read_csv(train_path)
# test_path = '/content/drive/My Drive/Cyber Data Analytics/data/lab_1/test_data.csv'
# test_data = pd.read_csv(test_path)
# id_test = test_data['Id']

# data = pd.concat([train_data, test_data])
# data = data.drop(['Id', 'mail_id', 'ip_id', 'card_id'], axis=1)

# data = clean(data)
# data = encode(data)

# train_data = data.iloc[:len(train_data), :]
# test_data = data.iloc[len(train_data):, :]

# # Split the data into features and labels
# X_train = train_data.drop('label', axis=1)
# y_train = train_data['label']
# X_test = test_data.drop('label', axis=1)

# # Apply SMOTE to the train set
# smote = SMOTE(random_state=42)
# X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# bb = RandomForestClassifier(
#                max_depth=100,
#                max_features='sqrt',
#                min_samples_leaf=4,
#                min_samples_split=10,
#                n_estimators=500,
#                random_state=42, n_jobs=-1)
# bb.fit(X_train_res, y_train_res)
# y_pred_proba_bb = bb.predict_proba(X_test)[:,1]

# output = pd.concat([id_test, pd.DataFrame({'Predicted': y_pred_proba_bb})], axis=1)
# output.to_csv('kaggle.csv', index=False)