<a href="https://colab.research.google.com/github/Francorider/Classification_Hackathon/blob/master/Classification_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Honour Code
I, FRANCOIS VILJOEN, confirm - by submitting this document - that the solutions in this notebook are a result of my own work and that I abide by the EDSA honour code (https://drive.google.com/file/d/1QDCjGZJ8-FmJE3bZdIQNwnJyQKPhHZBn/view?usp=sharing).

Non-compliance with the honour code constitutes a material breach of contract.

In [0]:
# IMPORT MODULES
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.metrics import f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [0]:
# IMPORT DATA
# The three CSV files are read from the current working directory, in the
# order listed, and bound to `train`, `test` and `sample_submission`.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('training.csv', 'test.csv', 'sample_submission.csv')
)

In [0]:
# DROP COLUMNS THAT APPEAR IRRELEVANT
# One shared list guarantees the same columns are removed from both frames.
irrelevant_columns = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
    'TransactionStartTime',
]

train_dropped = train.drop(columns = irrelevant_columns)
test_dropped = test.drop(columns = irrelevant_columns)

In [0]:
# MAKE SUBSETS OF DATA
# Independent copies: later cells modify `train_sub` / `test_sub`, and the
# `*_dropped` frames must not change as a side effect.
train_sub, test_sub = train_dropped.copy(), test_dropped.copy()

In [0]:
# STRIP THE TEXT PREFIX FROM ID COLUMNS (TRAINING AND TEST DATA)
# Each listed column is split on '_' and only the second piece is kept.
# The values stay strings; no integer conversion is performed here.
id_columns = ['TransactionId', 'ProviderId', 'ProductId', 'ChannelId']


def strip_id_prefix(frame, columns):
    """Return a new frame with the given columns reduced to their second '_'-separated piece.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding string columns to shorten. It is not modified.
    columns : list of str
        Names of the columns to process.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with the same column order, where every column in
        ``columns`` is replaced by ``frame[col].str.split('_').str[1]``.
    """
    stripped = {name: frame[name].str.split('_').str[1] for name in columns}
    return frame.assign(**stripped)


# Build from the untouched `*_dropped` frames instead of overwriting
# `train_sub` / `test_sub` in place: splitting an already-stripped value has
# no second piece and would turn it into NaN if this cell were run twice.
train_sub = strip_id_prefix(train_dropped, id_columns)
test_sub = strip_id_prefix(test_dropped, id_columns)

In [0]:
# SPLIT TRAINING DATA INTO X AND Y
# `FraudResult` is the target; every remaining column is kept as a feature.
target_column = 'FraudResult'
train_sub_y = train_sub[target_column]
train_sub_x = train_sub.drop(columns = target_column)

In [0]:
# CONCATENATE TRAINING AND TEST DATA TO ENCODE
# Both sets are stacked so they can be dummy-encoded together. A helper column
# `train` (1 = training row, 0 = test row) records where each row came from so
# the sets can be separated again afterwards.
# `.assign` adds the marker on a copy, so `train_sub_x` and `test_sub`
# themselves are left without the helper column.
combined = pd.concat([
    train_sub_x.assign(train = 1),
    test_sub.assign(train = 0),
])

In [0]:
# ENCODE CATEGORICAL COLUMNS
# Each listed column is replaced by one indicator column per distinct value.
categorical_columns = [
    "ProductCategory",
    "ProviderId",
    "ProductId",
    "ChannelId",
    "PricingStrategy",
]
combined = pd.get_dummies(combined, columns = categorical_columns)

In [0]:
# SPLIT BACK INTO TRAINING AND TEST DATA
# Rows are selected by the `train` marker (1 = training, 0 = test) and the
# marker column is then removed.
# The drop is done without `inplace = True`: an in-place drop on a
# boolean-filtered selection of `combined` raises SettingWithCopyWarning.
train_sub_x = combined[combined['train'] == 1].drop(columns = ['train'])

test_sub = combined[combined['train'] == 0].drop(columns = ['train'])

In [0]:
# SAMPLING
# Re-attach the target (aligned on the index) so that features and label are
# resampled together.
temp = train_sub_x.assign(FraudResult = train_sub_y)

majority = temp[temp['FraudResult'] == 0]
minority = temp[temp['FraudResult'] == 1]

# UPSAMPLE MINORITY CLASS
# Draw with replacement until the minority class has as many rows as the
# majority class. The fixed seed makes the draw, and therefore everything
# trained on it, repeatable across kernel restarts.
resample_seed = 42
minority_upsampled = resample(
    minority,
    replace = True,
    n_samples = len(majority),
    random_state = resample_seed,
)

# COMBINE MAJORITY WITH UPSAMPLED MINORITY
upsampled = pd.concat([majority, minority_upsampled])

In [0]:
# SPLIT UPSAMPLED DATA INTO X AND Y
# From here on `train_sub_x` / `train_sub_y` refer to the upsampled rows.
train_sub_y = upsampled['FraudResult']
train_sub_x = upsampled.drop(columns = ['FraudResult'])

In [0]:
# COUNTS
# Display the number of rows per FraudResult value in the upsampled target,
# to check that both classes now have the same size.
train_sub_y.value_counts()

1    95469
0    95469
Name: FraudResult, dtype: int64

In [0]:
# CREATE RANDOM FOREST MODEL
# 200 trees with entropy as the split criterion. The fixed seed makes the
# fitted forest, and therefore the predictions, reproducible between runs.
rf_seed = 42
rf = RandomForestClassifier(
    n_estimators = 200,
    criterion = 'entropy',
    random_state = rf_seed,
)

# FIT DATA
# NOTE(review): `TransactionId` is still one of the columns in `train_sub_x`
# and `test_sub`, so the model is trained on it - confirm this is intended.
rf.fit(train_sub_x, train_sub_y)

# PREDICT
y_pred = rf.predict(test_sub)

In [0]:
# COMPILE SUBMISSION FILE
# Put the 'TransactionId_' prefix back on the stripped ids and pair each id
# with its predicted label, then write the result without the index column.
submission_id = 'TransactionId_' + test_sub['TransactionId'].values
submission = pd.DataFrame({
    'TransactionId': submission_id,
    'FraudResult': y_pred,
})
submission.to_csv('submission.csv', index=False)