In [128]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import precision_recall_fscore_support

In [3]:
data = pd.read_csv("dataset.csv")
data.shape

(6362620, 11)

In [78]:
payment_types = data['type'].unique().tolist()
Y = data['isFraud']
X = data.drop(['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1)
X['type'] = X['type'].apply(lambda x: payment_types.index(x))
print(X.shape)
print(Y.shape)

(6362620, 7)
(6362620,)


In [79]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(5090096, 7)
(1272524, 7)
(5090096,)
(1272524,)


In [93]:
k = 50
n = X.shape[1]
feature_list = X.iloc[0].keys()
m = int(n ** 0.5) + 1
j = 2

In [96]:
tranformed_X_train = []
tranformed_X_test = []
pca_models = []

rf = RandomForestClassifier(n_estimators=k)
for i in range(rf.n_estimators):
    pca = PCA(n_components=j)
    features = np.sort(np.random.choice(feature_list, size=m, replace=False))

    new_X_train = X_train[features]
    tranformed_X_train.append(pca.fit_transform(new_X_train))

    new_X_test = X_test[features]
    tranformed_X_test.append(pca.fit_transform(new_X_test))

    pca_models.append(pca)



In [132]:
# Dummy training to populate the estimators
rf.fit(X_train.head(10), Y_train.head(10))

In [112]:
for i, tree in enumerate(rf.estimators_):
    tree.fit(tranformed_X_train[i], Y_train)

In [125]:
predictions = [tree.predict(tranformed_X_test[i]) for i, tree in enumerate(rf.estimators_)]

rf_predictions = stats.mode(predictions, axis=0, keepdims=False)[0]
rf_predictions.shape

(1272524,)

In [131]:
result = precision_recall_fscore_support(Y_test, rf_predictions)
result
#               Class 0     Class 1
# Precision
# Recall
# Precision
# Num instances

(array([0.99887137, 1.        ]),
 array([1.        , 0.11358025]),
 array([0.99943537, 0.20399113]),
 array([1270904,    1620]))