In [73]:
from src.data_processing import load_data, DataProcessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
import numpy as np
import cPickle

In [41]:
raw_data = load_data("data/data.json")

In [42]:
train, test = train_test_split(raw_data, test_size = 0.2)
pre_process = DataProcessing("train", train)
pre_process.fit()
pre_processed_df = pre_process.df

In [47]:
y = pre_processed_df["fraud"]
X = pre_processed_df.drop(["fraud"], axis = 1)

In [62]:
rf_grid = { 'n_estimators': [150],
                     'max_depth': [30],
                     'max_features': [None],
                     'random_state': [1]}

rf_gridsearch = GridSearchCV(RandomForestClassifier(),
                            rf_grid,
                            n_jobs=-1,
                            scoring='recall')
rf_gridsearch.fit(X, y)

print("best parameters:", rf_gridsearch.best_params_)
best_gdbr_model = rf_gridsearch.best_estimator_

('best parameters:', {'max_features': None, 'n_estimators': 150, 'random_state': 1, 'max_depth': 30})


In [71]:
best_rf = RandomForestClassifier(max_features=None, n_estimators=150, max_depth=30)
np.mean(cross_val_score(best_rf, X,y, scoring="precision", cv = 5, n_jobs=-1))

0.92819000989022427

In [67]:
pre_process_test = DataProcessing("test", test)
pre_process_test.fit()
y_test = pre_process_test.df["fraud"]
X_test = pre_process_test.df.drop(["fraud"], axis = 1)

In [68]:
best_rf.fit(X,y)
best_rf.score(X_test,y_test)

0.94630404463040452

In [70]:
recall_score(y_test, best_rf.predict(X_test))

0.64550264550264547

In [72]:
pre_process_all = DataProcessing("train", raw_data)
pre_process_all.fit()
X_all = pre_process_all.df.drop(["fraud"], axis = 1)
y_all = pre_process_all.df["fraud"]
best_rf = RandomForestClassifier(max_features=None, n_estimators=150, max_depth=30)
best_rf.fit(X_all,y_all)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
with open('data/rf_1.pkl', 'wb') as f:
    cPickle.dump(best_rf, f)