In [1]:
import pandas as pd
from sklearn import model_selection, ensemble, metrics

In [2]:
# Cleaned training table; its `Target` column is used further down as the label vector.
train = pd.read_csv(filepath_or_buffer='../../Feature_Engineering/data/other-cleaned_train.csv')

# Train / test feature tables (binary-encoded, going by the file names).
df_train_binary_encoding = pd.read_csv(filepath_or_buffer='../../Feature_Encoding/data/train_binary_encoding.csv')
df_test_binary_encoding = pd.read_csv(filepath_or_buffer='../../Feature_Encoding/data/test_binary_encoding.csv')

In [3]:
def cross_val(model, x_train, y_train):
    """Print the mean 5-fold cross-validation score of ``model``.

    The scores come from ``model_selection.cross_val_score``; no ``scoring``
    argument is passed, so scikit-learn's default for the estimator applies.

    Args:
        model: Estimator forwarded to ``cross_val_score``.
        x_train: Feature matrix used for the cross-validation.
        y_train: Labels matching ``x_train``.

    Returns:
        None. The mean score is only printed.
    """
    fold_scores = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    mean_score = fold_scores.mean()
    print(mean_score)

In [4]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = metrics.log_loss(y_test, predictions)
    accuracy = metrics.accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [5]:
def RandomForest_con_gridsearch(x_train, y_train, x_validation, y_validation,
                                param_grid=None, cv=5, scoring=None, random_state=None):
    """Tune a random forest with a grid search and return the best estimator.

    A ``RandomForestClassifier`` is fitted through ``GridSearchCV`` on the
    training data. The best parameter combination and the best estimator's
    ``score`` on the validation data are printed.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_validation: Validation feature matrix, used only for the printed score.
        y_validation: Validation labels.
        param_grid: Grid passed to ``GridSearchCV``. ``None`` (default) uses
            ``n_estimators`` in {50, 100, 150} and ``max_depth`` in {3, 10, 6, 15}.
        cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
        scoring: Scorer forwarded to ``GridSearchCV``. ``None`` keeps the
            library default; pass e.g. ``'neg_log_loss'`` to select on log loss.
        random_state: Seed forwarded to ``RandomForestClassifier``. ``None``
            (default) leaves the forest unseeded; pass an int for repeatable fits.

    Returns:
        The ``best_estimator_`` of the fitted grid search.
    """
    # Build the default grid per call so no mutable default is shared between calls.
    if param_grid is None:
        param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [3, 10, 6, 15]}

    rf = ensemble.RandomForestClassifier(random_state=random_state)
    rf_gs = model_selection.GridSearchCV(rf, param_grid, cv=cv, scoring=scoring)
    rf_gs.fit(x_train, y_train)
    rf_best = rf_gs.best_estimator_

    print(rf_gs.best_params_)
    print('rf: {}'.format(rf_best.score(x_validation, y_validation)))

    return rf_best

In [6]:
# Labels come from the cleaned training table, features from the encoded table.
# NOTE(review): this assumes both files list the same rows in the same order — confirm.
y = train.Target
assert len(df_train_binary_encoding) == len(y), "feature rows and labels differ in length"

# Hold out 30% of the rows for validation, stratified on the label.
# A fixed random_state makes the split (and every score below) repeatable across re-runs.
(
    x_train_binary_encoding,
    x_validation_binary_encoding,
    y_train_binary_encoding,
    y_validation_binary_encoding,
) = model_selection.train_test_split(
    df_train_binary_encoding,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

## Binary Encoding

In [7]:
# Grid-search a random forest on the training split; the helper prints the best
# parameters and the validation score, and returns the best estimator.
rf_binary = RandomForest_con_gridsearch(
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
    x_validation=x_validation_binary_encoding,
    y_validation=y_validation_binary_encoding,
)

{'max_depth': 15, 'n_estimators': 100}
rf: 0.910167818361303


In [8]:
# Accuracy and log loss of the tuned forest on the held-out validation split.
test_model(
    model=rf_binary,
    x_test=x_validation_binary_encoding,
    y_test=y_validation_binary_encoding,
)

Accuracy: 91.02%, Logloss: 0.24


In [9]:
# Mean 5-fold cross-validation score of the tuned forest on the training split.
cross_val(
    model=rf_binary,
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
)

0.902775408815975


In [10]:
# Column 1 of predict_proba (the second entry of rf_binary.classes_) for every test row.
y_pred = rf_binary.predict_proba(df_test_binary_encoding)[:, 1]

# One row per Opportunity_ID: rows sharing an id are collapsed to their mean prediction.
submission_rf_binary = (
    pd.DataFrame({'Opportunity_ID': df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred})
    .groupby("Opportunity_ID")["Target"]
    .mean()
    .reset_index()
)
submission_rf_binary.to_csv('../submits/binary_rf_2.csv', index=False)