In [0]:
# Import the libraries used to load the data, preprocess it and train the random forest
import io

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import files
# RandomForestClassifier is imported from the public `sklearn.ensemble` package:
# the private `sklearn.ensemble.forest` module was deprecated and later removed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,StandardScaler

# Ask the user to upload the CSV files (Colab only); `uploaded` is indexed by
# file name and yields the raw bytes of each uploaded file.
uploaded = files.upload()

In [0]:
# Parse the two uploaded CSV payloads into DataFrames.
# Rows containing any missing value are discarded from both tables
# (for the test table this means those rows receive no prediction later).
train_bytes = io.BytesIO(uploaded['Trainset.csv'])
data = pd.read_csv(train_bytes).dropna()

test_bytes = io.BytesIO(uploaded['xtest.csv'])
test = pd.read_csv(test_bytes).dropna()

In [0]:
# Split the data into features / target and remove unnecessary columns

# Names of the columns used as model inputs
prediction_var = ['Homepage','Homepage _Duration', 'Aboutus', 'Aboutus_Duration', 'Contactus', 'Contactus_Duration',
           'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems',
           'Browser', 'Province', 'TrafficType','VisitorType','Weekend']

# .copy() gives X its own data: columns of X are overwritten when the
# categorical features are encoded, and assigning into a plain slice of
# `data` raises SettingWithCopyWarning and may not behave as intended.
X = data[prediction_var].copy()
Y = data.Revenue

# Keep the IDs aside for the submission file; ID is not a model input
test_id = test['ID']
test = test.drop(columns=['ID'])




In [0]:
# Clean the data: replace the categorical columns with integer codes
encoder = LabelEncoder()

# Work on an independent copy so the column assignments below never write
# into a slice of `data` (X is created by column selection from `data`).
X = X.copy()

# Each categorical feature gets ONE mapping, learned from the train and test
# values together, and that same mapping is applied to both tables.
# Fitting the encoder separately on each table can give the same category a
# different integer in train and test whenever the two tables do not contain
# exactly the same set of categories, which silently corrupts the predictions.
for column in ['Month', 'Weekend', 'VisitorType']:
    encoder.fit(pd.concat([X[column], test[column]]))
    X[column] = encoder.transform(X[column])
    test[column] = encoder.transform(test[column])

# Encode the target last, so `encoder` is left fitted on Y
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

In [0]:
# Cast the feature table, the test table and the target to floating point
X, test, Y = (table.astype(float) for table in (X, test, Y))


In [0]:
# Standardize the data
sc = StandardScaler()
# Learn the per-column mean / standard deviation from the training features
X = sc.fit_transform(X)
# Re-use those training statistics for the test set. Calling fit_transform
# here would rescale the test data with its own mean / std, so the model
# would receive features on a different scale than the one it was trained on.
test = sc.transform(test)


In [0]:
# Hold out 33% of the labelled rows for evaluation; the fixed seed keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=66,
    test_size=0.33,
)

In [0]:

# Random forest model creation
rfc = RandomForestClassifier(
    n_estimators=600,
    max_features='sqrt',
    max_depth=380,      # keyword arguments need `=`; `max_depth: 380` does not parse
    random_state=66,    # seed the forest so fits and scores are reproducible (same value as the split seed)
)
rfc.fit(X_train, y_train)

# Predictions on the held-out part of the labelled data
rfc_predict = rfc.predict(X_test)



In [0]:

# ROC AUC of the forest on each of 10 cross-validation folds of the full labelled set
rfc_cv_score = cross_val_score(
    estimator=rfc,
    X=X,
    y=Y,
    scoring='roc_auc',
    cv=10,
)

In [14]:
# Evaluation summary: hold-out confusion matrix and classification report,
# followed by the per-fold and mean cross-validated AUC.
report_sections = [
    ("=== Confusion Matrix ===", confusion_matrix(y_test, rfc_predict)),
    ("=== Classification Report ===", classification_report(y_test, rfc_predict)),
    ("=== All AUC Scores ===", rfc_cv_score),
]
for heading, body in report_sections:
    print(heading)
    print(body)
    print('\n')

print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[2792  128]
 [ 211  323]]


=== Classification Report ===
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.94      2920
         1.0       0.72      0.60      0.66       534

    accuracy                           0.90      3454
   macro avg       0.82      0.78      0.80      3454
weighted avg       0.90      0.90      0.90      3454



=== All AUC Scores ===
[0.93665384 0.92078611 0.92928437 0.9300272  0.92825122 0.90715253
 0.93242696 0.93604408 0.92574088 0.93695883]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.9283326014591932


In [17]:
# Tuning hyperparameters with a randomized search

# Number of trees in the random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features considered at every split.
# 'auto' is not used: for classifiers it meant exactly the same as 'sqrt'
# (so half of the candidates were duplicates) and recent scikit-learn
# releases no longer accept it. 'log2' is searched instead.
max_features = ['sqrt', 'log2']

# Maximum tree depth: 11 evenly spaced values from 100 to 500, plus None (no limit)
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)

# Parameter grid the search samples from
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

# Random search: 100 sampled candidates, each scored with 3-fold cross-validation,
# using all available cores; random_state fixes which candidates are sampled
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=1, random_state=42, n_jobs = -1)

# Fit the search on the training split only
rfc_random.fit(X_train, y_train)

# Print the best parameter combination found
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 25.9min finished


{'n_estimators': 600, 'max_features': 'sqrt', 'max_depth': 380}


In [0]:
# Predict the test values and download them as a CSV file

# Boolean prediction per test row (True = predicted label above 0.5)
Y_test = (rfc.predict(test) > 0.5)

# Build the result from plain arrays so each ID is paired with its prediction
# by position. Concatenating two DataFrames along axis=1 aligns them on index
# labels instead: `test_id` keeps the labels left after dropna(), while the
# predictions are numbered 0..n-1, so any dropped row would mispair IDs and
# predictions and introduce NaN rows.
result = pd.DataFrame({
    'ID': test_id.values,
    # False/True -> 0/1. A direct cast is used because re-fitting a label
    # encoder on the predictions would map an all-True result to 0.
    'Revenue': Y_test.astype(int),
})

# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if the consumer expects only the ID and Revenue columns.
result.to_csv('result.csv')
files.download("result.csv")