# Random Forest Classification

## Importing the libraries

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [78]:
dataset = pd.read_csv('../data/processed/final_data.csv')
X = dataset.iloc[:, 2:-1].values    # Exclude the first two columns user_id and product_id
y = dataset.iloc[:, -1].values

## Encoding categorical data
### One Hot Encoding

In [79]:
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 1, 2, 3])], remainder='passthrough'
)
X = np.array(ct.fit_transform(X).toarray())

## Splitting the dataset into the Training set and Test set

In [80]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 0
)

In [81]:
print(X_train)

[[ 0.   0.   0.  ... 58.   3.4  3. ]
 [ 0.   0.   0.  ... 44.   2.9  5. ]
 [ 0.   0.   0.  ... 44.   2.9  1. ]
 ...
 [ 0.   0.   0.  ... 44.   2.9  1. ]
 [ 0.   0.   0.  ... 44.   2.9  1. ]
 [ 0.   0.   0.  ... 20.   2.3  3. ]]


In [82]:
print(y_train)

[0 0 0 ... 0 0 0]


In [83]:
print(X_test)

[[ 0.   0.   0.  ... 44.   2.9  1. ]
 [ 0.   0.   0.  ... 54.   2.2  3. ]
 [ 0.   0.   0.  ... 44.   2.9  3. ]
 ...
 [ 0.   0.   0.  ... 37.   4.1  3. ]
 [ 0.   0.   0.  ... 61.   3.   3. ]
 [ 0.   0.   0.  ... 44.   2.9  3. ]]


In [84]:
print(y_test)

[0 1 0 ... 0 0 0]


# Feature Scaling

In [85]:
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

## Training the Random Forest Classification model on the Training set

n_estimators : int ( means number of trees in the forest)
random_state : int ( means the seed used by the random number generator)

In [86]:
classifier = RandomForestClassifier(
    n_estimators=150,   # Number of trees in the forest
    max_depth=10,       # Maximum depth of the tree
    min_samples_split=2,    # The minimum number of samples required to split an internal node
    min_samples_leaf=1,     # The minimum number of samples required to be at a leaf node
    bootstrap=True,     # Whether bootstrap samples are used when building trees
    criterion='entropy',    # The function to measure the quality of a split
    random_state=0      # The seed used by the random number generator
)
classifier.fit(X_train, y_train)    # Fit the model 

## Predicting the Test set results

In [87]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[0 0]
 [0 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [91]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1392    0]
 [ 355    0]]


0.7967945048654836

## Applying k-Fold Cross Validation