# Random Forest (Classification Tree technique)

In this notebook we will focus on performing a Supervised Learning algorithm known as "Random Forest" to build a predictive model.

There are a few steps to be followed to train a model for testing:

1) Decide on dataset to be used for training and read as a dataframe <br>
2) Pre-process dataset <br>
3) Dataset partitioning into training, test and validation datasets <br>
4) Train predictive model using Random Forest classifier <br>
5) Do prediction on test dataset <br>
6) Perform evaluation techniques (i.e. Confusion Matrix, F1 score etc) <br>
7) Make improvements

Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also make sure this jupyter notebook file is opened using the following command:

```jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000```

In [15]:
#Importing of general libraries needed

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import gensim, datetime, time
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import *

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

def printModelAccuracy(y_test, y_pred):
    # Find the confusion matrix of the result
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), \
        index=['true:0', 'true:1', 'true:2', 'true:3', 'true:4', 'true:5', 'true:6', 'true:7', 'true:8', 'true:9', 'true:10', 'true:11'], 
        columns=['pred:0', 'pred:1', 'pred:2', 'pred:3', 'pred:4', 'pred:5', 'pred:6', 'pred:7', 'pred:8', 'pred:9', 'pred:10', 'pred:11'])
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)

In [16]:
#Read & import dataset as dataframe
dataset_df = pd.read_csv('train.csv', header=0)

#Display first 5 rows
dataset_df.head(5)

Unnamed: 0,itemid,title,Category,image_path
0,307504,nyx sex bomb pallete natural palette,0,beauty_image/6b2e9cbb279ac95703348368aa65da09.jpg
1,461203,etude house precious mineral any cushion pearl...,1,beauty_image/20450222d857c9571ba8fa23bdedc8c9.jpg
2,3592295,milani rose powder blush,2,beauty_image/6a5962bed605a3dd6604ca3a4278a4f9.jpg
3,4460167,etude house baby sweet sugar powder,3,beauty_image/56987ae186e8a8e71fcc5a261ca485da.jpg
4,5853995,bedak revlon color stay aqua mineral make up,3,beauty_image/9c6968066ebab57588c2f757a240d8b9.jpg


In [17]:
#Data pre-processing
#Takes around 4 minutes to run this block of code
corpus = dataset_df["title"]
labels = dataset_df["Category"]

start = time.time()
stop_list = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
def corpus2docs(corpus):
    # corpus is a object returned by load_corpus that represents a corpus.
    docs1 = []

    for title in corpus:
        doc = nltk.word_tokenize(title)
        docs1.append(doc)
    docs2 = [[w.lower() for w in doc] for doc in docs1] #lower case the words
    docs3 = [[w for w in doc if re.search('^[a-z]+$', w)] for doc in docs2] #removing special characters and numbers
    docs4 = [[w for w in doc if w not in stop_list] for doc in docs3] #removing words in stop list
    #changing list into a string
    docs5 = [' '.join([stemmer.stem(w) for w in doc]) for doc in docs4] #changing the words into its root form
    
    return docs5

docs = corpus2docs(corpus)
end = time. time()
print(end - start)
print(docs[0:10])

210.69733667373657
['nyx sex bomb pallet natur palett', 'etud hous preciou miner cushion pearl aura puff', 'milani rose powder blush', 'etud hous babi sweet sugar powder', 'bedak revlon color stay aqua miner make', 'dr pure whiten cream', 'chanel powder blush malic', 'snail white cream origin', 'sunpris proof spf', 'eyebrow powder nyx satuan rp pc']


In [18]:
#Data pre-processing - vectorise and TF-IDF
start = time.time()

def convertToDataframe(listofwords, labels):
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    tablecolumns = []                      
    tablecolumns.append(vectorizer.get_feature_names()) #adding column headers

    df = pd.DataFrame(words_tfidf.toarray(), columns=tablecolumns) #creating dataframe

    df['Category'] = labels
                      
    return df


df = convertToDataframe(docs[0:50000], labels.values.tolist()[0:50000]) #limit for 50k rows
print(df.head(10))

end = time. time()
print(end - start)

    aa  aaa  aag  aaw   ab abadi abal  abc  abh abil   ...      zs  zsc   zt  \
0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
5  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
6  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
7  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
8  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   
9  0.0  0.0  0.0  0.0  0.0   0.0  0.0  0.0  0.0  0.0   ...     0.0  0.0  0.0   

   ztv  zvf zwain zwgb zwitsal zxcv Category  
0  0.0  0.0   0.0  0.0     0.0  0.0        0  
1  0.0  0.0   0.0  0.0   

In [19]:
#Data partitioning into train, test and validation set
X = df.loc[:, df.columns != 'Category'] #take everything except Category
y = df[['Category']] #our label is Category

#Split dataset into two groups - the training set and the test set
#Test size set to 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [21]:
#Build, train and test a model using Random Forest classifier

#Instantiate model with 500 decision trees
rf = RandomForestClassifier(n_estimators = 500, random_state = 0, max_depth = 2)

#Fit the training feature Xs and training label Ys
rf.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = rf.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0        0       0       0     180       0       0       0       0   
true:1        0       0       0    1051       0       0       0       0   
true:2        0       0       0     448       0       0       0       0   
true:3        0       0       0    3158       0       0       0       0   
true:4        0       0       0    1654       0       0       0       0   
true:5        0       0       0    2150       0       0       0       0   
true:6        0       0       0      82       0       0       0       0   
true:7        0       0       0     464       0       0       0       0   
true:8        0       0       0     239       0       0       0       0   
true:9        0       0       0     361       0       0       0       0   
true:10       0       0       0      47       0       0       0       0   
true:11       0       0       0     166       0       0       0       0   

      

  'precision', 'predicted', average, warn_for)


In [None]:
# Alternative method:
# Find the confusion matrix of the result
#cm = confusion_matrix(y_test, y_pred)
#print(cm)

# Find the accuracy and F1 score of the result
#asr = accuracy_score(y_test, y_pred)
#f1 = f1_score(y_test, y_pred)
#print("Accuracy:", asr)
#print("F1:", f1)