# Random Forest (Classification Tree technique)

In this notebook we use a supervised learning algorithm known as "Random Forest" to build a predictive model.

The model is trained and evaluated in the following steps:

1) Decide on dataset to be used for training and read as a dataframe <br>
2) Pre-process dataset <br>
3) Partition the dataset into training, test and validation sets <br>
4) Train predictive model using Random Forest classifier <br>
5) Do prediction on test dataset <br>
6) Evaluate the predictions (e.g. confusion matrix, accuracy, F1 score) <br>
7) Make improvements

Note: Before closing, go to Cell > All Output > Clear to keep file size small.

Also make sure this jupyter notebook file is opened using the following command:

```jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000```

In [1]:
import re, io, gensim, datetime, time, nltk, random, pickle
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer

#General libraries needed
import numpy as np
import pandas as pd

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

# Row ("true:<id>") and column ("pred:<id>") captions for the confusion
# matrix, one per category id from 0 to 57.
goldtruth = ['true:' + str(class_id) for class_id in range(58)]
prediction = ['pred:' + str(class_id) for class_id in range(58)]

# The original cell left the last prefix bound to this name; keep it so any
# later cell that happens to read `string` still sees the same value.
string = 'pred:'


def printModelAccuracy(y_test, y_pred, labels=None, csv_path="confuseRandomForest.csv"):
    """Print and save the confusion matrix, then print accuracy and macro F1.

    Parameters
    ----------
    y_test : array-like
        Gold-truth category ids.
    y_pred : array-like
        Predicted category ids, aligned with ``y_test``.
    labels : sequence, optional
        Category ids that define the row/column order of the confusion
        matrix. Defaults to the ids 0..57, which is what this function
        used before the parameter existed.
    csv_path : str, optional
        File the confusion matrix is written to as CSV. Defaults to
        ``"confuseRandomForest.csv"``.

    Returns
    -------
    None
        Everything is reported through ``print`` and the CSV file.
    """
    if labels is None:
        labels = range(0, 58)
    labels = list(labels)

    # Rows are the true classes, columns the predicted ones; the captions are
    # derived from `labels` so the table always matches the matrix shape.
    cm = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=labels),
        index=['true:' + str(label) for label in labels],
        columns=['pred:' + str(label) for label in labels])
    print("Confusion Matrix:")
    print(cm)

    cm.to_csv(csv_path)

    # Overall accuracy plus the unweighted (macro) mean of per-class F1.
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)



In [2]:
# Data pre-processing: load the raw titles together with their category labels.
# (Original author's note: takes around 2 minutes.)
all_df = pd.read_csv("train2.csv", header=0)
corpus, labels = all_df["title"], all_df["Category"]

# The timer starts after the CSV has been read, so the figure printed later
# covers loading the NLP resources and cleaning the titles.
start = time.time()

# Shared by corpus2docs: a Porter stemmer and the English stop-word list.
stemmer = nltk.stem.porter.PorterStemmer()
stop_list = nltk.corpus.stopwords.words('english')
def corpus2docs(corpus):
    """Turn an iterable of raw titles into cleaned, stemmed strings.

    Each title is tokenized, lower-cased, stripped of every token that is not
    made purely of the letters a-z, stripped of stop words (module-level
    ``stop_list``), stemmed with the module-level ``stemmer`` and re-joined
    with single spaces.

    Parameters
    ----------
    corpus : iterable
        Raw titles. Entries that are not strings (for example the NaN that
        pandas uses for an empty cell) produce an empty document instead of
        raising, so the result stays aligned one-to-one with the input.

    Returns
    -------
    list of str
        One cleaned document per input title, in the same order.
    """
    stop_words = set(stop_list)  # set membership instead of scanning the list per token
    is_plain_word = re.compile('^[a-z]+$').search  # compiled once, reused for every token

    cleaned = []
    for title in corpus:
        if not isinstance(title, str):
            # Nothing to tokenize; keep the slot so rows still match labels.
            cleaned.append('')
            continue

        lowered = (token.lower() for token in nltk.word_tokenize(title))
        # Same order as before: drop non-alphabetic tokens, drop stop words,
        # then reduce what is left to its root form.
        stems = [stemmer.stem(token) for token in lowered
                 if is_plain_word(token) and token not in stop_words]
        cleaned.append(' '.join(stems))

    return cleaned

# Clean every title, then report the elapsed seconds and a small sample.
docs = corpus2docs(corpus)
end = time.time()
print(end - start)
print(docs[:10])

30.948242664337158
['iphon gold', 'cash back promo akhir thn beli grati samsung galaxi pul set wa', 'khusu hari ini garansi resmi handphon oppo selfi expert terbaru', 'big promo cuci gudang dijual xiaomi redmi mi black garansi resmi tam murah', 'vivo', 'xiaomi mi', 'cuci gudang macbook air earli wa', 'samsung ace terbaru', 'wa ke vivo second', 'promo vivo v origin resmi']


In [3]:
#Data pre-processing - vectorise + TFIDF words

start = time.time()

def convertToDataframe(listofwords, labels):
    """Build a dense TF-IDF table with one row per document plus its label.

    Parameters
    ----------
    listofwords : iterable of str
        Cleaned documents; each one becomes a row.
    labels : list-like
        One category per document, in the same order as ``listofwords``.

    Returns
    -------
    pandas.DataFrame
        One column per vocabulary term holding its TF-IDF weight, followed by
        a ``Category`` column holding ``labels``. Columns form a flat index.
    """
    vectorizer = TfidfVectorizer(analyzer='word') #tfidf
    words_tfidf = vectorizer.fit_transform(listofwords) #tfidf

    # Newer scikit-learn releases replace get_feature_names() with
    # get_feature_names_out(); use whichever the installed version offers.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    # Pass the names as a flat list: wrapping them in another list makes
    # pandas build MultiIndex columns instead of plain column names.
    # Note that toarray() materialises the full dense matrix in memory.
    df = pd.DataFrame(words_tfidf.toarray(), columns=feature_names) #creating dataframe

    df['Category'] = labels

    return df


# Build the TF-IDF table (term weights + Category) and preview its first rows.
df = convertToDataframe(docs, labels.values.tolist())
print(df.head(10))

# Elapsed seconds since `start` was set at the top of this cell.
end = time.time()
print(end - start)

# Release the cleaned titles to free memory. After this line `docs` is an
# empty string, so re-running this cell requires rebuilding `docs` first.
docs = ""

    aa  aaa  aad  aak  aap  aaw   ab ababa abadi abang   ...     zvr zwain  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
5  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
6  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
7  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
8  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   
9  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0   ...     0.0   0.0   

  zwetta zyfpg  zyh zyrex   zz zzsykd zztim Category  
0    0.0   0.0  0.0   0.0  0.0    0.0   0.0       31  
1    0.0   0.0  0.0   0.0  0.0 

In [4]:
# Data partitioning: this cell creates a training set and a test set
# (no separate validation split is made here).

# Features: every column except the Category label.
X = df.loc[:, df.columns != 'Category']
# Label: Category, kept as a one-column frame.
y = df[['Category']]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

In [5]:
# Build, train and test a model using the Random Forest classifier.
# (Original author's note: takes around 2 mins.)

# 500 trees, each limited to depth 2, with a fixed seed for repeatability.
rf = RandomForestClassifier(n_estimators=500, max_depth=2, random_state=0)

# Train on the training features; the one-column label frame is flattened
# to a 1-D array first.
rf.fit(X_train, y_train.values.ravel())

# Predict the category of every held-out row.
# NOTE(review): a depth of 2 allows at most four leaves per tree, and the
# saved output shows every visible row predicted as class 3 - confirm whether
# this baseline is intentionally weak before relying on its scores.
y_pred = rf.predict(X_test)

# Report the confusion matrix, accuracy and F1 score for the predictions.
printModelAccuracy(y_test, y_pred)

Confusion Matrix:
         pred:0  pred:1  pred:2  pred:3  pred:4  pred:5  pred:6  pred:7  \
true:0        0       0       0      58       0       0       0       0   
true:1        0       0       0     426       0       0       0       0   
true:2        0       0       0     156       0       0       0       0   
true:3        0       0       0    1093       0       0       0       0   
true:4        0       0       0     619       0       0       0       0   
true:5        0       0       0     747       0       0       0       0   
true:6        0       0       0      40       0       0       0       0   
true:7        0       0       0     162       0       0       0       0   
true:8        0       0       0      72       0       0       0       0   
true:9        0       0       0     105       0       0       0       0   
true:10       0       0       0      19       0       0       0       0   
true:11       0       0       0      59       0       0       0       0   
true:12

  'precision', 'predicted', average, warn_for)
