## Random Forest 
<br/>
The Random Forest algorithm is another popular machine learning technique, used for both regression and classification. Why is it called a random forest? A forest is made of trees, and in the machine learning world a tree means a decision tree.

First of all, Random Forest (RF) and Neural Network (NN) are different types of algorithms. An RF is an ensemble of decision trees. Each decision tree in the ensemble processes the sample and predicts the output label (in the case of classification). The decision trees in the ensemble are independent: each one can predict the final response on its own. An NN is a network of connected neurons. The neurons cannot operate without the other neurons; they are connected. Usually they are grouped in layers, and each layer processes the data and passes it forward to the next layer. The last layer of neurons makes the decision.
Ref: https://jpt.spe.org/random-forests-vs-neural-networks-which-better-and-when?gclid=Cj0KCQiAkNiMBhCxARIsAIDDKNXZb6d9QmAxZP3w7jyd7BHsmb8lFWzPEEGM1AghoH1qLoJD6ujqOtUaAuf6EALw_wcB

In [1]:
# Libraries
import pandas as pd
import numpy as np
from numpy.random import seed
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
#
import warnings
warnings.filterwarnings('ignore')

### Obtain Data
We load the data from the CSV files and put it into DataFrames.

In [2]:
# Seed NumPy's global random generator with 10 so that the experiment can be
# replicated with the same random values
seed(10)

# Loaded Data from CSV
def read_data_set(name):
    """Load a CSV file and return it without its first row.

    The file is parsed with ``header=None``, so the columns are labelled with
    integers and the header line arrives as an ordinary first row. That row
    is then sliced away, which leaves the remaining rows indexed from 1.
    """
    raw = pd.read_csv(name, header=None)
    # Skip row 0, which holds the header text
    return raw.iloc[1:, :]

# Locations of the assignment CSV and the test CSV
csv_file = '../assignment_ds.csv'
csv_file_test = '../test_ds.csv'

# Read both files through the same loader
df, test_df = (read_data_set(path) for path in (csv_file, csv_file_test))

# Preview the first 10 rows
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6
1,3453,0.000621,0.0773,17.08,Red,M,0
2,12098,689.0,7.01,0.02,Blue-white,A,3
3,2731,0.000437,0.0856,18.09,Red,M,0
4,37800,202900.0,6.86,-4.56,Blue,O,3
5,8570,0.00081,0.0097,14.2,Blue white,A,2
6,3605,126000.0,1124.0,-10.81,Red,M,5
7,8052,8.7,1.8,2.42,Whitish,A,3
8,3574,200000.0,89.0,-5.24,Red,M,4
9,8829,537493.0,1423.0,-10.73,White,A,5
10,3625,74000.0,876.0,-10.25,Red,M,5


### Merge DFs
We combine the assignment and test DataFrames into a single one.

In [3]:
# Stack the two frames vertically (test rows go below the assignment rows)
# and renumber the rows from 0 in the same step
extra_df = pd.concat([df, test_df], ignore_index=True)
extra_df

Unnamed: 0,0,1,2,3,4,5,6
0,3453,0.000621,0.0773,17.08,Red,M,0
1,12098,689,7.01,0.02,Blue-white,A,3
2,2731,0.000437,0.0856,18.09,Red,M,0
3,37800,202900,6.86,-4.56,Blue,O,3
4,8570,0.00081,0.0097,14.2,Blue white,A,2
...,...,...,...,...,...,...,...
234,2935,0.00014,0.116,18.89,Red,M,
235,3340,0.0038,0.24,13.07,Red,M,
236,23095,347820,86,-5.905,Blue,O,
237,3225,0.00076,0.121,19.63,Red,M,


### Clean Data 

In [4]:
# Convert the color column (column 4) to lower case
def to_lower_case(datafra):
    """Return column 4 of ``datafra`` with every string value lower-cased.

    Values that are not strings (for example missing values) are passed
    through unchanged. The input frame is not modified; the caller assigns
    the returned Series back.
    """
    # isinstance() also accepts str subclasses, which an exact type
    # comparison would leave untouched
    return datafra[4].map(lambda x: x.lower() if isinstance(x, str) else x)

# Overwrite column 4 of the merged frame with its lower-cased version
extra_df[4] = to_lower_case(extra_df)

def change_colors(datafr):
    """Rewrite the hyphenated color names in column 4 with a space instead.

    The frame is modified in place and nothing is returned.
    """
    spaced_spelling = {
        'blue-white': 'blue white',
        'orange-red': 'orange red',
        'yellow-white': 'yellow white',
        'white-yellow': 'white yellow',
    }
    datafr[4] = datafr[4].replace(spaced_spelling)
    
# Fix the color names that were written with a hyphen (extra_df is changed in place)
change_colors(extra_df)

# Bare expression: shows the cleaned frame as the cell output
extra_df

Unnamed: 0,0,1,2,3,4,5,6
0,3453,0.000621,0.0773,17.08,red,M,0
1,12098,689,7.01,0.02,blue white,A,3
2,2731,0.000437,0.0856,18.09,red,M,0
3,37800,202900,6.86,-4.56,blue,O,3
4,8570,0.00081,0.0097,14.2,blue white,A,2
...,...,...,...,...,...,...,...
234,2935,0.00014,0.116,18.89,red,M,
235,3340,0.0038,0.24,13.07,red,M,
236,23095,347820,86,-5.905,blue,O,
237,3225,0.00076,0.121,19.63,red,M,


### Normalize Data

In [5]:
def normalize(datafr):
    """Min-max scale columns 0-3 of ``datafr`` and return them as a new frame.

    The scaler is fitted on the rows it is given. The result is built from a
    bare array, so it gets a fresh default index and default column labels.
    """
    numeric_values = datafr[[0, 1, 2, 3]].values
    scaled_values = MinMaxScaler().fit_transform(numeric_values)
    return pd.DataFrame(scaled_values)

# Scaled version of the four numeric columns of the merged frame
inputs_numbers=normalize(extra_df)

inputs_numbers

Unnamed: 0,0,1,2,3
0,0.039778,6.369052e-10,3.536069e-05,0.906817
1,0.266914,8.111416e-04,3.593344e-03,0.373358
2,0.020809,4.202868e-10,3.962039e-05,0.938399
3,0.942198,2.388689e-01,3.516361e-03,0.230144
4,0.174220,8.594100e-10,6.671828e-07,0.816760
...,...,...,...,...
234,0.026169,7.063643e-11,5.522220e-05,0.963415
235,0.036809,4.379459e-09,1.188612e-04,0.781426
236,0.555845,4.094794e-01,4.413239e-02,0.188086
237,0.033788,8.005463e-10,5.778829e-05,0.986554


### Make One Hot Encoding - Colors & Type Spectrum

In [6]:
def one_hot_encode(dataframe):
    """One-hot encode the color (4) and spectrum-type (5) columns.

    Returns a pair ``(colors, spectrum)`` of indicator frames produced by
    ``pd.get_dummies``, each with its index reset to start at 0.
    """
    colors, spectrum = (
        pd.get_dummies(dataframe[[column]]).reset_index(drop=True)
        for column in (4, 5)
    )
    return colors, spectrum

# Encode the color and spectrum columns of the merged frame
one_hot_colors,one_hot_spectrum=one_hot_encode(extra_df)
one_hot_spectrum

Unnamed: 0,5_A,5_B,5_F,5_G,5_K,5_M,5_O
0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
234,0,0,0,0,0,1,0
235,0,0,0,0,0,1,0
236,0,0,0,0,0,0,1
237,0,0,0,0,0,1,0


### Target to One Hot Encode

In [7]:
# One-hot encode the target classification (column 6 of the training frame)
# Integer encode first; the labels are passed as a single 1-D column
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df[6])

# Binary encode. Newer scikit-learn releases name the dense-output switch
# `sparse_output`; older ones only know `sparse`, so fall back to the old
# keyword when the new one is rejected
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
target = pd.DataFrame(onehot_encoded)

# Name the columns target1, target2, ... so they can be added to the data frame
target.columns = ['target{}'.format(position + 1) for position in range(target.shape[1])]
target

Unnamed: 0,target1,target2,target3,target4,target5,target6
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
195,0.0,0.0,1.0,0.0,0.0,0.0
196,0.0,0.0,1.0,0.0,0.0,0.0
197,0.0,0.0,0.0,1.0,0.0,0.0
198,0.0,0.0,0.0,1.0,0.0,0.0


### Move target column to the end and Join the rest

In [8]:
# Put the scaled numbers, the two one-hot blocks and the target columns side
# by side to get one frame with all the clean data
new_df = pd.concat(
    [inputs_numbers, one_hot_colors, one_hot_spectrum, target],
    axis='columns',
)

new_df

Unnamed: 0,0,1,2,3,4_blue,4_blue white,4_orange,4_orange red,4_pale yellow orange,4_red,...,5_G,5_K,5_M,5_O,target1,target2,target3,target4,target5,target6
0,0.039778,6.369052e-10,3.536069e-05,0.906817,0,0,0,0,0,1,...,0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.266914,8.111416e-04,3.593344e-03,0.373358,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.020809,4.202868e-10,3.962039e-05,0.938399,0,0,0,0,0,1,...,0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.942198,2.388689e-01,3.516361e-03,0.230144,1,0,0,0,0,0,...,0,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0
4,0.174220,8.594100e-10,6.671828e-07,0.816760,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,0.026169,7.063643e-11,5.522220e-05,0.963415,0,0,0,0,0,1,...,0,0,1,0,,,,,,
235,0.036809,4.379459e-09,1.188612e-04,0.781426,0,0,0,0,0,1,...,0,0,1,0,,,,,,
236,0.555845,4.094794e-01,4.413239e-02,0.188086,1,0,0,0,0,0,...,0,0,0,1,,,,,,
237,0.033788,8.005463e-10,5.778829e-05,0.986554,0,0,0,0,0,1,...,0,0,1,0,,,,,,


### Rename the columns with their index

In [9]:
#Here we replace the name of each one-hot column with the number of its column
def rename_columns(datafram):
    """Return a copy of ``datafram`` whose one-hot columns carry numbers.

    The color and spectrum indicator columns are renamed to the consecutive
    integers 4 to 22, in the order listed below. Columns that are not listed
    keep their name, and the input frame is left untouched.
    """
    one_hot_names = [
        '4_blue',
        '4_blue white',
        '4_orange',
        '4_orange red',
        '4_pale yellow orange',
        '4_red',
        '4_white',
        '4_white yellow',
        '4_whitish',
        '4_yellow white',
        '4_yellowish',
        '4_yellowish white',
        '5_A',
        '5_B',
        '5_F',
        '5_G',
        '5_K',
        '5_M',
        '5_O',
    ]
    # Numbering starts at 4 because columns 0-3 are the scaled numeric inputs
    numbered = {label: number for number, label in enumerate(one_hot_names, start=4)}
    return datafram.rename(columns=numbered, inplace=False)

# NOTE: this rebinds df, which until now held the raw assignment data
df=rename_columns(new_df)
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,target1,target2,target3,target4,target5,target6
0,0.039778,6.369052e-10,3.536069e-05,0.906817,0,0,0,0,0,1,...,0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.266914,8.111416e-04,3.593344e-03,0.373358,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.020809,4.202868e-10,3.962039e-05,0.938399,0,0,0,0,0,1,...,0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.942198,2.388689e-01,3.516361e-03,0.230144,1,0,0,0,0,0,...,0,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0
4,0.174220,8.594100e-10,6.671828e-07,0.816760,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,0.026169,7.063643e-11,5.522220e-05,0.963415,0,0,0,0,0,1,...,0,0,1,0,,,,,,
235,0.036809,4.379459e-09,1.188612e-04,0.781426,0,0,0,0,0,1,...,0,0,1,0,,,,,,
236,0.555845,4.094794e-01,4.413239e-02,0.188086,1,0,0,0,0,0,...,0,0,0,1,,,,,,
237,0.033788,8.005463e-10,5.778829e-05,0.986554,0,0,0,0,0,1,...,0,0,1,0,,,,,,


In [10]:
# The first 200 rows form the training part, everything after is the test part
# NOTE(review): the 200 is hard-coded, presumably the number of rows in the
# assignment CSV; confirm it if the data changes
df_train = df[:200]
# The test part is renumbered because its index has to start at 0
df_test = df[200:].reset_index(drop=True)
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,target1,target2,target3,target4,target5,target6
0,0.039778,6.369052e-10,3.536069e-05,0.906817,0,0,0,0,0,1,...,0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.266914,8.111416e-04,3.593344e-03,0.373358,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.020809,4.202868e-10,3.962039e-05,0.938399,0,0,0,0,0,1,...,0,0,1,0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.942198,2.388689e-01,3.516361e-03,0.230144,1,0,0,0,0,0,...,0,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0
4,0.174220,8.594100e-10,6.671828e-07,0.816760,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.301647,6.004097e-10,7.236367e-07,0.800188,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
196,0.470849,1.200819e-09,2.412122e-06,0.727330,1,0,0,0,0,0,...,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0
197,0.291217,1.174919e-03,3.182770e-03,0.329581,0,1,0,0,0,0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
198,0.973726,2.401639e-01,5.435795e-03,0.225766,1,0,0,0,0,0,...,0,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0


### Separate target columns and data

In [11]:
# Names of the one-hot target columns
target_columns = ['target1', 'target2', 'target3', 'target4', 'target5', 'target6']

# X: the 'data' columns 0 to 22 of the training rows
X = df_train.drop(columns=target_columns)

# y: the target columns of the training rows
y = pd.DataFrame(df_train, columns=target_columns)

# Inputs of the rows that have to be predicted
inputs_test = df_test.drop(columns=target_columns)

In [12]:
# Use 'train_test_split' to hold out part of the training rows for scoring
# test_size=0.2 : means 20% of the sample is test data and 80% is training data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

### Now Use Random Forest Classifier to Train the Model

In [13]:
# Create the model with a random forest classifier made of 30 trees
# and fit it on the training split
model = RandomForestClassifier(n_estimators=30)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Display The Score 

In [14]:
# Score the fitted model on the held-out split
model.score(X_test, y_test)

0.975

### Print Results 

In [15]:
# Predict the one-hot target rows for the test inputs
predictions_targets = model.predict(inputs_test)

# Undo the one-hot encoding: take the position of the largest value in each
# row and map it back to the original label
# NOTE(review): argmax returns 0 for a row that contains no 1, so such a row
# would silently map to the first class; confirm this cannot happen
predicted_RF = label_encoder.inverse_transform(predictions_targets.argmax(1))
predicted_RF

# Wrap the predictions in a DataFrame (this rebinds df)
df = pd.DataFrame({'Predicted RF':predicted_RF})
df

Unnamed: 0,Predicted RF
0,2
1,0
2,3
3,1
4,1
5,1
6,3
7,5
8,5
9,2
