# Instantiating and configuring our Ensemble Classifier

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Base estimators that the voting ensemble combines.
log_clf = LogisticRegression()
rnd_for_clf = RandomForestClassifier()
svm_clf = SVC()

# voting="hard": the ensemble predicts the class label chosen by the majority
# of the base estimators.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_for_clf), ('sc', svm_clf)],
    voting="hard",
)

from sklearn.metrics import accuracy_score

# Fit every model on the training split and report its accuracy on the test
# split. X_train / X_test / y_train / y_test come from the train-test-split and
# scaling cells further down in this notebook; run those first.
for clf in (log_clf, rnd_for_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)  # predictions must be computed before scoring
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [22]:
import pandas as pd
import numpy as np 
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
# Load the placement dataset from the CSV file in the working directory and
# display it as the cell output.
data_import = pd.read_csv(
    filepath_or_buffer="Placement_Data_Full_Class.csv",
)
data_import

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [24]:
# For every column print: name, dtype, and number of distinct values
# (a missing value counts as one distinct value, as with len(unique())).
for column, series in data_import.items():
    print(column, series.dtype, series.nunique(dropna=False))

sl_no int64 215
gender object 2
ssc_p float64 103
ssc_b object 2
hsc_p float64 97
hsc_b object 2
hsc_s object 3
degree_p float64 89
degree_t object 3
workex object 2
etest_p float64 100
specialisation object 2
mba_p float64 205
status object 2
salary float64 46


Data Cleansing:
As usual, we need to clean our data before we can use it in our machine learning algorithms.

1. Drop the columns that are not useful for prediction: sl_no (it is just the row number) and salary (we are only classifying whether the person was placed, so we are not going to use it).
2. Label Encode.
3. One Hot Encode.

Drop unimportant columns

In [25]:
# Remove the columns we do not model with:
#   - sl_no  : just the row number
#   - salary : we only classify whether the person has a job, so it is not used
# DataFrame.drop returns a new frame, so data_import itself is left untouched.
dropped_columns = data_import.drop(columns=["sl_no", "salary"])
dropped_columns

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed


Data encoding

In [26]:
from sklearn.preprocessing import LabelEncoder
# Categorical columns that get replaced by integer codes.
columns__to_label_encode = ["gender", "workex", "ssc_b", "hsc_b", "specialisation"]

le = LabelEncoder()

# Re-fit the same encoder on each listed column in turn and swap the column for
# its integer codes; assign() returns a new frame, so dropped_columns is not
# modified. After this statement `le` holds the fit of the last column only.
label_encoded_data = dropped_columns.assign(
    **{
        column_name: le.fit_transform(dropped_columns[column_name])
        for column_name in columns__to_label_encode
    }
)

label_encoded_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,67.00,1,91.00,1,Commerce,58.00,Sci&Tech,0,55.0,1,58.80,Placed
1,1,79.33,0,78.33,1,Science,77.48,Sci&Tech,1,86.5,0,66.28,Placed
2,1,65.00,0,68.00,0,Arts,64.00,Comm&Mgmt,0,75.0,0,57.80,Placed
3,1,56.00,0,52.00,0,Science,52.00,Sci&Tech,0,66.0,1,59.43,Not Placed
4,1,85.80,0,73.60,0,Commerce,73.30,Comm&Mgmt,0,96.8,0,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,1,82.00,1,Commerce,77.60,Comm&Mgmt,0,91.0,0,74.49,Placed
211,1,58.00,1,60.00,1,Science,72.00,Sci&Tech,0,74.0,0,53.62,Placed
212,1,67.00,1,67.00,1,Commerce,73.00,Comm&Mgmt,1,59.0,0,69.72,Placed
213,0,74.00,1,66.00,1,Commerce,58.00,Comm&Mgmt,0,70.0,1,60.23,Placed


In [27]:
# Keep the target column aside so it is not one-hot encoded together with the
# input features.
hot_encoded_data_y_placeholder = label_encoded_data["status"]

# pd.get_dummies one-hot encodes the remaining categorical columns (one binary
# column per category); the target is then appended back as the last column.
hot_encoded_data = pd.concat(
    [
        pd.get_dummies(label_encoded_data.drop(columns="status")),
        hot_encoded_data_y_placeholder,
    ],
    axis=1,
)
hot_encoded_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,status
0,1,67.00,1,91.00,1,58.00,0,55.0,1,58.80,False,True,False,False,False,True,Placed
1,1,79.33,0,78.33,1,77.48,1,86.5,0,66.28,False,False,True,False,False,True,Placed
2,1,65.00,0,68.00,0,64.00,0,75.0,0,57.80,True,False,False,True,False,False,Placed
3,1,56.00,0,52.00,0,52.00,0,66.0,1,59.43,False,False,True,False,False,True,Not Placed
4,1,85.80,0,73.60,0,73.30,0,96.8,0,55.50,False,True,False,True,False,False,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,1,82.00,1,77.60,0,91.0,0,74.49,False,True,False,True,False,False,Placed
211,1,58.00,1,60.00,1,72.00,0,74.0,0,53.62,False,False,True,False,False,True,Placed
212,1,67.00,1,67.00,1,73.00,1,59.0,0,69.72,False,True,False,True,False,False,Placed
213,0,74.00,1,66.00,1,58.00,0,70.0,1,60.23,False,True,False,True,False,False,Placed


In [28]:
from sklearn.model_selection import train_test_split

data = hot_encoded_data.copy()

X = data.iloc[:, :-1]  # features: every column except the last one
y = data.iloc[:, -1]   # target: the last column ("status")

# Split BEFORE scaling so the scaler is fitted on the training rows only
# (prevents data leakage from the test set).
# random_state pins the shuffle so the split, and every accuracy reported by
# later cells, is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
X_train

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech
23,0,77.4,1,60.00,1,64.74,1,92.0,0,63.62,False,False,True,False,False,True
176,0,59.0,0,60.00,1,56.00,0,55.0,1,57.90,False,True,False,True,False,False
98,0,69.0,0,73.00,0,65.00,0,70.0,0,57.31,False,True,False,True,False,False
139,1,77.0,0,70.00,0,59.00,1,58.0,0,54.43,False,True,False,True,False,False
158,1,67.0,1,63.00,1,64.00,0,60.0,0,61.87,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,1,63.0,1,67.00,1,66.00,0,68.0,1,57.69,False,True,False,True,False,False
145,1,89.4,1,65.66,1,71.25,0,72.0,1,63.23,False,False,True,False,False,True
29,1,62.0,0,67.00,0,58.00,0,77.0,0,51.29,False,True,False,True,False,False
89,0,84.0,1,75.00,1,69.00,1,62.0,1,62.36,False,False,True,False,False,True


Data scaling


In [29]:
from sklearn.preprocessing import StandardScaler

# Standardise the features.
#   fit()       : learns the parameters (mean and standard deviation) of each
#                 column, but does NOT transform the data.
#   transform() : standardises each feature (column) using those parameters.
# The scaler is fitted on the training split only; the test split is
# transformed with the same learned parameters.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


Train Test Split

Run ML Algorithms

In [30]:
#Instantiating and configuring our Ensemble Classifier

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# The three base estimators, all with default hyper-parameters.
log_clf, rnd_for_clf, svm_clf = LogisticRegression(), RandomForestClassifier(), SVC()

# Hard-voting ensemble over the three estimators above.
voting_clf = VotingClassifier(
    voting="hard",
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_for_clf),
        ('sc', svm_clf),
    ],
)

In [31]:
# Fit the ensemble classifier on the scaled training split.
voting_clf.fit(X=X_train, y=y_train)

In [32]:
from sklearn.metrics import accuracy_score

# Train each individual model and the ensemble on the same training split,
# then print the model's class name next to its accuracy on the test split.
for clf in (log_clf, rnd_for_clf, svm_clf, voting_clf):
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(type(clf).__name__, test_accuracy)

LogisticRegression 0.9154929577464789
RandomForestClassifier 0.9295774647887324
SVC 0.8873239436619719
VotingClassifier 0.9436619718309859


Bagging/pasting classifier code in the notes:

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble of 500 decision trees, each trained on 100 samples drawn from the
# training set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=1,
    oob_score=True)  # bootstrap determines whether this is bagging (True) or pasting (False)

# The ensemble must be fitted before predict() or any oob_* attribute is used;
# without this call they raise because the estimator is not fitted.
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

bag_clf.oob_score_  # out-of-bag score: an estimate of the expected test accuracy

bag_clf.oob_decision_function_  # out-of-bag class probabilities for every training instance


Random Forests

In [33]:
##create a random forest classifier and see the importance of each variable

from sklearn.ensemble import RandomForestClassifier

# Fit a 500-tree random forest using all available cores (n_jobs=-1).
rf_clf = RandomForestClassifier(n_jobs=-1, n_estimators=500).fit(X_train, y_train)

# X_train is a plain array after scaling, so the feature names are taken from
# the encoded frame (every column except the trailing target column).
feature_names = hot_encoded_data.columns[:-1]
for name, score in zip(feature_names, rf_clf.feature_importances_):
    print(name, ": ", score)

gender :  0.028801605904441394
ssc_p :  0.2910214726005819
ssc_b :  0.018528248625796905
hsc_p :  0.17051521266582778
hsc_b :  0.01135028134340808
degree_p :  0.16533939306371417
workex :  0.028802296386942032
etest_p :  0.08384359644564313
specialisation :  0.0329599815413682
mba_p :  0.10945660517421808
hsc_s_Arts :  0.0045081771304618665
hsc_s_Commerce :  0.014408292471526418
hsc_s_Science :  0.013203771031143941
degree_t_Comm&Mgmt :  0.01139404465246685
degree_t_Others :  0.006134431490311571
degree_t_Sci&Tech :  0.009732589472147604


AdaBoosting

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over 200 decision stumps (max_depth=1) with a learning rate of 0.5.
# The `algorithm` argument is intentionally omitted: "SAMME.R" is deprecated in
# recent scikit-learn releases and rejected by the newest ones, while the
# library default works on every version.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

Gradient Boosting Classifier

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted ensemble of 120 shallow trees (depth 2), fitted on the
# training split.
gbrt = GradientBoostingClassifier(n_estimators=120, max_depth=2).fit(X_train, y_train)

# Accuracy on the held-out test split, shown as the cell output.
y_pred = gbrt.predict(X_test)
accuracy_score(y_test, y_pred)

0.9295774647887324