<a href="https://colab.research.google.com/github/19PA1A0201/AI-Lab/blob/master/EXP_9_Ensemble_based_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble based approaches

## 1) Import the packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

## 2) Import the dataset

In [None]:
# Load the student placement records; the CSV path is relative to the
# notebook's working directory.
dataset = pd.read_csv("students_placement_data.csv")

# Preview the first five rows to confirm the columns were parsed as expected.
dataset.head(n=5)

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,M,A,87.3,65.3,40.0,18,NO,Not placed
1,2,F,A,89.0,92.4,71.45,0,yes,Placed
2,3,F,A,67.0,68.0,45.26,13,yes,Not placed
3,4,M,A,71.0,70.4,36.47,17,yes,Not placed
4,5,M,A,,65.5,42.52,17,yes,Not placed


## 3) Data preprocessing 

### 3.1) Check for missing values and fill it

In [None]:
# One boolean per column: True when that column holds at least one missing value.
dataset.isnull().any(axis=0)

Roll No                               False
Gender                                False
Section                               False
SSC Percentage                         True
inter_Diploma_percentage              False
B.Tech_percentage                     False
Backlogs                              False
registered_for_ Placement_Training     True
placement status                      False
dtype: bool

We see that "SSC Percentage" and "registered_for_ Placement_Training" have missing values.

### 3.1.1) We replace the missing values in "SSC Percentage" with mean SSC percentage of all students.

In [None]:
# Impute missing SSC percentages with the column mean.
# Series.mean() skips NaN by default, and fillna() is the idiomatic way to
# substitute a value for missing entries (instead of replace(np.nan, ...)).
ssc_mean = dataset["SSC Percentage"].mean()
dataset["SSC Percentage"] = dataset["SSC Percentage"].fillna(ssc_mean)
dataset.head()

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,M,A,87.3,65.3,40.0,18,NO,Not placed
1,2,F,A,89.0,92.4,71.45,0,yes,Placed
2,3,F,A,67.0,68.0,45.26,13,yes,Not placed
3,4,M,A,71.0,70.4,36.47,17,yes,Not placed
4,5,M,A,80.474569,65.5,42.52,17,yes,Not placed


### 3.1.2) We replace missing values in "registered_for_ Placement_Training" with the most frequent value in "registered_for_ Placement_Training" column. 

In [None]:
# Impute missing registration flags with the most frequent value.
# mode() can return several values when there is a tie, so the first one is used.
most_common_registration = dataset["registered_for_ Placement_Training"].mode()[0]
dataset["registered_for_ Placement_Training"] = (
    dataset["registered_for_ Placement_Training"].fillna(most_common_registration)
)
dataset.head()

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,M,A,87.3,65.3,40.0,18,NO,Not placed
1,2,F,A,89.0,92.4,71.45,0,yes,Placed
2,3,F,A,67.0,68.0,45.26,13,yes,Not placed
3,4,M,A,71.0,70.4,36.47,17,yes,Not placed
4,5,M,A,80.474569,65.5,42.52,17,yes,Not placed


###  3.1.3) We check for missing values again and make sure that there are no missing values.

In [None]:
# Verify the imputation: stop right here if any column still contains missing
# values, then display the per-column flags exactly as before.
missing_by_column = dataset.isna().any()
assert not missing_by_column.any(), (
    f"Columns still containing missing values: "
    f"{list(missing_by_column[missing_by_column].index)}"
)
missing_by_column

Roll No                               False
Gender                                False
Section                               False
SSC Percentage                        False
inter_Diploma_percentage              False
B.Tech_percentage                     False
Backlogs                              False
registered_for_ Placement_Training    False
placement status                      False
dtype: bool

### 3.2) Convert the strings into numbers

###  Most classification algorithms cannot handle string data, so it is better to convert the strings into numbers.

In [None]:
# Encode each categorical column as integers.
# A bare Series.map() silently turns every value without a code into NaN, which
# is also what happens when this cell is executed a second time on data that is
# already encoded. Here unknown values raise immediately, and columns that
# already contain only the target codes are left untouched, so the cell is safe
# to re-run.
category_codes = {
    'Gender': {'M': 0, 'F': 1},
    'Section': {'A': 0, 'B': 1},
    'registered_for_ Placement_Training': {'NO': 0, 'yes': 1},
}
for column, codes in category_codes.items():
    if dataset[column].isin(list(codes.values())).all():
        continue  # already encoded by a previous run of this cell
    encoded = dataset[column].map(codes)
    if encoded.isna().any():
        unexpected = sorted(set(dataset.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f"Unexpected values in {column!r}: {unexpected}")
    dataset[column] = encoded

### Now have a look at our dataset again.

In [None]:
# Preview the first five rows after encoding the categorical columns.
dataset.head(n=5)

Unnamed: 0,Roll No,Gender,Section,SSC Percentage,inter_Diploma_percentage,B.Tech_percentage,Backlogs,registered_for_ Placement_Training,placement status
0,1,0,0,87.3,65.3,40.0,18,0,Not placed
1,2,1,0,89.0,92.4,71.45,0,1,Placed
2,3,1,0,67.0,68.0,45.26,13,1,Not placed
3,4,0,0,71.0,70.4,36.47,17,1,Not placed
4,5,0,0,80.474569,65.5,42.52,17,1,Not placed


## 4) Divide the data into features and labels.
We are considering __"Gender", "Section", "SSC Percentage", "inter_Diploma_percentage", "B.Tech_percentage", "Backlogs", "registered_for_ Placement_Training"__ as our features and __"placement status"__ as our label.

In [None]:
# Feature matrix: the seven predictor columns, extracted as a NumPy array.
feature_columns = [
    "Gender",
    "Section",
    "SSC Percentage",
    "inter_Diploma_percentage",
    "B.Tech_percentage",
    "Backlogs",
    "registered_for_ Placement_Training",
]
X = dataset[feature_columns].values

In [None]:
# Target vector: the placement outcome of every student, kept as a pandas Series.
y = dataset.loc[:, "placement status"]

## 5) Now split the data in training data and test data.
Note: the training data is used to build the model and the test data is used to evaluate it.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Now, Let's apply different models

#  <span style="color:red"> <center> 1) Random forest </center> </span>

In [None]:
# Create the random forest classifier: an ensemble of 100 decision trees.
# random_state is fixed so that a fresh "Restart & Run All" rebuilds the same
# forest and therefore yields the same predictions and metrics on every run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=10)

In [None]:
# Train the random forest on the training split only.
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier()

In [None]:
# Predict the placement status of the held-out test students.
y_pred_rf = rf_model.predict(X=X_test)

### Draw the confusion matrix of Random forest

In [None]:
# Rows are the actual classes and columns the predicted ones. Without an explicit
# `labels` argument, confusion_matrix orders the classes by sorted label value,
# which is what places "Not placed" before "Placed" and makes the hard-coded
# row/column names below line up.
# `metrics` is already imported in the notebook's import cell, so the function is
# reached through it instead of re-importing it in this cell.
rf_confusion = metrics.confusion_matrix(y_test, y_pred_rf)
pd.DataFrame(
    rf_confusion,
    index=['Not placed', 'placed'],
    columns=['Not placed Predicted', 'placed predicted'],
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,12,2
placed,3,7


In [None]:
# Accuracy of the random forest on the held-out test set.
# The predictions are stored in `y_pred_rf`; the previously used name
# `y_pred_rm` is never defined in this notebook and raises NameError on a
# fresh kernel.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_rf))

Accuracy: 0.8333333333333334


#  <span style="color:red"> <center> 2) Decision tree </center> </span>

In [None]:
# Single decision tree, to compare against the ensemble models.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ between runs.
model_DT = DecisionTreeClassifier(random_state=10)

# Build the model using the training data (fit returns the fitted estimator).
model_DT = model_DT.fit(X_train, y_train)

# Apply the model to the test data.
y_pred_DT = model_DT.predict(X_test)

### Draw the confusion matrix of Decision tree

In [None]:
# Rows are the actual classes and columns the predicted ones. Without an explicit
# `labels` argument, confusion_matrix orders the classes by sorted label value,
# which is what places "Not placed" before "Placed" and makes the hard-coded
# row/column names below line up.
# `metrics` is already imported in the notebook's import cell, so the function is
# reached through it instead of re-importing it in this cell.
dt_confusion = metrics.confusion_matrix(y_test, y_pred_DT)
pd.DataFrame(
    dt_confusion,
    index=['Not placed', 'placed'],
    columns=['Not placed Predicted', 'placed predicted'],
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,10,4
placed,4,6


In [None]:
# Fraction of test students whose placement status the decision tree predicted correctly.
dt_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_DT)
print("Accuracy:", dt_accuracy)

Accuracy: 0.6666666666666666


# Random forest is performing better than Decision tree

#  <span style="color:red"> <center> 3) AdaBoost </center> </span>

In [None]:
# AdaBoost ensemble of 50 weak classifiers.
# random_state is fixed so that a fresh "Restart & Run All" trains the same
# ensemble and therefore yields the same predictions and metrics on every run.
ada_model = AdaBoostClassifier(n_estimators=50, random_state=10)

In [None]:
# Train the AdaBoost ensemble on the training split only.
ada_model.fit(X=X_train, y=y_train)

AdaBoostClassifier()

In [None]:
# Predict the placement status of the held-out test students.
y_pred_ada = ada_model.predict(X=X_test)

### Draw the confusion matrix of Adaboost

In [None]:
# Rows are the actual classes and columns the predicted ones. Without an explicit
# `labels` argument, confusion_matrix orders the classes by sorted label value,
# which is what places "Not placed" before "Placed" and makes the hard-coded
# row/column names below line up.
# `metrics` is already imported in the notebook's import cell, so the function is
# reached through it instead of re-importing it in this cell.
ada_confusion = metrics.confusion_matrix(y_test, y_pred_ada)
pd.DataFrame(
    ada_confusion,
    index=['Not placed', 'placed'],
    columns=['Not placed Predicted', 'placed predicted'],
)

Unnamed: 0,Not placed Predicted,placed predicted
Not placed,12,2
placed,3,7


In [None]:
# Fraction of test students whose placement status AdaBoost predicted correctly.
ada_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_ada)
print("Accuracy:", ada_accuracy)

Accuracy: 0.7916666666666666


# Adaboost is performing better than Decision trees