# Case Study 02 on Ensemble Learning

Create an ensemble learning based machine learning model to classify people based on their
salary

In [45]:
#importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [46]:
import sys
sys.version

'3.7.14 (default, Sep  8 2022, 00:06:44) \n[GCC 7.5.0]'

In [47]:
#reading data
df = pd.read_csv('/content/salary.csv')

In [48]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Preprocessing Data

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [50]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [51]:
#checking number of total zeroes in columns
for column_name in df.columns:
    count = (df[column_name] == 0).sum()
    print(column_name,count)

age 0
workclass 0
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
capital-gain 29849
capital-loss 31042
hours-per-week 0
native-country 0
salary 0


In [52]:
#unique values in each columns
print(df['workclass'].unique())
print(df['education'].unique())
print(df['marital-status'].unique())
print(df['occupation'].unique())
print(df['race'].unique())
print(df['sex'].unique())
print(df['native-country'].unique())

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' E

In [53]:
#finding the columns having ' ?'
for column_name in df.columns:
    count = (df[column_name] == ' ?').sum()
    print(column_name,count)

age 0
workclass 1836
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 1843
relationship 0
race 0
sex 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 583
salary 0


In [54]:
#droping columns with ' ?'
df = df.drop(index = df.loc[df['occupation'] == ' ?'].index)

In [55]:
for column_name in df.columns:
    count = (df[column_name] == ' ?').sum()
    print(column_name,count)

age 0
workclass 0
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 556
salary 0


In [56]:
df = df.drop(index = df.loc[df['native-country'] == ' ?'].index)

In [57]:
for column_name in df.columns:
    count = (df[column_name] == ' ?').sum()
    print(column_name,count)

age 0
workclass 0
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 0
salary 0


In [58]:
#Encoding using Onehot

df = pd.get_dummies(data = df,columns = ['sex'])

In [59]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,salary,sex_ Female,sex_ Male
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,<=50K,0,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,<=50K,0,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States,<=50K,0,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States,<=50K,0,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba,<=50K,1,0


In [40]:
df.salary.unique()

array([' <=50K', ' >50K'], dtype=object)

In [60]:
#Applying Label Encoding

from sklearn.preprocessing import LabelEncoder

df[['workclass','education',
    'marital-status','occupation',
   'relationship','race',
    'native-country','salary']] = df[['workclass','education',
                             'marital-status','occupation',
                             'relationship','race','native-country','salary']].apply(LabelEncoder().fit_transform)



In [61]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,salary,sex_ Female,sex_ Male
0,39,5,77516,9,13,4,0,1,4,2174,0,40,38,0,0,1
1,50,4,83311,9,13,2,3,0,4,0,0,13,38,0,0,1
2,38,2,215646,11,9,0,5,1,4,0,0,40,38,0,0,1
3,53,2,234721,1,7,2,5,0,2,0,0,40,38,0,0,1
4,28,2,338409,9,13,2,9,5,2,0,0,40,4,0,1,0


In [41]:
#df = pd.get_dummies(data = df,columns = ['salary'])

In [42]:
#df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,sex_ Female,sex_ Male,salary_ <=50K,salary_ >50K
0,39,5,77516,9,13,4,0,1,4,2174,0,40,38,0,1,1,0
1,50,4,83311,9,13,2,3,0,4,0,0,13,38,0,1,1,0
2,38,2,215646,11,9,0,5,1,4,0,0,40,38,0,1,1,0
3,53,2,234721,1,7,2,5,0,2,0,0,40,38,0,1,1,0
4,28,2,338409,9,13,2,9,5,2,0,0,40,4,1,0,1,0


### Features and label Extraction

In [62]:
X = df.drop('salary', axis = 1)
Y = df.salary

### Test and Train data Formation

In [63]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 42)


## Prediction using different models
### 1.Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,Y_train)

In [65]:
Y_pred = clf.predict(X_test)

In [66]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(Y_test,Y_pred))

Accuracy: 0.8060559177809703


### 2. Random forest

In [67]:
from sklearn.ensemble import RandomForestClassifier
  
RF_clf = RandomForestClassifier(n_estimators = 100) 
  
RF_clf.fit(X_train, Y_train)

Y_pred = RF_clf.predict(X_test)

In [69]:
print("Accuracy:",metrics.accuracy_score(Y_test,Y_pred))

Accuracy: 0.8534644712122886


## Bagging Classifires

### 1. Decision Tree 

In [70]:
from sklearn.ensemble import BaggingClassifier

tree = DecisionTreeClassifier()

bagging_clf = BaggingClassifier(base_estimator=tree, n_estimators=100, random_state = 42)

bagging_clf.fit(X_train,Y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100,
                  random_state=42)

In [71]:
Y_pred = bagging_clf.predict(X_test)

In [72]:
print("Accuracy:",metrics.accuracy_score(Y_test,Y_pred))

Accuracy: 0.848712564924301


### 2. SVM

In [73]:
from sklearn.svm import SVC

bagging_clf = BaggingClassifier(base_estimator=SVC(), n_estimators=100, random_state = 42)

bagging_clf.fit(X_train,Y_train)

Y_pred = bagging_clf.predict(X_test)

In [74]:
print("Accuracy:",metrics.accuracy_score(Y_test,Y_pred))

Accuracy: 0.7813018013040115


### Boosting Classifiers - ADABoost

In [75]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)

clf = clf.fit(X_train,Y_train)

Y_pred = clf.predict(X_test)

In [76]:
print("Accuracy:",metrics.accuracy_score(Y_test,Y_pred))

Accuracy: 0.8600950381257597




Submitted by Ajuma Mohammed, AI-ML Aug22 Batch