In [12]:
# Importing necessary libraries
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
# NOTE(review): with no category/module filter this ignores every warning for
# the rest of the session, which would also hide any deprecation or convergence
# warnings raised by later cells — confirm that this is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read the income-evaluation CSV into a DataFrame and preview its first rows
csv_path = 'income_evaluation.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Drop the unnecessary (categorical) features.
# The labels are spelled with a leading space, exactly as they appear in the frame.
categorical_cols = [
    ' workclass', ' education', ' marital_status', ' occupation',
    ' relationship', ' race', ' sex', ' native_country',
]
df.drop(columns=categorical_cols, inplace=True)

In [4]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income
0,39,77516,13,2174,0,40,<=50K
1,50,83311,13,0,0,13,<=50K
2,38,215646,9,0,0,40,<=50K
3,53,234721,7,0,0,40,<=50K
4,28,338409,13,0,0,40,<=50K


In [5]:
# Remove the leading space from each remaining column label.
# The mapping is ' name' -> 'name'; labels not present in the frame are left untouched.
clean_names = ['age', 'fnlwgt', 'education_num', 'capital_gain',
               'capital_loss', 'hours_per_week', 'income']
df.rename(columns={' ' + name: name for name in clean_names}, inplace=True)

In [6]:
# List the current column labels
df.columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

In [7]:
# Summarize each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   fnlwgt          32561 non-null  int64 
 2   education_num   32561 non-null  int64 
 3   capital_gain    32561 non-null  int64 
 4   capital_loss    32561 non-null  int64 
 5   hours_per_week  32561 non-null  int64 
 6   income          32561 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.7+ MB


In [8]:
# Split the frame into the label column (y) and the remaining predictor columns (x)
target_col = 'income'
y = df[target_col]
x = df.drop(columns=[target_col])

In [9]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so that a fresh Restart & Run All reproduces the
# same partitions (and therefore the same accuracy figures); stratify=y keeps
# the income class proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

## Model Training and Evaluation

In [10]:
# Create the AdaBoost classifier with its default base learner:
# 100 boosting rounds and a learning rate of 1.0.
# random_state fixes the seed used by the ensemble so that repeated runs
# produce the same fitted model.
AdaModel = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

In [11]:
# Fit the ensemble on the training split. fit() returns the fitted estimator
# itself, so `model` and `AdaModel` refer to the same object.
AdaModel.fit(X_train, y_train)
model = AdaModel

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)

In [13]:
# Fraction of test rows whose predicted label matches the true label
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Model Accuracy :', acc)

Model Accuracy : 0.8381698142177184


## AdaBoost Using a Different Base Estimator

In [14]:
from sklearn.linear_model import LogisticRegression

# Learner to boost in place of AdaBoost's default base learner
lr = LogisticRegression()

# Build the AdaBoost model with logistic regression as the base estimator.
# The estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional slot works on every release.
ada = AdaBoostClassifier(lr, n_estimators=100, learning_rate=1)

In [15]:
# Fit the logistic-regression-based ensemble on the training split. fit()
# returns the fitted estimator itself, so `model` and `ada` are the same object.
ada.fit(X_train, y_train)
model = ada

# Predict labels for the held-out test rows with the new model
y_pred = model.predict(X_test)

In [16]:
# Fraction of test rows the new model labels correctly
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Model Accuracy :', acc)

Model Accuracy : 0.7951788730231844


* Here we can see that the new model performed worse than the previous one, which suggests that Logistic Regression is not a good base estimator for our predictions. We can also try other types of base estimators, such as decision trees, SVMs, etc.