# Adult Census Income Analysis - Decision Tree, Random Forest, CV, Tuning the Model with Ensemble Techniques (Bagging, AdaBoost)


### A stable and optimized model to predict the income of a given population, which is labelled as <= 50K and >50K. The attributes (predictors) are age, working class type, marital status, gender, race etc.
#### Following are the steps, 
#### 1.clean and prepare the data,
#### 2.Analyze Data,
#### 3.Label Encoding,
#### 4.Build a decision tree and Random forest with default hyperparameters,
#### 5.Build several classifier models to compare, cross-validate, and combine in a voting classifier model,
#### 6.choose the optimal hyperparameters using grid search cross-validation.
#### 7.Build optimized Random forest model with tuned hyperparameters from grid search model
#### 8.Increase Accuracy by Applying Ensemble technique BAGGING to our tuned random forest model
#### 9.Increase Accuracy by Applying Ensemble technique ADABOOST to our tuned random forest model
####  I hope you enjoy this notebook and find it useful!

## Clean & Analyze Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
# from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
data =  pd.read_csv("../input/adult.csv")

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
# select all categorical variables
df_categorical = data.select_dtypes(include=['object'])

# checking whether any other columns contain a "?"
df_categorical.apply(lambda x: x=="?", axis=0).sum()

In [None]:
data[data['workclass'] == '?' ].count()

In [None]:
data[data['occupation'] == '?' ].count()

In [None]:
data[data['native.country'] == '?' ].count()

In [None]:
# Share of affected rows as a percentage.
# NOTE(review): 1836 and 32561 are presumably the "?" row count and the total
# row count from the cells above - confirm against their output.
# A ratio becomes a percentage by multiplying by 100, not dividing by it.
missing_pct = (1836 / 32561) * 100
missing_pct

 ### The percentage of missing values is small, so we will drop the affected rows

In [None]:
data.count()

In [None]:
data = data[data["workclass"] != "?" ]

In [None]:
data = data[data["occupation"] != "?" ]

In [None]:
data = data[data["native.country"] != "?" ]

In [None]:
data.count()

In [None]:
data.head()

In [None]:
data["income"].unique()

In [None]:
data["income"] = data["income"].map({'<=50K' : 0, '>50K': 1})
data.head()

In [None]:
data["income"].unique()

## Label Encoding

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

catogorical_data = data.select_dtypes(include =['object'])

In [None]:
catogorical_data.head()

In [None]:
catogorical_data = catogorical_data.apply(le.fit_transform)

In [None]:
catogorical_data.head()

In [None]:
data = data.drop(catogorical_data.columns, axis=1)
data = pd.concat([data, catogorical_data], axis=1)
data.head()

In [None]:
data.info()

In [None]:
data['income'] = data['income'].astype('category')


## Decision Tree Model with Default parameters

In [None]:
# Features are every column except the target; the target is 'income'.
x = data.drop(columns='income')
y = data['income']

# Hold out 30% of the rows for testing, with a fixed seed for the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=476
)

In [None]:
# Decision tree with the library's default hyperparameters, fitted on the
# training split; the last line displays the fitted estimator.
tree = DecisionTreeClassifier()
model_tree = tree.fit(X=x_train, y=y_train)
model_tree

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
model_tree = tree.fit(x_train,y_train)
pred_tree = tree.predict(x_test)
a1 = accuracy_score(y_test,pred_tree)
print("The Accuracy of Desicion Tree is ", a1)

In [None]:
confusion_matrix(y_test,pred_tree)

In [None]:
print(classification_report(y_test, pred_tree))

## Random Forest Model with Default parameters

In [None]:
# Random forest with default hyperparameters: fit on the training split,
# predict the held-out split, and report accuracy.
rf = RandomForestClassifier()
model_rf = rf.fit(X=x_train, y=y_train)
pred_rf = rf.predict(X=x_test)
a2 = accuracy_score(y_true=y_test, y_pred=pred_rf)
print("The Accuracy of Random Forest is ", a2)

## Logistic Regression & KNN model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings: fit on the training split,
# predict the held-out split, and report accuracy.
lg = LogisticRegression()
model_lg = lg.fit(X=x_train, y=y_train)
pred_lg = lg.predict(X=x_test)
a3 = accuracy_score(y_true=y_test, y_pred=pred_lg)
print("The Accuracy of logistic regression is ", a3)

In [None]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier()

In [None]:
model_knn =knn.fit(x_train,y_train) 
pred_knn = knn.predict(x_test)
a4 = accuracy_score(y_test, pred_knn)
print("The Accuracy of KNN is ", a4)

# Build optimized Random forest model with tuned hyperparameters from grid search model  

In [None]:
# Hyperparameter grid searched for the random forest.
# "auto" is deliberately absent from max_features: for classifiers it was only
# an alias of "sqrt" (so it doubled the work for that setting), and recent
# scikit-learn releases reject it outright.
rf_param = {
    "n_estimators": [25, 50, 100],
    "criterion": ["gini"],
    "max_depth": [3, 4, 5, 6],
    "max_features": ["sqrt", "log2"],
    "random_state": [123],
}

In [None]:
GridSearchCV(rf, rf_param, cv = 5)

In [None]:
grid =GridSearchCV(rf, rf_param, cv = 5)

In [None]:
grid.fit(x_train,y_train).best_params_

In [None]:
# Random forest rebuilt with fixed hyperparameters (gini, depth 6, 100 trees,
# seed 123), then fitted on the training split and scored on the held-out one.
# max_features is "sqrt": the former "auto" alias meant the same thing for
# classifiers and is rejected by recent scikit-learn releases.
rf1 = RandomForestClassifier(
    criterion='gini',
    max_depth=6,
    max_features='sqrt',
    n_estimators=100,
    random_state=123,
)
model_rf1 = rf1.fit(x_train, y_train)
pred_rf1 = rf1.predict(x_test)
accuracy_score(y_test, pred_rf1)

# K FOLD Cross Validation

In [None]:
cross_val_score(tree,x_train,y_train,scoring= "accuracy", cv=10)

In [None]:
cross_val_score(tree,x,y,scoring= "accuracy", cv=5).mean()

In [None]:
cross_val_score(rf,x_train,y_train,scoring= "accuracy", cv=5).mean()

In [None]:
cross_val_score(lg,x_train,y_train,scoring= "accuracy", cv=5).mean()

In [None]:
cross_val_score(knn,x_train,y_train,scoring= "accuracy", cv=5).mean()

# Voting Classifier model

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
model_vote = VotingClassifier(estimators=[('logistic Regression', lg), ('random forrest', rf), ('knn neighbors', knn),(' decision tree', tree)], voting='soft')
model_vote = model_vote.fit(x_train, y_train)

In [None]:
vote_pred = model_vote.predict(x_test)

In [None]:
a5 =  accuracy_score(y_test, vote_pred)
print("The Accuracy of voting classifier is ", a5)

In [None]:
print(classification_report(y_test, vote_pred))

# Ensemble Technique Bagging 

## Increase Accuracy by Applying Ensemble technique BAGGING to our tuned random forest model

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
bagg = BaggingClassifier(base_estimator=rf1,n_estimators=15)

In [None]:
model_bagg =bagg.fit(x_train,y_train) 
pred_bagg = bagg.predict(x_test)

In [None]:
a6 = accuracy_score(y_test, pred_bagg)
print("The Accuracy of BAAGING is ", a6)

In [None]:
confusion_matrix(y_test,pred_bagg)

In [None]:
print(classification_report(y_test, pred_bagg))

#  Ensemble Technique  ADA Boost 

## Increase Accuracy by Applying Ensemble technique ADABOOST to our tuned random forest model

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
Adaboost = AdaBoostClassifier(base_estimator=rf1, n_estimators=15)

In [None]:
model_boost =Adaboost.fit(x_train,y_train) 
pred_boost = Adaboost.predict(x_test)

In [None]:
a7 = accuracy_score(y_test, pred_boost)
print("The Accuracy of BOOSTING is ", a7)

In [None]:
confusion_matrix(y_test,pred_boost)

In [None]:
print(classification_report(y_test, pred_boost))