# Import Statements

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


# Importing Data

In [2]:
# adult.data ships without a header row, so the column names are supplied by hand.
headers = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "class",
]
# header=None is what pandas infers whenever `names` is given; it is spelled out
# here so the first data row is visibly not treated as a header.
df = pd.read_csv("adult.data", header=None, names=headers, index_col=False)

In [3]:
# The estimators used below cannot take string-valued features, so every
# categorical column is expanded into one binary indicator column per category
# (one-hot encoding).
categorical_vars = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]
one_hot_encoding = pd.get_dummies(df[categorical_vars])

# Swap the original categorical columns for their indicator columns: the
# remaining original columns keep their order and the indicators are appended.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own output;
# re-run the loading cell first.
df = df.drop(columns=categorical_vars).join(one_hot_encoding)

# Setting Dependent/Independent Variables

In [4]:
# "class" is a categorical variable holding the income label as text
# (<=50K or >50K). The dependent variable is encoded as 1 for >50K and
# 0 for <=50K.
# The raw labels carry leading whitespace (the label column was previously
# addressed as " >50K"), so the text is stripped before comparing instead of
# relying on an exact, whitespace-sensitive column name.
y = (df["class"].str.strip() == ">50K").astype(int)
# Setting the attributes by dropping the class variable
X = df.drop("class", axis=1)

# Ensemble with Hard Voting

In [9]:
# Hard-voting ensemble of a decision tree, a random forest, and an SVM:
# each model casts one vote and the majority class wins.
# random_state is fixed on the stochastic members so that re-running the
# notebook reproduces the same scores.
# NOTE(review): X does not appear to be scaled anywhere before reaching SVC;
# RBF SVMs are sensitive to feature scale, so standardizing may be worth
# trying -- confirm before comparing against the recorded results.
tree_clf_hard = DecisionTreeClassifier(random_state=42)
rnd_clf_hard = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf_hard = SVC(gamma='scale')
voting_clf_hard = VotingClassifier(
    estimators=[('tree', tree_clf_hard), ('rf', rnd_clf_hard), ('svc', svm_clf_hard)],
    voting='hard')

In [10]:
# Score the hard-voting ensemble with 10-fold cross-validation (accuracy per
# fold) and report the mean over the folds.
result1 = model_selection.cross_val_score(
    estimator=voting_clf_hard,
    X=X,
    y=y,
    scoring='accuracy',
    cv=10,
)
print("accuracy:", result1.mean())
# Recorded mean accuracy: 0.849

accuracy: 0.8492678781441446


# Ensemble with Soft Voting

In [11]:
# Soft-voting ensemble of a decision tree, a random forest, and an SVM:
# the predicted class probabilities of the members are averaged.
# SVC needs probability=True to expose class probabilities for soft voting.
# random_state is fixed on every member (including SVC, whose probability
# estimates are randomized) so that re-running the notebook reproduces the
# same scores.
tree_clf_soft = DecisionTreeClassifier(random_state=42)
rnd_clf_soft = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf_soft = SVC(gamma='scale', probability=True, random_state=42)
voting_clf_soft = VotingClassifier(
    estimators=[('tree', tree_clf_soft), ('rf', rnd_clf_soft), ('svc', svm_clf_soft)],
    voting='soft')

In [12]:
# Score the soft-voting ensemble with 10-fold cross-validation (accuracy per
# fold) and report the mean over the folds.
result2 = model_selection.cross_val_score(
    estimator=voting_clf_soft,
    X=X,
    y=y,
    scoring='accuracy',
    cv=10,
)
print("accuracy:", result2.mean())
# Recorded mean accuracy: 0.841, slightly lower than hard voting

accuracy: 0.8412212611053569


# Bagging

In [13]:
# Bagging: 100 decision trees, each fitted on a bootstrap sample of 500
# instances, using all features.
# random_state is fixed so the bootstrap samples (and therefore the scores)
# are the same on every run.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=100,
    max_samples=500, bootstrap=True, n_jobs=-1, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result3 = model_selection.cross_val_score(bag_clf, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result3.mean())
# Accuracy recorded before seeding: 0.857, which is higher than both hard and
# soft voting

Accuracy: 0.8572835186206387


In [14]:
# Bagging: 100 decision trees, each fitted on a bootstrap sample of 500
# instances and restricted to 5 features (max_features=5).
# random_state is fixed so the sampled rows and features (and therefore the
# scores) are the same on every run.
bag_clf1 = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=100,
    max_samples=500, bootstrap=True, n_jobs=-1, max_features=5, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result4 = model_selection.cross_val_score(bag_clf1, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result4.mean())
# Accuracy recorded before seeding: 0.759, about 0.1 lower than with no limit
# on max_features

Accuracy: 0.7591904489970196


# Gradient Boost Classifier

### First changing values for learning rate

In [15]:
# Using max_depth=5, n_estimators=5, learning_rate=1.0
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=5, learning_rate=1.0, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result5 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result5.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.8563
#   max_depth = 5           --> 0.861


Accuracy: 0.8609381306111427


In [26]:
# Using max_depth=5, n_estimators=5, learning_rate=0.5
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=5, learning_rate=0.5, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result6 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result6.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.856
#   max_depth = 5           --> 0.8607


Accuracy: 0.8607232466228474


In [31]:
# Using max_depth=5, n_estimators=5, learning_rate=0.0001
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=5, learning_rate=0.0001, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result7 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result7.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.759
#   max_depth = 5           --> 0.759

Accuracy: 0.7591904489970196


#### Analysis
Ceteris paribus, lowering the learning_rate lowers the accuracy. You can also see that
explicitly setting max_depth to 5 increases the accuracy of each model compared with the
earlier runs in which max_depth was not specified, and that with max_depth=5 the accuracy
still rises from one model to the next as the learning rate increases.

### Now changing values for n_estimators

In [33]:
# Using max_depth=5, n_estimators=1, learning_rate=1.0
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=1, learning_rate=1.0, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result8 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result8.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.844
#   max_depth = 5           --> 0.849


Accuracy: 0.8485921553199723


In [40]:
# Using max_depth=5, n_estimators=10, learning_rate=1.0
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=10, learning_rate=1.0, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result9 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result9.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.862
#   max_depth = 5           --> 0.863


Accuracy: 0.8634257418721649


In [42]:
# Using max_depth=5, n_estimators=50, learning_rate=1.0
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=50, learning_rate=1.0, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result10 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result10.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.868
#   max_depth = 5           --> 0.858

Accuracy: 0.857774683369556


In [44]:
# Using max_depth=5, n_estimators=100, learning_rate=1.0
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=100, learning_rate=1.0, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result11 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result11.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.864
#   max_depth = 5           --> 0.853

Accuracy: 0.8530145522891915


#### Analysis 
In the case of changing the value of n_estimators, it appears that, ceteris paribus,
accuracy initially increases as n_estimators increases, but it reaches a point
where accuracy starts decreasing: the model with n_estimators=50 scores higher
than the model with n_estimators=100. We also see here that as you increase
n_estimators, limiting max_depth to 5 appears to lower the accuracy, which is the
opposite of what was occurring with a low n_estimators and any value of learning_rate.
The next step is to try high values of n_estimators with low values of learning_rate.

# Changing n_estimators and learning_rate

In [11]:
# Using max_depth=5, n_estimators=100, learning_rate=0.0001
# random_state is fixed because scikit-learn permutes the features at every
# split, so tied splits can otherwise make repeated runs disagree.
gbrt = GradientBoostingClassifier(max_depth=5, n_estimators=100, learning_rate=0.0001, random_state=42)
# Evaluating the model using 10-fold CV with the scoring as accuracy
result12 = model_selection.cross_val_score(gbrt, X, y, cv=10, scoring="accuracy")
print("Accuracy:", result12.mean())
# Accuracy recorded before seeding:
#   max_depth not specified --> 0.759
#   max_depth = 5           --> 0.759

Accuracy: 0.7591904489970196


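#### Analysis
Here we can see that a large value for n_estimators combined with a really small
learning_rate reduces the accuracy by almost 0.10 compared to the same n_estimators
with a learning_rate of 1.0. The resulting 0.759 is identical to the scores recorded
above for learning_rate=0.0001 with n_estimators=5 and for bagging with max_features=5,
which suggests that all three models are predicting the same class for every row
(worth confirming against the share of the majority class in y).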