# Congressional Voting Classification

#Objective
The main objective is to predict whether congressmen is Democrat or Republican based on voting patterns by using the decision tree with the adaboost.

#Adaboost
AdaBoost is an ensemble learning method (also known as “meta-learning”) which was initially created to increase the efficiency of binary classifiers. AdaBoost uses an iterative approach to learn from the mistakes of weak classifiers, and turn them into strong ones.


#Data Set
This data set includes votes for each of the U.S. House of Representatives Congressmen on the 16 key votes identified by the CQA. The CQA lists nine different types of votes: voted for, paired for, and announced for (these three simplified to yea), voted against, paired against, and announced against (these three simplified to nay), voted present, voted present to avoid conflict of interest, and did not vote or otherwise make a position known (these three simplified to an unknown disposition).


##Attribute Information:
1. Class Name: 2 (democrat, republican)
2. handicapped-infants: 2 (y,n)
3. water-project-cost-sharing: 2 (y,n)
4. adoption-of-the-budget-resolution: 2 (y,n)
5. physician-fee-freeze: 2 (y,n)
6. el-salvador-aid: 2 (y,n)
7. religious-groups-in-schools: 2 (y,n)
8. anti-satellite-test-ban: 2 (y,n)
9. aid-to-nicaraguan-contras: 2 (y,n)
10. mx-missile: 2 (y,n)
11. immigration: 2 (y,n)
12. synfuels-corporation-cutback: 2 (y,n)
13. education-spending: 2 (y,n)
14. superfund-right-to-sue: 2 (y,n)
15. crime: 2 (y,n)
16. duty-free-exports: 2 (y,n)
17. export-administration-act-south-africa: 2 (y,n)



#Source
The dataset can be obtained from the:
https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records

#Tasks:
1.	Obtained the dataset
2.	Apply pre-processing operations
3.	Train Adaboost model from scratch and test the model
4.	Train Adaboost model using sklearn
6.	Compare the performance of Adaboost, Random Forest and Decision Trees


## Part 1: Adaboost from Scratch

In [15]:
# Load the libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report
import matplotlib.pyplot as plt

In [16]:
# Load the dataset 
df=pd.read_table('house-votes-84.data',sep=',')
df.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
1,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
3,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y
4,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y


In [18]:
# Preprocessing
# Encoding categorical variables (if any)
# Feature Scaling
# Filling missing values (if any)
cols=df.columns
enc=LabelEncoder()
for i in range(0,len(df.columns)):
    df.iloc[:,i]=enc.fit_transform(df.iloc[:,i])
from sklearn.impute import SimpleImputer
imp=SimpleImputer(missing_values=3,strategy='mean')
df=pd.DataFrame(imp.fit_transform(df),columns=cols)
df.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,0.0
1,0.0,0.0,2.0,2.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0
2,0.0,1.0,2.0,2.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
3,0.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0
4,0.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0


In [19]:
df=df.drop_duplicates()
df

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,0.0
1,0.0,0.0,2.0,2.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0
2,0.0,1.0,2.0,2.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
3,0.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,2.0,2.0,2.0
4,0.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0
430,0.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0
431,1.0,1.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0
432,1.0,1.0,1.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,2.0


In [27]:
# Divide the dataset to training and testing set
X=df.drop(columns='republican')
y=df['republican']
y.replace(to_replace=0,value=-1,inplace=True)
y.replace(to_replace=1,value=1,inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
y_train

36     1.0
391    1.0
290   -1.0
218   -1.0
316   -1.0
      ... 
233   -1.0
84    -1.0
130   -1.0
350    1.0
126   -1.0
Name: republican, Length: 238, dtype: float64

In [77]:
np.sum(np.where([y_train==1.0,model.predict(X_train)==1.0])[0]==1)

92

In [85]:
# Implement Adaboost model from scratch
# Adaboost consist of stumps which can be created using builtin decision trees in sklearn
# Stump can be trained by keeping the max_depth as 1
n=X_train.shape[0]
initw=np.zeros((100, n))
stumps=np.zeros((100,),dtype=object)
stumpsw=np.zeros((100,))
errors=np.zeros((100,))
initw[0]=np.ones((n,))/n
for i in range(100):
    weights=initw[i]
    stump=DecisionTreeClassifier(max_depth=1)
    stump=stump.fit(X_train,y_train,sample_weight=weights)
    stpred=stump.predict(X_train)
    err=weights[(stpred!=y_train)].sum()
    stumpswe=np.log((1-err)/err)/2
    newweights=weights*np.exp(-stumpswe*y_train*stpred)
    newweights=newweights/newweights.sum()
    if(i+1<100):
        initw[i+1]=newweights
    stumps[i]=stump
    stumpsw[i]=stumpswe
    errors[i]=err
    

In [93]:
# Train the model and test the model
stumppreds = np.array([stump.predict(X_train) for stump in stumps])
y_pred_train=np.sign(np.dot(stumpsw, stumppreds))
stumppreds = np.array([stump.predict(X_test) for stump in stumps])
y_pred_test=np.sign(np.dot(stumpsw, stumppreds))

In [94]:
# Evaluate the results using accuracy, precision, recall and f-measure
print(classification_report(y_train,y_pred_train))
print()
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

        -1.0       1.00      1.00      1.00       152
         1.0       1.00      1.00      1.00        86

    accuracy                           1.00       238
   macro avg       1.00      1.00      1.00       238
weighted avg       1.00      1.00      1.00       238


              precision    recall  f1-score   support

        -1.0       0.93      0.95      0.94        73
         1.0       0.86      0.83      0.85        30

    accuracy                           0.91       103
   macro avg       0.90      0.89      0.89       103
weighted avg       0.91      0.91      0.91       103



## Part 2: Adaboost using Sklearn

In [9]:
# Use the preprocessed dataset here
X_train.shape

(238, 16)

In [25]:
# Train the Adaboost Model using builtin Sklearn Dataset

model=AdaBoostClassifier()
model.fit(X_train,y_train)

AdaBoostClassifier()

In [26]:
# Test the model with testing set and print the accuracy, precision, recall and f-measure
print(classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93        73
         1.0       0.83      0.83      0.83        30

    accuracy                           0.90       103
   macro avg       0.88      0.88      0.88       103
weighted avg       0.90      0.90      0.90       103



In [30]:
# Play with parameters such as
# number of decision trees
# Criterion for splitting
# Max depth
# Minimum samples per split and leaf


16


## Part 3: Compare the models

In [None]:
# Train Adaboost, Random Forest and Decision tree models from sklearn



In [None]:
# Run the model on testing set



In [None]:
# Compare their accuracy, precision, recall and f-measure

