In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score, plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix

In [122]:
train_data = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/tax_payers/train_set_label.csv")

In [123]:
print('shape: ', train_data.shape)
train_data.head()

shape:  (803, 11)


Unnamed: 0.1,Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty
0,61,183414,-12402,1,2,28,1,0,0,1,Democrat
1,501,129786,700251,2,5,63,3,0,0,0,Republican
2,658,268343,493411,2,4,44,1,1,0,1,Independent
3,703,290506,-433408,1,1,28,5,1,0,1,Republican
4,702,90108,907135,1,3,57,5,1,1,0,Democrat


In [124]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/tax_payers/test_set_label.csv')

In [125]:
print('shape: ', test_data.shape)
test_data.head()

shape:  (201, 10)


Unnamed: 0.1,Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015
0,17,253274,-633752,2,1,94,3,0,1,1
1,590,47107,322850,0,1,30,2,1,1,0
2,224,111874,300583,0,2,46,3,0,1,1
3,960,96670,107419,0,3,77,5,0,0,0
4,57,128669,341273,0,0,92,4,1,1,1


In [126]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 803 entries, 0 to 802
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      803 non-null    int64 
 1   HHI             803 non-null    int64 
 2   HHDL            803 non-null    int64 
 3   Married         803 non-null    int64 
 4   CollegGrads     803 non-null    int64 
 5   AHHAge          803 non-null    int64 
 6   Cars            803 non-null    int64 
 7   Filed in 2017   803 non-null    int64 
 8   Filed in 2016   803 non-null    int64 
 9   Filed in 2015   803 non-null    int64 
 10  PoliticalParty  803 non-null    object
dtypes: int64(10), object(1)
memory usage: 69.1+ KB


In [127]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Unnamed: 0     201 non-null    int64
 1   HHI            201 non-null    int64
 2   HHDL           201 non-null    int64
 3   Married        201 non-null    int64
 4   CollegGrads    201 non-null    int64
 5   AHHAge         201 non-null    int64
 6   Cars           201 non-null    int64
 7   Filed in 2017  201 non-null    int64
 8   Filed in 2016  201 non-null    int64
 9   Filed in 2015  201 non-null    int64
dtypes: int64(10)
memory usage: 15.8 KB


In [128]:
train_data = pd.get_dummies(train_data, columns = ['Married'])
train_data = train_data.drop('Unnamed: 0', axis = 1)
train_data.head()

Unnamed: 0,HHI,HHDL,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,PoliticalParty,Married_0,Married_1,Married_2
0,183414,-12402,2,28,1,0,0,1,Democrat,0,1,0
1,129786,700251,5,63,3,0,0,0,Republican,0,0,1
2,268343,493411,4,44,1,1,0,1,Independent,0,0,1
3,290506,-433408,1,28,5,1,0,1,Republican,0,1,0
4,90108,907135,3,57,5,1,1,0,Democrat,0,1,0


In [129]:
test_data = pd.get_dummies(test_data, columns = ['Married'])
test_data = test_data.drop('Unnamed: 0', axis = 1)
test_data.head()

Unnamed: 0,HHI,HHDL,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,Married_0,Married_1,Married_2
0,253274,-633752,1,94,3,0,1,1,0,0,1
1,47107,322850,1,30,2,1,1,0,1,0,0
2,111874,300583,2,46,3,0,1,1,1,0,0
3,96670,107419,3,77,5,0,0,0,1,0,0
4,128669,341273,0,92,4,1,1,1,1,0,0


In [130]:
features = train_data.drop('PoliticalParty', axis = 1)
features.head()

Unnamed: 0,HHI,HHDL,CollegGrads,AHHAge,Cars,Filed in 2017,Filed in 2016,Filed in 2015,Married_0,Married_1,Married_2
0,183414,-12402,2,28,1,0,0,1,0,1,0
1,129786,700251,5,63,3,0,0,0,0,0,1
2,268343,493411,4,44,1,1,0,1,0,0,1
3,290506,-433408,1,28,5,1,0,1,0,1,0
4,90108,907135,3,57,5,1,1,0,0,1,0


In [131]:
X = features
y = train_data['PoliticalParty']
print(X.shape, y.shape)

(803, 11) (803,)


In [132]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1 ,random_state = 10)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(722, 11) (81, 11) (722,) (81,)


In [133]:
#LogisticRegression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score

LogReg = LogisticRegression(random_state = 11)
Log_train = LogReg.fit(X_train,y_train)
Log_train

LogisticRegression(random_state=11)

In [134]:
Log_preds = LogReg.predict(X_test)
print(accuracy_score(y_test, Log_preds))

0.2839506172839506


In [135]:
#KNeighbors classifier

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_train = knn.fit(X_train, y_train)
knn_train

KNeighborsClassifier()

In [136]:
knn_preds = knn.predict(X_test)
print(accuracy_score(y_test, knn_preds))

0.2962962962962963


In [137]:
#random forest

from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state = 101)
train_forest = forest.fit(X_train, y_train)
train_forest

RandomForestClassifier(random_state=101)

In [138]:
rf_preds = forest.predict(X_test)
print(accuracy_score(y_test, rf_preds))

0.4444444444444444


In [140]:
test_predictions = forest.predict(test_data)

In [141]:
#decision tree

from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state = 100)
train_tree = tree.fit(X_train, y_train)
train_tree

DecisionTreeClassifier(random_state=100)

In [142]:
tree_preds = tree.predict(X_test)
print(accuracy_score(y_test, tree_preds))

0.32098765432098764


In [143]:
#feature selection using BorutaPy
from boruta import BorutaPy
boruta_selector = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=1)

In [144]:
boruta_selector.fit(np.array(X_train), np.array(y_train))

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	9 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	10 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	11 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	12 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	13 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	14 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	15 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration: 	16 / 100
Confirmed: 	0
Tentative: 	2
Rejected: 	9
Iteration:

BorutaPy(estimator=RandomForestClassifier(n_estimators=14,
                                          random_state=RandomState(MT19937) at 0x18EDA0E3140),
         n_estimators='auto',
         random_state=RandomState(MT19937) at 0x18EDA0E3140, verbose=2)

In [145]:
print('significant features: ', boruta_selector.support_)

significant features:  [False False False False False False False False False False False]


In [146]:
print('ranking of significant features: ', boruta_selector.ranking_)

ranking of significant features:  [ 2  3  5  4  5  7  9  7 11 10 12]


In [147]:
print('no. of important features: ', boruta_selector.n_features_)

no. of important features:  0


In [148]:
rf_preds

array(['Independent', 'Independent', 'Democrat', 'Independent',
       'Republican', 'Independent', 'Independent', 'Independent',
       'Republican', 'Republican', 'Independent', 'Independent',
       'Independent', 'Democrat', 'Republican', 'Democrat', 'Republican',
       'Republican', 'Republican', 'Democrat', 'Democrat', 'Independent',
       'Democrat', 'Republican', 'Democrat', 'Republican', 'Republican',
       'Republican', 'Independent', 'Independent', 'Independent',
       'Independent', 'Democrat', 'Independent', 'Republican', 'Democrat',
       'Independent', 'Democrat', 'Independent', 'Republican',
       'Republican', 'Independent', 'Independent', 'Democrat', 'Democrat',
       'Democrat', 'Independent', 'Democrat', 'Republican', 'Democrat',
       'Republican', 'Independent', 'Republican', 'Independent',
       'Democrat', 'Democrat', 'Independent', 'Democrat', 'Independent',
       'Independent', 'Independent', 'Democrat', 'Democrat', 'Republican',
       'Republican',

In [160]:
a = list(test_predictions)
dict = {'prediction' : a}
pred = pd.DataFrame(dict, columns = ['prediction'])
pred.head()

Unnamed: 0,prediction
0,Independent
1,Democrat
2,Democrat
3,Democrat
4,Republican


In [161]:
pred.to_csv("C:\\Users\\dines\\OneDrive\\Documents\\data projects\\submission_1.csv", index = False)

In [164]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train = scaler.fit_transform(features)
train_scaled = pd.DataFrame(scaled_train, columns = features.columns)
scaled_test = scaler.fit_transform(test_data)
test_scaled = pd.DataFrame(scaled_test, columns = test_data.columns)

In [165]:
print(train_scaled.shape, test_scaled.shape)

(803, 11) (201, 11)


In [167]:
X1 = train_scaled
y1 = train_data['PoliticalParty']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1,y1, test_size = 0.1, random_state = 200)


In [168]:
train1_rf = forest.fit(X1_train, y1_train)
train1_rf

RandomForestClassifier(n_estimators=14,
                       random_state=RandomState(MT19937) at 0x18EDA0E3140)

In [169]:
preds_rf1 = forest.predict(X1_test)
accuracy_score(y1_test, preds_rf1)

0.2716049382716049

In [170]:
preds_rf1

array(['Independent', 'Democrat', 'Democrat', 'Republican', 'Republican',
       'Republican', 'Democrat', 'Independent', 'Democrat', 'Independent',
       'Independent', 'Democrat', 'Republican', 'Democrat', 'Republican',
       'Republican', 'Democrat', 'Democrat', 'Republican', 'Democrat',
       'Democrat', 'Democrat', 'Democrat', 'Democrat', 'Republican',
       'Independent', 'Democrat', 'Independent', 'Republican',
       'Independent', 'Independent', 'Democrat', 'Republican',
       'Republican', 'Democrat', 'Democrat', 'Republican', 'Independent',
       'Democrat', 'Independent', 'Independent', 'Democrat', 'Democrat',
       'Democrat', 'Independent', 'Republican', 'Democrat', 'Independent',
       'Democrat', 'Democrat', 'Independent', 'Democrat', 'Independent',
       'Independent', 'Republican', 'Democrat', 'Democrat', 'Democrat',
       'Democrat', 'Republican', 'Republican', 'Democrat', 'Democrat',
       'Democrat', 'Independent', 'Democrat', 'Democrat', 'Democrat',
   