In [78]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [79]:
# Load the voting records. This dataset marks a missing vote with '?', so that
# marker is declared as NA at parse time: affected cells become NaN immediately
# and the vote columns are parsed by pandas instead of being kept as strings
# (assumes the remaining vote values are numeric, which the downstream SVC fit
# suggests -- TODO confirm against the CSV).
df = pd.read_csv('house-votes.csv', na_values='?')

In [80]:
# Blank out every cell equal to '?' (the dataset's missing-value marker) with
# NaN so that pandas' null-aware methods can see the gaps.
df = df.mask(df == '?', np.nan)

In [81]:
# Count the missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

party                   0
 infants               12
 water                 48
 budget                11
 physician             11
 salvador              15
religious              11
 satellite             14
 aid                   15
 missile               22
 immigration            7
 synfuels              21
education              31
 superfund             25
 crime                 17
 duty_free_exports     28
 eaa_rsa              104
dtype: int64


In [82]:
# Report (rows, columns) of the full frame before any rows are discarded.
original_shape = df.shape
print("Shape of Original DataFrame: {}".format(original_shape))

Shape of Original DataFrame: (435, 17)


In [83]:
# Keep only the rows that have no missing value in any column.
df_drop = df.dropna(axis=0, how='any')

In [84]:
# Report how many rows survive when every incomplete row is dropped.
dropped_shape = df_drop.shape
print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(dropped_shape))

Shape of DataFrame After Dropping All Rows with Missing Values: (232, 17)


In [85]:
# Separate the predictors from the target. Both are taken from the full frame
# `df` (not `df_drop`), so X still contains NaNs at this point.
X = df.drop(columns='party')
y = df['party']

In [97]:
# Imputation transformer `imp`: fills each NaN with the most frequent value
# of its column.
imp = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)

In [98]:
# Instantiate the SVC classifier: clf
# No arguments are passed, so the library defaults apply; the fitted pipeline's
# repr printed further down shows them (kernel='rbf', C=1.0, gamma='scale').
clf = SVC()

In [99]:
# Ordered (name, estimator) pairs for the pipeline: impute first, then classify.
steps = [
    ('imputation', imp),
    ('SVM', clf),
]

In [100]:
# Chain the imputer and the classifier into a single estimator.
pipeline = Pipeline(steps=steps)

In [101]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [102]:
# Train the whole pipeline on the training portion only. The call is left as
# the cell's last expression so the fitted pipeline's repr is displayed.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('imputation',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='most_frequent',
                               verbose=0)),
                ('SVM',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [103]:
# Predict labels for the held-out rows using the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [104]:
# Summarise per-class precision, recall and F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    democrat       0.98      0.96      0.97        85
  republican       0.94      0.96      0.95        46

    accuracy                           0.96       131
   macro avg       0.96      0.96      0.96       131
weighted avg       0.96      0.96      0.96       131

