In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

In [None]:
# Load the training split. The literal 'unknown' is read as NaN, the ID column
# is dropped, and votes are encoded as y -> 1, n -> 0.
# The path is a raw string: it contains backslashes (\R, \K, \C) that are not
# valid escape sequences and would otherwise trigger a SyntaxWarning.
data = (
    pd.read_csv(r"C:\REPOS\Kaggle\CongressionalVotingID.shuf.lrn.csv", na_values=['unknown'])
    .drop(columns='ID')
    .replace(['y', 'n'], [1, 0])
)
data.head() #Y = 1; N = 0; unknown = NaN

In [None]:
# Count plots of the first 16 feature columns, one per panel on a 4x4 grid,
# with the bars of each panel split by the 'class' column.
graphs = data.drop('class', axis=1).columns
n_rows, n_cols = 4, 4
fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 15))
for panel in range(n_rows * n_cols):
    # Panels are filled row by row; panel k shows feature column k.
    row, col = divmod(panel, n_cols)
    ax = axes[row][col]
    sns.countplot(x=graphs[panel], hue='class', data=data, palette='BuPu', ax=ax)
    ax.set_yticks([])  # hide the y-axis ticks
plt.show()

In [None]:
# Encode the party labels numerically (democrat -> 0, republican -> 1) in a new
# frame; `data` itself is left untouched.
df = data.replace(to_replace=['democrat', 'republican'], value=[0, 1])

In [None]:
# Builds a pandas-profiling report for `df`, writes it to
# 'CongressionalVotes.html' and displays it as the cell output.
# The original author marked this cell "don't RUN"; the reason is not stated
# (presumably because the report is slow to generate - TODO confirm).
CongressionalVotes = pandas_profiling.ProfileReport(df)
CongressionalVotes.to_file('CongressionalVotes.html')
CongressionalVotes

In [None]:
def evaluate_model(X, y, model, n_splits=5, n_repeats=3, random_state=1):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, used both as targets and for stratifying the folds.
    model : estimator
        Unfitted scikit-learn compatible classifier.
    n_splits : int, default 5
        Number of folds per repetition.
    n_repeats : int, default 3
        Number of times the k-fold split is repeated.
    random_state : int or None, default 1
        Seed controlling how the folds are shuffled in each repetition.

    Returns
    -------
    numpy.ndarray
        One accuracy score per evaluated split.
    """
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    # n_jobs=-1 runs the splits on all available cores.
    return cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

def get_models(random_state=42):
    """Build the candidate classifiers that the notebook compares.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed given to the two tree ensembles so that repeated runs produce the
        same scores. Pass ``None`` to leave them unseeded.

    Returns
    -------
    models : list
        Unfitted estimators: logistic regression, k-nearest neighbours,
        random forest and extra trees (the ensembles use 1000 trees each).
    names : list of str
        Short display labels, in the same order as ``models``.
    """
    candidates = [
        ('LG', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('RF', RandomForestClassifier(n_estimators=1000, random_state=random_state)),
        ('ET', ExtraTreesClassifier(n_estimators=1000, random_state=random_state)),
    ]
    names = [name for name, _ in candidates]
    models = [model for _, model in candidates]
    return models, names

def evaluate(model, test_features, test_labels):
    """Return the ROC AUC of a fitted ``model`` on the given features and labels.

    The score passed to ``roc_auc_score`` is the second column of
    ``model.predict_proba(test_features)``.
    """
    second_class_probs = model.predict_proba(test_features)[:, 1]
    return roc_auc_score(test_labels, second_class_probs)

imp = SimpleImputer(strategy="mean")  
>LG 0.942 (0.025)  
>KNN 0.911 (0.031)  
>RF 0.947 (0.022)  
>ET 0.943 (0.029)  

imp = SimpleImputer(strategy="most_frequent")  
>LG 0.937 (0.028)  
>KNN 0.917 (0.039)  
>RF 0.950 (0.024)  
>ET 0.945 (0.031)  

imp = SimpleImputer(strategy="median")  
>LG 0.937 (0.028)  
>KNN 0.917 (0.039)  
>RF 0.948 (0.024)  
>ET 0.947 (0.031)  

imp = SimpleImputer(strategy="constant") (no explicit fill_value, so the default constant is used)  
>LG 0.940 (0.028)  
>KNN 0.905 (0.050)  
>RF 0.946 (0.022)  
>ET 0.937 (0.031)  

dropna()  
>LG 0.941 (0.052)  
>KNN 0.905 (0.064)  
>RF 0.956 (0.042)  
>ET 0.949 (0.041)  


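Best mean accuracy per model (with the missing-value strategy that produced it):  
LG: 0.942 (mean)  
KNN: 0.917 (most_frequent; median ties)  
RF: 0.956 (dropna)  
ET: 0.949 (dropna)  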



In [None]:
# Imputed models: keep every row and fill missing votes with the most frequent
# value of each column.
df_imp = df
y = df_imp['class'].values
X = df_imp.drop('class', axis=1).values

# The imputer is NOT fitted on the full X up front. It is placed in a pipeline
# with each model so that cross-validation refits it on the training folds
# only; fitting it on all rows first would leak information from the
# validation folds into the imputed values.
imp = SimpleImputer(strategy="most_frequent")

models, names = get_models()

results = list()
print("Model", "Mean", "Std")

for name, model in zip(names, models):
    scores = evaluate_model(X, y, make_pipeline(imp, model))
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

# One box per model showing the spread of its cross-validation scores.
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# Complete-case comparison: rows containing any NaN are discarded before the
# models are cross-validated.
df_na = df.dropna()
y = df_na['class'].values
X = df_na.drop('class', axis=1).values

models, names = get_models()

print("Model", "Mean", "Std")
results = []
for label, estimator in zip(names, models):
    scores = evaluate_model(X, y, estimator)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (label, np.mean(scores), np.std(scores)))

# One box per model showing the spread of its cross-validation scores.
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
# Training set: complete cases only. df_na is rebuilt here, before it is used,
# so the cell does not depend on an earlier cell having defined it.
df_na = df.dropna()
y_train = df_na['class'].values
X_train = df_na.drop('class', axis=1).values

# The held-out file gets the same preprocessing as the training file:
# 'unknown' -> NaN, ID dropped, y/n -> 1/0, party labels -> 0/1.
# Raw string: the Windows path contains backslashes that are not valid escapes.
test = (
    pd.read_csv(r"C:\REPOS\Kaggle\CongressionalVotingID.shuf.tes.csv", na_values=['unknown'])
    .drop(columns='ID')
    .replace(['y', 'n'], [1, 0])
)
df_test = test.replace(('democrat', 'republican'), (0, 1), inplace=False)

# Features are every column except 'class'; labels are the 'class' column.
# (These two assignments were previously swapped.)
X_test = df_test.drop('class', axis=1).values
y_test = df_test['class'].values
# NOTE(review): unlike the training rows, the test rows are not passed through
# dropna(), so X_test may still contain NaN - confirm the model accepts that.
# Alternative: hold out part of the training data instead of the separate file.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

In [None]:
# Hyper-parameter search space for the random forest.
# Candidate values for each parameter:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' is no longer accepted by RandomForestClassifier in recent scikit-learn
# releases, and for classifiers it meant the same as 'sqrt', so it only
# duplicated that option. 'log2' is offered as the second candidate instead.
max_features = ['sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)  # None = no depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Assemble the grid, keyed by estimator parameter name.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
rf = RandomForestClassifier(random_state = 42)
# Randomized search: 100 parameter combinations sampled from random_grid, each
# scored with 3-fold cross-validation.
# Scoring is 'accuracy', the same classification metric used by
# evaluate_model; a regression metric (neg_mean_absolute_error) was used before.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='accuracy',
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the search on the training data.
rf_random.fit(X_train, y_train)
print('Melhores parametros:',rf_random.best_params_)
# Keep the best estimator found by the search.
best_random = rf_random.best_estimator_
# Predict on the held-out test features.
y_pred = best_random.predict(X_test)
# Accuracy of those predictions against the held-out labels.
acc_vote = accuracy_score(y_test, y_pred)
print("acuracia: ",acc_vote)

In [None]:
# Report the ROC AUC of the tuned random forest on the held-out set, computed
# by evaluate() from the model's predicted probabilities.
print("AUC:" ,evaluate(best_random, X_test, y_test))