In [1]:
import numpy
from numpy import genfromtxt

# Abrindo o CSV
data = genfromtxt('data_limpo.csv', delimiter=',')
print(data.shape)

(48564, 169)


In [2]:
# Pegando apenas as entradas e saidas

genero = data[1:, 164] # Genero
acuracia = data[1:, 165] # Acuracia

data = data[1: , 0:163] # Perguntas

(nData, nFeatures) = data.shape

print('nData: ', nData)
print('nFeatures: ', nFeatures)

('nData: ', 48563)
('nFeatures: ', 163)


In [3]:
# Pegando apenas os 16 resultados

# Abrindo o CSV
data16 = genfromtxt('dataset/output_limpo_semINTMAX.csv', delimiter=',')
print(data16)

# Sobrescreve as perguntas com os 16 resultados
data = data16

IOError: dataset/output_limpo_semINTMAX.csv not found.

In [4]:
# Separando em dados de teste e treino
from sklearn.model_selection import train_test_split

(dataTrain, dataTest, yTrain, yTest, acuraciaTrain, acuraciaTest) = train_test_split(data, genero, acuracia, test_size = 0.1, shuffle=True)

print(dataTrain.shape, yTrain.shape)
print(dataTest.shape, yTest.shape)

((43706, 163), (43706,))
((4857, 163), (4857,))


In [5]:
# PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9, svd_solver='full')
pca.fit(dataTrain)
dataTrain = pca.transform(dataTrain)
dataTest = pca.transform(dataTest)

print(dataTrain.shape, dataTest.shape)

((43706, 118), (4857, 118))


In [6]:
# Regressão Logistica
from sklearn.linear_model import LogisticRegression

md = LogisticRegression(
            penalty='l2',
            dual=False,
            tol=1e-4,
            C=1.0,
            fit_intercept=True,
            intercept_scaling=1,
            class_weight=None,
            random_state=None,
            solver='saga',
            max_iter=10000,
            multi_class='multinomial',
            verbose=1,
            warm_start=False,
            n_jobs=-1    
        )

md.fit(dataTrain, yTrain)
print('Train: ', md.score(dataTrain, yTrain))
print('Test:  ', md.score(dataTest, yTest))

convergence after 31 epochs took 5 seconds
('Train: ', 0.79201940237038393)
('Test:  ', 0.79761169446160185)


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.2s finished


In [None]:
# Linear SVC
from sklearn.svm import LinearSVC

md = LinearSVC(
            penalty='l2',
            loss='squared_hinge',
            dual=True,
            tol=1e-4,
            C=1.0,
            multi_class='ovr',
            fit_intercept=True,
            intercept_scaling=1,
            verbose=1,
            random_state=None,
            max_iter=100000
        )

md.fit(dataTrain, yTrain)
print('Train: ', md.score(dataTrain, yTrain))
print('Test: ', md.score(dataTest, yTest))

[LibLinear]

In [None]:
# Cria uma rede neural
from sklearn.neural_network import MLPClassifier

md = MLPClassifier(
            hidden_layer_sizes = (100),
#             activation = 'identity',
#             solver = 'sgd',
            alpha = 1e-6,
#             batch_size = 'auto',
            learning_rate = 'adaptive',
#             learning_rate_init = 0.001,
#             power_t = 0.5,
#             max_iter = 10000,
#             shuffle = True,
#             random_state = None,
#             tol = 1e-4,
            verbose = True,
#             warm_start = False,
#             momentum = 0.9,
#             nesterovs_momentum = True,
#             early_stopping = False,
#             validation_fraction = 0.15    
        )

md.fit(dataTrain, yTrain)
print('Train: ', md.score(dataTrain, yTrain))
print('Test:  ', md.score(dataTest, yTest))

In [None]:
# Separa as entradas de gênero com predições corretas em diferentes arquivos

# Predizemos nossos dados
yPred = md.predict(dataTest)

(nDataTest, nFeaturesTest) = dataTest.shape

# Para generos 1 e 2
for idd in range(1, 3):
    # Criamos uma matriz
    dados_copia = numpy.zeros(dataTest.shape)
    
    # Indices
    i=0
    j=dataTest.shape[0]-1
    p=0
    
    # Preenchemos a matriz de tal forma que os dados com predição correta fiquem no começo, e os dados com predição errada fiquem no final
    while (i<=j):
        if (int(yPred[p]) == idd):
            dados_copia[i] = dataTest[p]
            i = i+1
        else:
            dados_copia[j] = dataTest[p]
        j = j-1

        p = p+1

    # Pegamos apenas os dados que foram preditos corretamente
    dados_copia = dados_copia[0:i][:]


    # Salva tais dados em um arquivo CSV
    numpy.savetxt('generos/'+str(igual)+'.csv', dados_copia, delimiter=',')

In [None]:
# Matriz de confusão

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools

# Primeiro, os dados tem os valores de cada tem: yTest
# Agora vou montar meu vetor de predições: yPredict

yPred = md.predict(dataTest)
classes = ['Masc', 'Fem']

matrix = confusion_matrix(yTest, yPred)

def plot_confusion_matrix(cm, classes, normalize=True, title='Confusion matrix', cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = numpy.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(matrix,classes=classes, title='Confusion matrix, without normalization')

