In [None]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Importo dataset

In [None]:
# Directory that holds the toxic-comments CSV files read by the next cell.
# Set the TOXIC_COMMENTS_DIR environment variable to point at a local copy of the data;
# the original author's path is kept as the fallback so existing setups keep working.
folder = os.path.join(
    os.environ.get("TOXIC_COMMENTS_DIR",
                   "C:/Users/Lenovo/Documents/GitHub/Datasets/toxic_comments/"),
    "",  # guarantees a trailing separator: later cells build paths as folder + "train.csv"
)

In [None]:
def _read_dataset_csv(file_name):
    """Read one CSV file located in ``folder`` into a DataFrame."""
    return pd.read_csv(folder + file_name)


# Labelled training data, unlabelled test data, test labels and the submission template.
train = _read_dataset_csv("train.csv")
test = _read_dataset_csv("test.csv")
test_labels = _read_dataset_csv("test_labels.csv")
submission = _read_dataset_csv("sample_submission.csv")

In [None]:
# Preview the first ten rows of the training frame.
train.head(10)

In [None]:
# Label columns the model has to predict.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix: one row per row of `train`, one column per entry of `list_classes`.
y = train.loc[:, list_classes].values

In [None]:
# Peek at the first ten rows of the label matrix.
y[:10]

# Divido entre train y valid

In [None]:
# Hold out 10% of the labelled rows for validation.
# random_state pins the shuffle so that Restart & Run All produces the same split
# (and therefore comparable metrics and checkpoints) on every run.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train, y, test_size=0.1, random_state=42
)

In [None]:
# Shape of the training portion produced by the split.
X_train.shape

In [None]:
def _lowercased_comments(frame):
    """Return the ``comment_text`` column of *frame* converted to lower case."""
    return frame["comment_text"].str.lower()


# Lower-cased comment text for the train, validation and test frames.
raw_text_train = _lowercased_comments(X_train)
raw_text_valid = _lowercased_comments(X_valid)
raw_text_test = _lowercased_comments(test)

In [None]:
# Remember that train_test_split shuffles the rows, so the index labels are not 0..9.
print(raw_text_train.iloc[:10])

In [None]:
# First ten label rows of the training split.
Y_train[:10]

# Armo matriz de features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary size kept by the vectorizer, and number of components kept after the
# dimensionality-reduction step further down.
max_features = 100
psa_features = 60

# TF-IDF over the lower-cased comments: English stop words removed, terms must appear in
# at least 2 documents and in at most 95% of them, vocabulary capped at `max_features`.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=max_features,
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary on the training text only and vectorize it.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(raw_text_train)

In [None]:
# Column indices of the ten terms with the largest total TF-IDF weight, largest first.
term_weight_totals = np.asarray(tfidf_matrix_train.sum(axis=0)).ravel()
top_10 = term_weight_totals.argsort()[::-1][:10].tolist()

In [None]:
# Map the top-10 column indices back to their vocabulary terms.
feature_names = np.asarray(tfidf_vectorizer.get_feature_names())
feature_names[top_10]

# Probando con PCA-Random Projections

In [None]:
#Importo de sklearn.decomposition los metodos necesarios.
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [None]:
# PCA attempt kept for reference: the author reports that it raised an error, so the
# notebook uses TruncatedSVD instead.
# NOTE(review): presumably the error is caused by tfidf_matrix_train being sparse — confirm.
#pca_apply = PCA(n_components = 60)
#new_matrix_train = pca_apply.fit_transform(tfidf_matrix_train)

In [None]:
# Reduce the TF-IDF feature matrix to `psa_features` components with TruncatedSVD
# (the name `pca_apply` is kept because later cells reference it).
# random_state pins the solver's random initialisation so the components, and everything
# trained on them, are reproducible across runs.
pca_apply = TruncatedSVD(n_components=psa_features, random_state=42)
new_matrix_train = pca_apply.fit_transform(tfidf_matrix_train)

In [None]:
# Training features after dimensionality reduction.
# TruncatedSVD.fit_transform already returns a dense NumPy array, so it is used directly.
# The earlier ndarray -> csr_matrix -> todense() round trip only copied the data twice and
# produced an np.matrix instead of a plain ndarray.
from scipy import sparse  # import kept so `sparse` stays available to other cells

dense_matrix_train = np.asarray(new_matrix_train)

# Previous step, used when no dimensionality reduction was applied:
#dense_matrix_train = tfidf_matrix_train.todense()

dense_matrix_train.shape, Y_train.shape

In [None]:
# Vectorize the validation text with the vocabulary fitted on the training text.
tfidf_matrix_valid = tfidf_vectorizer.transform(raw_text_valid)

In [None]:
# Project the validation TF-IDF matrix with the SVD that was fitted on the training data.
# The previous code sliced the first `psa_features` TF-IDF columns instead; those are raw
# term weights, not SVD components, so the model was validated on features from a different
# space than the one it was trained on.
dense_matrix_valid = pca_apply.transform(tfidf_matrix_valid)
# Without dimensionality reduction this would be:
#dense_matrix_valid = tfidf_matrix_valid.todense()
dense_matrix_valid.shape

# Modelo - "x" capas densas

In [None]:
import tensorflow as tf
from keras import backend as K
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# AUC for a binary classifier
def auc(y_true, y_pred):
    """Approximate the ROC AUC of a binary classifier with backend tensor ops.

    The true-alert rate (binary_PTA) and false-alert rate (binary_PFA) are evaluated at
    30 evenly spaced thresholds in [0, 1]; the area is the sum of each true-alert rate
    times the drop in false-alert rate from the previous threshold.

    NOTE: a later cell runs ``from sklearn.metrics import roc_curve, auc``, which rebinds
    the name ``auc`` and shadows this function from that point on.
    """
    thresholds = np.linspace(0, 1, 30)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, thr) for thr in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, thr) for thr in thresholds], axis=0)
    # Prepend 1.0 so the first bin starts from a false-alert rate of one.
    false_alert_rates = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    bin_widths = -(false_alert_rates[1:] - false_alert_rates[:-1])
    weighted_rates = true_alert_rates * bin_widths
    return K.sum(weighted_rates, axis=0)

#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PFA, prob false alert for binary classifier
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Probability of false alert: thresholded predictions on negatives / all negatives.

    ``y_pred`` is binarised with ``y_pred >= threshold`` before counting.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Total number of negative labels.
    negatives = K.sum(1 - y_true)
    # Alerts raised on negative labels: all alerts minus the alerts on positives.
    false_alerts = K.sum(alerts - alerts * y_true)
    return false_alerts / negatives
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# P_TA prob true alerts for binary classifier
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Probability of true alert: thresholded predictions on positives / all positives.

    ``y_pred`` is binarised with ``y_pred >= threshold`` before counting.
    """
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Total number of positive labels.
    positives = K.sum(y_true)
    # Alerts raised on positive labels.
    true_alerts = K.sum(alerts * y_true)
    return true_alerts / positives

In [None]:
from keras.models import Sequential
from keras import optimizers
from keras.layers.core import Dense, Activation
from helper import PlotLosses
from keras.callbacks import ModelCheckpoint
from keras.initializers import RandomNormal
from keras import regularizers

In [None]:
# Network dimensions are taken from the data: one input per reduced feature and one
# output per label column.
input_features = dense_matrix_train.shape[1]
output_size = Y_train.shape[1]
hidden_units = 300
lambd = 0.001  # L2 regularisation strength applied to both kernels

model_sig_nn = Sequential()
# Hidden layer. Previously this layer had `output_size` units and no activation, so the two
# stacked Dense layers collapsed into a single linear map and `hidden_units` was never used.
# NOTE: this changes the weight shapes, so checkpoints saved with the old architecture
# cannot be loaded into this model.
model_sig_nn.add(Dense(hidden_units, input_dim=input_features, activation='relu',
                       kernel_regularizer=regularizers.l2(lambd), name="Capa_Oculta_1"))
# Output layer: one pre-activation value per label; its input size is inferred from the
# previous layer, so no input_dim is passed here.
model_sig_nn.add(Dense(output_size, kernel_regularizer=regularizers.l2(lambd), name="Capa_Oculta_2"))
model_sig_nn.add(Activation('sigmoid', name="output"))
model_sig_nn.summary()


# Training hyper-parameters (batch_size and epochs are consumed by the fit cell).
lr = 0.01
batch_size = 1024
epochs = 10

#selectedOptimizer = optimizers.SGD(lr=lr)
selectedOptimizer = optimizers.Adam(lr=lr)

# Compile with binary_crossentropy (not categorical_crossentropy): every output unit has
# its own sigmoid, so each label is scored as a separate binary decision.
model_sig_nn.compile(loss='binary_crossentropy', optimizer=selectedOptimizer,
                     metrics=['accuracy'])  # the custom `auc` metric is not enabled

In [None]:
from keras.callbacks import ModelCheckpoint

# Checkpoint callback writing to basic_model_best.hdf5 with save_best_only enabled.
checkpointer = ModelCheckpoint(
    filepath='basic_model_best.hdf5',
    save_best_only=True,
    verbose=1,
)

# PlotLosses comes from the local `helper` module.
# NOTE(review): presumably it plots the losses while training runs — confirm in helper.py.
plot_losses = PlotLosses(
    plot_interval=1,
    evaluate_interval=5,
    x_val=dense_matrix_valid,
    y_val_categorical=Y_valid,
)

# Train on the reduced training features and validate on the held-out split.
history = model_sig_nn.fit(
    dense_matrix_train,
    Y_train,
    validation_data=(dense_matrix_valid, Y_valid),
    callbacks=[plot_losses, checkpointer],
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# Evaluo valid

In [None]:
# Optionally restore the checkpoint file written by ModelCheckpoint during training.
# While this stays commented out, the cells below use the weights left at the end of fit().
#model_sig_nn.load_weights('basic_model_best.hdf5')

In [None]:
# Compare the validation TF-IDF matrix shape with the validation label shape.
tfidf_matrix_valid.shape, Y_valid.shape

In [None]:
# Model outputs for the validation and training feature matrices (used by the ROC cells).
pred_valid = model_sig_nn.predict(dense_matrix_valid, verbose = 1)
pred_train = model_sig_nn.predict(dense_matrix_train, verbose = 1)

In [None]:
# Loss and the compiled metrics on the validation split.
model_sig_nn.evaluate(dense_matrix_valid, Y_valid)

# ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from scipy import interp
from itertools import cycle

# Macro-averaged ROC AUC: first on the training split, then on the validation split.
for labels, scores in ((Y_train, pred_train), (Y_valid, pred_valid)):
    print(roc_auc_score(labels, scores, average='macro'))

In [None]:
# ROC points and area per label column; keys are the column index, plus "micro".
fpr, tpr, roc_auc = {}, {}, {}
n_classes = Y_valid.shape[1]

for i in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(Y_valid[:, i], pred_valid[:, i])
    fpr[i] = class_fpr
    tpr[i] = class_tpr
    roc_auc[i] = auc(class_fpr, class_tpr)

# Micro-average: flatten every (sample, label) pair into one binary problem.
micro_fpr, micro_tpr, _ = roc_curve(Y_valid.ravel(), pred_valid.ravel())
fpr["micro"], tpr["micro"] = micro_fpr, micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

In [None]:
from matplotlib import pyplot as plt

# Compute the macro-average ROC curve and its area, then plot every curve.
lw = 2
# First aggregate all false positive rates seen across the per-class curves.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate every per-class ROC curve at those points.
# np.interp is used instead of scipy.interp, which is only a deprecated alias of the
# NumPy function.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute the AUC.
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

# One distinct colour per label column: with only three colours the six curves repeated
# colours and could not be told apart. The legend shows the label name, not its index.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue',
                'forestgreen', 'crimson', 'purple'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(list_classes[i], roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

**True Positive Rate**:   
The fraction of actual positives that your system correctly classified as positive.

True positive rate = Correctly Classified Positives/(Correctly Classified as Positives+ Falsely Classified as Negatives)

**False Positive Rate**:  
The number of times your system classified a negative as a positive, divided by the total number of actual negative instances.


False positive rate = Incorrectly Classified as Positives/(Incorrectly Classified as Positives+ Correctly classified as Negatives )

https://en.wikipedia.org/wiki/Receiver_operating_characteristic

https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001

Macro-average: Calcula el score de cada clase y luego promedia  
Micro-average: Suma y luego calcula el score

Micro-average se considera mejor cuando hay desbalance en las clases

# Predict for test

In [None]:
# Vectorize the test text with the vocabulary fitted on the training text.
tfidf_matrix_test = tfidf_vectorizer.transform(raw_text_test)

In [None]:
# Project the TEST TF-IDF matrix with the SVD that was fitted on the training data.
# The previous code built this from tfidf_matrix_valid and sliced raw TF-IDF columns, so
# the "test" predictions were really validation rows in the wrong feature space, and their
# row count did not match the test set.
dense_matrix_test = pca_apply.transform(tfidf_matrix_test)
# Without dimensionality reduction this would be:
#dense_matrix_test = tfidf_matrix_test.todense()

In [None]:
# Model outputs for the test feature matrix.
pred = model_sig_nn.predict(dense_matrix_test, verbose=1)

In [None]:
# First ten predictions binarised at a 0.5 threshold (1 = label predicted).
(pred[:10] > 0.5).astype(int)

In [None]:
# Disabled by the author because the assignment raised an error (the original note asks
# whether it is because there are fewer than 100).
# NOTE(review): the assignment needs `pred` to have exactly one row per row of
# `submission`; verify pred.shape[0] == len(submission) before re-enabling.
#submission[list_classes] = pred
#submission.to_csv("submission_early_stop_2_epochs.csv", index = False)