# Notebook para probar diferentes modelos de clasificación con TfIdVectorizer

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Not connected to a GPU
Your runtime has 37.8 gigabytes of available RAM

You are using a high-RAM runtime!


Carga del dataset y separación en sinceras e insinceras

In [2]:
#Importación de las librerías NLTK y Gensim

import nltk
import gensim
import pandas as pd
import re
nltk.download('all')

#Cargo el dataset de ejemplo y veo su aspecto
total_questionsdf = pd.read_csv('train.csv')
print (total_questionsdf.shape)

#Selecciono las preguntas insinceras
insincere_questionsdf=total_questionsdf.loc[total_questionsdf['target'] == 1]
print (insincere_questionsdf.shape)

#Selecciono las sinceras
sincere_questionsdf=total_questionsdf.loc[total_questionsdf['target'] == 0].sample(len(insincere_questionsdf))
print (sincere_questionsdf.shape)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

(1306122, 3)
(80810, 3)
(80810, 3)


## Entrenamiento y validación con un conjunto aleatorio del dataset de entrada, que contiene muchas más opiniones sinceras que insinceras

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pandas as pd
import numpy as np

#Cojo un conjunto al azar del total, que está desequilibrado
from sklearn.feature_extraction.text import TfidfVectorizer
data = []
data_labels = []


questions = total_questionsdf['question_text'].sample(5000).tolist()
tags = total_questionsdf['target'].tolist()

for i in range(len(questions)): 
    data.append(questions[i]) 
    data_labels.append(tags[i])



data_labels = ['SINCERE' if x==0 else 'INSINCERE' for x in data_labels]
#Vectorización
tfidfvectorizer = TfidfVectorizer(analyzer= 'word')

X = tfidfvectorizer.fit_transform(data)
M2 = X.toarray()

#Preparación de los corpus de entrenamiento y test
X_train, X_test, y_train, y_test  = train_test_split(
        M2, 
        data_labels,
        train_size=0.80, 
        random_state=1234)

In [4]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
##Entrenamiento
log_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = log_model.predict(X_test)

#Se calculan las métricas de evaluación
lm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(lm)

              precision    recall  f1-score   support

     SINCERE       0.94      1.00      0.97       936
   INSINCERE       0.00      0.00      0.00        64

    accuracy                           0.94      1000
   macro avg       0.47      0.50      0.48      1000
weighted avg       0.88      0.94      0.91      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Generamos un dataset más equilibrado, tomando el mismo número de sinceras que insinceras y aplicamos diferentes criterios de clasificación.

In [5]:
#Vamos a separar los conjuntos de entrenamiento en conjuntos más proporcionados, donde tengamos la mitad de opiniones sinceras y la mitad de insinceras y veamos qué sucede
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
muestraInsinceras=insincere_questionsdf.sample(25000)
muestraSinceras=sincere_questionsdf.sample(25000)

questions=muestraInsinceras['question_text'].append(muestraSinceras['question_text']).tolist()
tags = muestraInsinceras['target'].append(muestraSinceras['target']).tolist()

data = []
data_labels = []

for i in range(len(questions)): 
    questions[i]=questions[i].lower()
    questions[i]=re.sub('[^a-zA-Z\']', ' ', questions[i])
    data.append(questions[i]) 
    data_labels.append(tags[i])
    

data_labels = ['SINCERE' if x==0 else 'INSINCERE' for x in data_labels]
#Vectorización. Ojo con la variable "stop_words, porque no mejoramos la precisión.
#Dejamos la versión con stopwords comentada por si se desea hacer pruebas.

#tfidfvectorizer = TfidfVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer= 'word')
X = tfidfvectorizer.fit_transform(data)
M2 = X.toarray()

#Preparación de los corpus de entrenamiento y test
X_train, X_test, y_train, y_test  = train_test_split(
        M2, 
        data_labels,
        train_size=0.80, 
        random_state=1234)

### Regresión logística

In [6]:
#Clasificador con regresión logística
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
##Entrenamiento
log_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = log_model.predict(X_test)

#Se calculan las métricas de evaluación
lm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(lm)

              precision    recall  f1-score   support

     SINCERE       0.87      0.88      0.87      5034
   INSINCERE       0.87      0.86      0.87      4966

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### Clasificador bayesiano

In [7]:
#Clasificador bayesiano
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
#Entrenamiento
bayes_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = bayes_model.predict(X_test)

#Se calculan las métricas de evaluación
bm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(bm)

              precision    recall  f1-score   support

     SINCERE       0.92      0.79      0.85      5034
   INSINCERE       0.81      0.93      0.87      4966

    accuracy                           0.86     10000
   macro avg       0.87      0.86      0.86     10000
weighted avg       0.87      0.86      0.86     10000



### Clasificador SVM

In [8]:
#Clasificador SVC
import sklearn.svm

classifier = sklearn.svm.LinearSVC()
#Entrenamiento
svm_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = svm_model.predict(X_test)

svm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(svm)

              precision    recall  f1-score   support

     SINCERE       0.87      0.88      0.87      5034
   INSINCERE       0.87      0.87      0.87      4966

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



### Clasificador Random Forest

In [9]:
#Clasificador Random forest
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=10)
#Entrenamiento
rf_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = rf_model.predict(X_test)

#Se calculan las métricas de evaluación
rm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(rm)

              precision    recall  f1-score   support

     SINCERE       0.85      0.78      0.81      5034
   INSINCERE       0.79      0.86      0.83      4966

    accuracy                           0.82     10000
   macro avg       0.82      0.82      0.82     10000
weighted avg       0.82      0.82      0.82     10000



## Vamos a Tratar de analizar el impacto de "Trump"

### Generamos un dataset donde reemplazamos "Trump" por "Biden" en el conjunto de validación

In [4]:
#Vamos a generar nuevos datasets para ver qué pasa si cambio Trump por Biden
from sklearn.feature_extraction.text import TfidfVectorizer
muestraInsinceras=insincere_questionsdf.sample(25000)
muestraSinceras=sincere_questionsdf.sample(25000)
#Mezclo sinceras e insinceras
datasetMezclado=muestraInsinceras.append(muestraSinceras).sample(50000)

questions = datasetMezclado['question_text'].tolist()
tags = datasetMezclado['target'].tolist()


data = []
data_labels = []

for i in range(len(questions)): 
    questions[i]=questions[i].lower()
    questions[i]=re.sub('[^a-zA-Z\']', ' ', questions[i])
    data.append(questions[i]) 
    data_labels.append(tags[i])
    

data_labels = ['SINCERE' if x==0 else 'INSINCERE' for x in data_labels]


for i in range(40000, 50000):
  data[i]=data[i].replace("Trump", "Biden")
  data[i]=data[i].replace("trump", "Biden")


#tfidfvectorizer = TfidfVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer= 'word')
X = tfidfvectorizer.fit_transform(data)
X = X.toarray()
X_train=X[0:40000]
X_test=X[40001:50000]
y_train=data_labels[0:40000]
y_test=data_labels[40001:50000]


In [6]:
#Clasificador con regresión logística
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
classifier = LogisticRegression()
##Entrenamiento
log_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = log_model.predict(X_test)

#Se calculan las métricas de evaluación
lm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(lm)

              precision    recall  f1-score   support

     SINCERE       0.86      0.88      0.87      4978
   INSINCERE       0.87      0.85      0.86      5021

    accuracy                           0.86      9999
   macro avg       0.87      0.87      0.86      9999
weighted avg       0.87      0.86      0.86      9999



### Vamos a tomar un dataset que en todas las frase contenga "Trump". 

De esta manera pretendemos analizar el impacto que tiene Trump en las preguntas en las que aparece el término "Trump". No en todo el dataset. ¿Cómo de preciso es nuestro clasificador cuando aparece "Trump"?

In [8]:
#Ahora vamos a coger solo las preguntas que contienen "Trump", entrenamos y a ver qué pasa.
from sklearn.model_selection import train_test_split
preguntasTrump = total_questionsdf[total_questionsdf['question_text'].str.contains('Trump')]

questions = preguntasTrump['question_text'].tolist()
tags = preguntasTrump['target'].tolist()

data = []
data_labels = []

for i in range(len(questions)): 
    questions[i]=questions[i].lower()
    questions[i]=re.sub('[^a-zA-Z\']', ' ', questions[i])
    data.append(questions[i]) 
    data_labels.append(tags[i])
    

data_labels = ['SINCERE' if x==0 else 'INSINCERE' for x in data_labels]

tfidfvectorizer = TfidfVectorizer(analyzer= 'word')
X = tfidfvectorizer.fit_transform(data)
M2 = X.toarray()

#Preparación de los corpus de entrenamiento y test
X_train, X_test, y_train, y_test  = train_test_split(
        M2, 
        data_labels,
        train_size=0.80, 
        random_state=1234)

Aplicamos Regresión logística

In [9]:
#Clasificador con regresión logística para ver la bondad del predictor utilizando "Trump"
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
##Entrenamiento
log_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = log_model.predict(X_test)

#Se calculan las métricas de evaluación
lm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(lm)

              precision    recall  f1-score   support

     SINCERE       0.72      0.79      0.75      1538
   INSINCERE       0.68      0.59      0.64      1174

    accuracy                           0.70      2712
   macro avg       0.70      0.69      0.69      2712
weighted avg       0.70      0.70      0.70      2712



#### Por último vamos a ver cómo impacta que cambiemos "Biden" por "Trump" en el conjunto de validación habiendo hecho un entrenamiento solo de las preguntas que contenían "Trump"

In [11]:
#Vamos a coger 10.000 preguntas para entrenar y el resto del dataset para validar sustituyendo "Trump" por "Biden"
questions = preguntasTrump['question_text'].tolist()
tags = preguntasTrump['target'].tolist()


data = []
data_labels = []

for i in range(len(questions)): 
    questions[i]=questions[i].lower()
    questions[i]=re.sub('[^a-zA-Z\']', ' ', questions[i])
    data.append(questions[i]) 
    data_labels.append(tags[i])
    

data_labels = ['SINCERE' if x==0 else 'INSINCERE' for x in data_labels]

#
for i in range(10000, len(preguntasTrump)):
  data[i]=data[i].replace("Trump", "Biden")
  data[i]=data[i].replace("trump", "Biden")


#tfidfvectorizer = TfidfVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer= 'word')
X = tfidfvectorizer.fit_transform(data)
X = X.toarray()
X_train=X[0:10000]
X_test=X[10001:len(preguntasTrump)]
y_train=data_labels[0:10000]
y_test=data_labels[10001:len(preguntasTrump)]

Se aplica regresión logística

In [12]:
#Clasificador con regresión logística para ver la bondad del predictor utilizando "Trump"
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
##Entrenamiento
log_model = classifier.fit(X=X_train, y=y_train)
#Clasificación
y_pred = log_model.predict(X_test)

#Se calculan las métricas de evaluación
lm = metrics.classification_report(y_test, y_pred, labels=list(set(data_labels)))

print(lm)

              precision    recall  f1-score   support

     SINCERE       0.76      0.66      0.71      2039
   INSINCERE       0.62      0.72      0.66      1518

    accuracy                           0.69      3557
   macro avg       0.69      0.69      0.69      3557
weighted avg       0.70      0.69      0.69      3557



In [None]:
%%shell
jupyter nbconvert --to html /content/TFM1_AnalisisLinguistico.ipynb