# Imports

In [None]:
import string
import re

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

# Library setup
import warnings
# Install a global 'ignore' filter: warnings raised after this point are not shown.
warnings.filterwarnings('ignore')

from scipy import stats as st
import sys
from pandas_profiling import ProfileReport
import re, unicodedata
import inflect
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve
import matplotlib.pyplot as plt



# Preparación

In [None]:
# Do not truncate long text columns when a DataFrame is displayed.
pd.options.display.max_colwidth = None

# Download the NLTK stop-word corpus and keep the English entries as a set.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Shared tokenizer instance, built once instead of on every call
# (the function below is invoked once per document by the vectorizers).
_TWEET_TOKENIZER = TweetTokenizer()


# Function used to tokenize the tweets
def tokenizer(text):
    """Split *text* into a list of tokens with NLTK's ``TweetTokenizer``.

    The tokenizer is created with its default settings.

    Args:
        text: The string to tokenize.

    Returns:
        The list of token strings produced by ``TweetTokenizer.tokenize``.
    """
    return _TWEET_TOKENIZER.tokenize(text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Carga de Datos

In [None]:
# Read the CSV, using its first column as the row index.
datos = pd.read_csv(
    "/content/SuicidiosProyecto.csv",
    index_col=0,
    sep=",",
)
# Keep `datos` untouched and work on an independent copy.
df_suicidio = datos.copy(deep=True)
df_suicidio.shape

(195700, 2)

Tenemos 195.700 mensajes

In [None]:
# Preview five randomly chosen rows.
df_suicidio.sample(n=5)

Unnamed: 0_level_0,text,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
131156,Posting everyday until I get a girlfriend day ...,non-suicide
118652,Two months to figure it out.Today i got what a...,suicide
170385,"In my early thirties, managers tell me I have ...",suicide
232671,It's my real life Birthday... ...in exactly SI...,non-suicide
180047,"I’m sad, Minecraft, A meme, and a callout post...",non-suicide


## Datos importantes

In [None]:
# Relative frequency of each label (missing labels are counted too).
df_suicidio['class'].value_counts(normalize=True, dropna=False)

non-suicide    0.562928
suicide        0.437072
Name: class, dtype: float64

Proporción de mensajes categorizados como suicidas o no suicidas

## Limpieza

In [None]:
def clean_text_round1(text):
    '''Lowercase text and strip bracketed spans, punctuation and words containing digits.

    Steps, in order:
      1. Convert to lowercase.
      2. Replace every "[...]" span (on a single line) with one space.
      3. Replace every ASCII punctuation character with one space.
      4. Delete every word that contains at least one digit.

    Whitespace is not collapsed here, so the result may contain runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    '''
    text = text.lower()
    # Non-greedy so that separate bracket pairs are removed independently.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
 
def round1(x):
    """Run the first cleaning pass on one text value."""
    return clean_text_round1(x)


# One-column ('text') frame holding the first-pass cleaned messages.
df_suicidio_clean = df_suicidio['text'].apply(round1).to_frame()
 
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Remove leftover typographic punctuation, digits and redundant whitespace.

    Steps, in order:
      1. Delete curly quotes, the ellipsis character and guillemets.
      2. Replace newlines with a space.
      3. Replace every ASCII punctuation character with a space.
      4. Replace every run of digits with a space.
      5. Collapse each run of whitespace into a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (leading/trailing single spaces are not stripped).
    '''
    text = re.sub('[‘’“”…«»]', '', text)
    text = re.sub('\n', ' ', text)
    # Same ASCII punctuation class as the first cleaning round.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', ' ', text)
    # Collapse multiple whitespace characters
    text = re.sub(r'\s+', ' ', text)
    return text
 
def round2(x):
    """Run the second cleaning pass on one text value."""
    return clean_text_round2(x)


# Chain the second pass onto the first-pass output held in `df_suicidio_clean`.
# Applying it to the raw `df_suicidio.text` would overwrite and discard round 1.
df_suicidio_clean = pd.DataFrame(df_suicidio_clean.text.apply(round2))


In [None]:
# Attach the original labels (aligned on the index) next to the cleaned text.
df_suicidio_clean = df_suicidio_clean.assign(**{'class': df_suicidio['class']})
df_suicidio_clean.sample(n=5)

Unnamed: 0_level_0,text,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
250715,so i finished my first month of second year i ...,non-suicide
244671,i cant forgive myself for what ive done in the...,suicide
142956,i can t go on i m writing this with the door l...,suicide
119441,i would just like to say the symbols tattooed ...,non-suicide
126873,bruh i really talking to my therapist like som...,non-suicide


In [None]:
# `stratify` keeps the class proportions approximately equal in the train and test splits.
# When very little labelled data is available, a smaller share is usually set aside for evaluation.
features = df_suicidio_clean['text']
labels = df_suicidio_clean['class']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=1,
)

In [None]:
# Bag-of-words features fitted on the training texts only.
# scikit-learn documents `stop_words` as 'english', a list or None; a set is
# rejected by releases that validate parameters, so pass a list.
bow = CountVectorizer(tokenizer=tokenizer, stop_words=sorted(stop_words), lowercase=True)
X_bow = bow.fit_transform(X_train)
len(bow.vocabulary_)

120051

In [None]:
# TF-IDF features fitted on the training texts only.
# scikit-learn documents `stop_words` as 'english', a list or None; a set is
# rejected by releases that validate parameters, so pass a list.
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words=sorted(stop_words), lowercase=True)
X_tfidf = tfidf.fit_transform(X_train)
len(tfidf.vocabulary_)

120051

# Modelos

## Entrenando y evaluando un modelo usando BoW

### Random Forest

In [None]:
# Random forest with default hyper-parameters and a fixed seed, trained on the BoW matrix.
bow_model = RandomForestClassifier(random_state=2)
bow_model.fit(X=X_bow, y=y_train)

RandomForestClassifier(random_state=2)

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `bow`.
y_train_bow_predict = bow_model.predict(X_bow)
X_test_bow = bow.transform(X_test)
y_test_bow_predict = bow_model.predict(X_test_bow)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fa79fc4a2d0>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fa79e435910>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_bow_predict, **metric_kwargs))

Precision: 0.999956154453246
Recall: 0.9998684748933185
F1: 0.9999123127511875


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_bow_predict, **metric_kwargs))

Precision: 0.8740677966101695
Recall: 0.904366633541825
F1: 0.88895911741891


### MultinomialNB

In [None]:
# Multinomial naive Bayes with default hyper-parameters, trained on the BoW matrix.
bow_model = MultinomialNB()
bow_model.fit(X=X_bow, y=y_train)

MultinomialNB()

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `bow`.
y_train_bow_predict = bow_model.predict(X_bow)
X_test_bow = bow.transform(X_test)
y_test_bow_predict = bow_model.predict(X_test_bow)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498c3fb410>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498faec3d0>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_bow_predict, **metric_kwargs))

Precision: 0.8091389040597123
Recall: 0.9742941485941428
F1: 0.884069405859849


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_bow_predict, **metric_kwargs))

Precision: 0.7930919802356519
Recall: 0.9757993803706085
F1: 0.8750098283318044


### SVM

In [None]:
# Linear model trained by SGD with hinge loss, on the BoW matrix.
# At most 10 passes over the data; fixed seed.
bow_model = SGDClassifier(
    random_state=0,
    max_iter=10,
    alpha=1e-3,
    loss='hinge',
)
bow_model.fit(X=X_bow, y=y_train)

SGDClassifier(alpha=0.001, max_iter=10, random_state=0)

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `bow`.
y_train_bow_predict = bow_model.predict(X_bow)
X_test_bow = bow.transform(X_test)
y_test_bow_predict = bow_model.predict(X_test_bow)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498fba9210>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498fc5ff10>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_bow_predict, **metric_kwargs))

Precision: 0.9471070673415775
Recall: 0.864061495294324
F1: 0.903680381488048


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_bow_predict, **metric_kwargs))

Precision: 0.9406865861411315
Recall: 0.8649675571403519
F1: 0.90123945549228


### Passive Aggressive Classifier

In [None]:
# Passive-aggressive classifier with default hyper-parameters and a fixed seed, on the BoW matrix.
bow_model = PassiveAggressiveClassifier(random_state=8)
bow_model.fit(X=X_bow, y=y_train)

PassiveAggressiveClassifier(random_state=8)

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `bow`.
y_train_bow_predict = bow_model.predict(X_bow)
X_test_bow = bow.transform(X_test)
y_test_bow_predict = bow_model.predict(X_test_bow)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498f6356d0>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_bow_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498f948e10>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_bow_predict, **metric_kwargs))

Precision: 0.9479403552203576
Recall: 0.9624861167942947
F1: 0.955157861151799


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(bow_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_bow_predict, **metric_kwargs))

Precision: 0.902163196659514
Recall: 0.9093353597942363
F1: 0.9057350800582241


## Entrenando y evaluando un modelo usando TF-IDF

### MultinomialNB


In [None]:
# Multinomial naive Bayes with default hyper-parameters, trained on the TF-IDF matrix.
tfidf_model = MultinomialNB()
tfidf_model.fit(X=X_tfidf, y=y_train)

MultinomialNB()

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `tfidf`.
y_train_tfidf_predict = tfidf_model.predict(X_tfidf)
X_test_tfidf = tfidf.transform(X_test)
y_test_tfidf_predict = tfidf_model.predict(X_test_tfidf)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_tfidf_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498c671f10>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_tfidf_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498f641f50>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(tfidf_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_tfidf_predict, **metric_kwargs))

Precision: 0.860558341167868
Recall: 0.9631291284269597
F1: 0.9089592585441205


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(tfidf_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_tfidf_predict, **metric_kwargs))

Precision: 0.8462052997170054
Recall: 0.961360846437131
F1: 0.900114936237754


### SVM

In [None]:
# Linear model trained by SGD with hinge loss, on the TF-IDF matrix.
# At most 10 passes over the data; fixed seed.
tfidf_model = SGDClassifier(
    random_state=0,
    max_iter=10,
    alpha=1e-3,
    loss='hinge',
)
tfidf_model.fit(X=X_tfidf, y=y_train)

SGDClassifier(alpha=0.001, max_iter=10, random_state=0)

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `tfidf`.
y_train_tfidf_predict = tfidf_model.predict(X_tfidf)
X_test_tfidf = tfidf.transform(X_test)
y_test_tfidf_predict = tfidf_model.predict(X_test_tfidf)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_tfidf_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498c671550>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_tfidf_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498f8d7ed0>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(tfidf_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_tfidf_predict, **metric_kwargs))

Precision: 0.9316019328805566
Recall: 0.8198690594493482
F1: 0.8721715675986599


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(tfidf_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_tfidf_predict, **metric_kwargs))

Precision: 0.9278992796417678
Recall: 0.8357982112585491
F1: 0.8794439660474842


### Passive Aggressive Classifier

In [None]:
# Passive-aggressive classifier with default hyper-parameters and a fixed seed, on the TF-IDF matrix.
tfidf_model = PassiveAggressiveClassifier(random_state=8)
tfidf_model.fit(X=X_tfidf, y=y_train)

PassiveAggressiveClassifier(random_state=8)

In [None]:
# Predict on the training matrix, then on the test texts vectorised with the fitted `tfidf`.
y_train_tfidf_predict = tfidf_model.predict(X_tfidf)
X_test_tfidf = tfidf.transform(X_test)
y_test_tfidf_predict = tfidf_model.predict(X_test_tfidf)

In [None]:
# Confusion matrix for the training-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_tfidf_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498f7645d0>

In [None]:
# Confusion matrix for the test-split predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_test_tfidf_predict)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f498fad23d0>

In [None]:
# Training-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(tfidf_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_train, y_train_tfidf_predict, **metric_kwargs))

Precision: 0.9848164099569312
Recall: 0.9857806745776583
F1: 0.9852983063473632


In [None]:
# Test-split precision / recall / F1.
# Binary problem: score the 'suicide' class only.
# For non-binary cases, it is common to compute the metrics for each class.
if len(tfidf_model.classes_) == 2:
    metric_kwargs = {'pos_label': 'suicide'}
else:
    metric_kwargs = {'average': None}
for metric_name, metric_fn in (('Precision:', precision_score), ('Recall:', recall_score), ('F1:', f1_score)):
    print(metric_name, metric_fn(y_test, y_test_tfidf_predict, **metric_kwargs))

Precision: 0.9238209672574347
Recall: 0.8988718068626877
F1: 0.9111756340365016
