In [1]:
# Data manipulation
import pandas as pd # for data manipulation

# Visualization
import plotly.express as px # for data visualization
import plotly.graph_objects as go # for data visualization
import matplotlib.pyplot as plt # for showing confusion matrix

# Skleran
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # for showing confusion matrix
from sklearn.preprocessing import MinMaxScaler # for feature scaling
from sklearn.preprocessing import LabelEncoder
from sklearn.semi_supervised import LabelSpreading # for assigning labels to unlabeled data

#Procesamiento 
import nltk 
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import TweetTokenizer

#Modelo opcion 1
# evaluate label spreading on the semi-supervised learning dataset
from numpy import concatenate
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.semi_supervised import LabelSpreading
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer



#### CARGA DE DATOS

In [2]:
# Read in data
df = pd.read_csv('SuicidiosProyectoFinal.csv', sep=',', encoding = 'utf-8', index_col=0)

In [3]:
df.shape

(195700, 4)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,want destroy myselffor everyth start feel okay...
1,336321,I kind of got behind schedule with learning fo...,non-suicide,kind got behind schedul learn next week testwe...
2,256637,I am just not sure anymoreFirst and foremost: ...,suicide,sure anymorefirst foremost brazil judg second ...
3,303772,please give me a reason to liveThats too much ...,suicide,pleas give reason livethat much reason live li...
4,293747,27f struggling to find meaning moving forwardI...,suicide,struggl find mean move forwardi admit bit long...


#### MODELO LABEL SPREADING 

In [5]:
# Create a flag to denote whether the person has any dependants at home (either kids or teens)
df['type']=df.apply(lambda x: 1 if x['class']== 'suicide' else 0, axis=1)

# Randomly select 2% of observations to keep the label for. The rest of obs will have their labels masked
df['Rand_Selection'] = False
df.loc[df.sample(frac=0.02, random_state=42).index, 'Rand_Selection'] = True

# Show target value distribution
print('Target Value Distribution:')

# Print dataframe
df

Target Value Distribution:


Unnamed: 0.1,Unnamed: 0,text,class,words,type,Rand_Selection
0,173271,i want to destroy myselffor once everything wa...,suicide,want destroy myselffor everyth start feel okay...,1,False
1,336321,I kind of got behind schedule with learning fo...,non-suicide,kind got behind schedul learn next week testwe...,0,False
2,256637,I am just not sure anymoreFirst and foremost: ...,suicide,sure anymorefirst foremost brazil judg second ...,1,False
3,303772,please give me a reason to liveThats too much ...,suicide,pleas give reason livethat much reason live li...,1,False
4,293747,27f struggling to find meaning moving forwardI...,suicide,struggl find mean move forwardi admit bit long...,1,True
...,...,...,...,...,...,...
195695,248038,Drop some cool new cereal ideas Like what woul...,non-suicide,drop cool new cereal idea like would ideal cer...,0,False
195696,216516,Unpopular opinion but cats deserve love and re...,non-suicide,unpopular opinion cat deserv love respect much...,0,False
195697,199341,Hey guys :) How you all doin?,non-suicide,hey guy doin hey guy doin,0,False
195698,145373,uhm I covered my dog in a blanket because the ...,non-suicide,uhm cover dog blanket light wake woke ran wall...,0,False


In [6]:
df

Unnamed: 0.1,Unnamed: 0,text,class,words,type,Rand_Selection
0,173271,i want to destroy myselffor once everything wa...,suicide,want destroy myselffor everyth start feel okay...,1,False
1,336321,I kind of got behind schedule with learning fo...,non-suicide,kind got behind schedul learn next week testwe...,0,False
2,256637,I am just not sure anymoreFirst and foremost: ...,suicide,sure anymorefirst foremost brazil judg second ...,1,False
3,303772,please give me a reason to liveThats too much ...,suicide,pleas give reason livethat much reason live li...,1,False
4,293747,27f struggling to find meaning moving forwardI...,suicide,struggl find mean move forwardi admit bit long...,1,True
...,...,...,...,...,...,...
195695,248038,Drop some cool new cereal ideas Like what woul...,non-suicide,drop cool new cereal idea like would ideal cer...,0,False
195696,216516,Unpopular opinion but cats deserve love and re...,non-suicide,unpopular opinion cat deserv love respect much...,0,False
195697,199341,Hey guys :) How you all doin?,non-suicide,hey guy doin hey guy doin,0,False
195698,145373,uhm I covered my dog in a blanket because the ...,non-suicide,uhm cover dog blanket light wake woke ran wall...,0,False


In [7]:
df['words']= [word_tokenize(str(entry)) for entry in df['words']]

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,class,words,type,Rand_Selection
0,173271,i want to destroy myselffor once everything wa...,suicide,"[want, destroy, myselffor, everyth, start, fee...",1,False
1,336321,I kind of got behind schedule with learning fo...,non-suicide,"[kind, got, behind, schedul, learn, next, week...",0,False
2,256637,I am just not sure anymoreFirst and foremost: ...,suicide,"[sure, anymorefirst, foremost, brazil, judg, s...",1,False
3,303772,please give me a reason to liveThats too much ...,suicide,"[pleas, give, reason, livethat, much, reason, ...",1,False
4,293747,27f struggling to find meaning moving forwardI...,suicide,"[struggl, find, mean, move, forwardi, admit, b...",1,True


In [9]:
df_short = df.sample(1000)

In [10]:
df_short.shape

(1000, 6)

In [11]:
df_short.head()

Unnamed: 0.1,Unnamed: 0,text,class,words,type,Rand_Selection
3122,330363,I eat ass . I eat a lot of ass . From black to...,non-suicide,"[eat, ass, eat, lot, ass, black, brown, asian,...",0,False
193488,198145,prroud moment &amp;#x200B;\n\n*Processing img ...,non-suicide,"[prroud, moment, amp, process, img, prroud, mo...",0,False
60671,172152,"Nobody likes you, everyone left you, they are ...",non-suicide,"[nobodi, like, everyon, left, without, fun, da...",0,False
16483,317704,Is counseling scary at all?Iâve been having ...,suicide,"[counsel, scari, iav, thought, iav, start, cou...",1,False
177015,344042,Iâm going to stop procrastinating and start ...,non-suicide,"[iam, go, stop, procrastin, start, get, school...",0,False


In [12]:
def wordString (row):
    new_lista = [] 
    lista = row['words'] 
    for l in lista:
            palabra = str(l)
            nueva = palabra.lower()
            new_lista.append(palabra)
    row['words'] = new_lista        
    
df_short['words']=df_short.apply (lambda row: wordString(row), axis=1)  
    

In [13]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(df_short['words'].values)

AttributeError: 'NoneType' object has no attribute 'lower'

In [None]:
len(vectorizer.vocabulary_)

In [None]:
X = df_short['words']
y = df_short['type']

In [None]:
# Dividir los datos en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1, stratify=y)

In [None]:
# Divide el entrenamiento en etiquetados y no etiquetados
X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(X_train, y_train, test_size=0.30, random_state=1, stratify=y_train)

In [None]:
# summarize training set size
print('Labeled Train Set:', X_train_lab.shape, y_train_lab.shape)
print('Unlabeled Train Set:', X_test_unlab.shape, y_test_unlab.shape)
# summarize test set size
print('Test Set:', X_test.shape, y_test.shape)

In [None]:
# create the training dataset input
#X_train_mixed = concatenate((X_train_lab, X_test_unlab))

In [None]:
# create "no label" for unlabeled data
nolabel = [-1 for _ in range(len(y_test_unlab))]

In [None]:
# recombine training dataset labels
y_train_mixed = concatenate((y_train_lab, nolabel))

In [None]:
# define model
model = LabelSpreading()

In [None]:
LS = model.fit(X_train, y_train)

In [None]:
df_short['Predicted_label']=LS.transduction_

In [None]:
print(classification_report(df_short['type'], df_short['Predicted_label']))