# Proyecto 1

In [67]:
# librería Natural Language Toolkit, usada para trabajar con textos 
import nltk
# Punkt permite separar un texto en frases.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [68]:
# Descarga todas las palabras vacias, es decir, aquellas que no aportan nada al significado del texto

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [69]:
# Descarga de paquete WordNetLemmatizer, este es usado para encontrar el lema de cada palabra
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Importaciones 

In [70]:
 # Instalación de librerias
import pandas as pd
import numpy as np
import sys
from markupsafe import escape
#from pandas_profiling import ProfileReport 

import re, string, unicodedata
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve
from sklearn.base import BaseEstimator, ClassifierMixin

import matplotlib.pyplot as plt

# Carga de datos

In [71]:
data=pd.read_csv('SuicidiosProyecto.csv', sep=',', encoding = 'utf-8')
data_s=data

In [72]:
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,173271,i want to destroy myselffor once everything wa...,suicide
1,336321,I kinda got behind schedule with learning for ...,non-suicide
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide
3,303772,please give me a reason to liveThats too much ...,suicide
4,293747,27f struggling to find meaning moving forwardI...,suicide


In [73]:
data_s['class'].value_counts()

non-suicide    110165
suicide         85535
Name: class, dtype: int64

# Limpieza de datos

### Remover puntuacion

In [74]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
data_s['words']= data_s['text'].apply(lambda x:remove_punctuation(x))
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,i want to destroy myselffor once everything wa...
1,336321,I kinda got behind schedule with learning for ...,non-suicide,I kinda got behind schedule with learning for ...
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,Im just not sure anymoreFirst and foremost Im ...
3,303772,please give me a reason to liveThats too much ...,suicide,please give me a reason to liveThats too much ...
4,293747,27f struggling to find meaning moving forwardI...,suicide,27f struggling to find meaning moving forwardI...


### Transformar todo a minusculas

In [75]:
data_s['words']= data_s['words'].apply(lambda x: x.lower())
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,i want to destroy myselffor once everything wa...
1,336321,I kinda got behind schedule with learning for ...,non-suicide,i kinda got behind schedule with learning for ...
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,im just not sure anymorefirst and foremost im ...
3,303772,please give me a reason to liveThats too much ...,suicide,please give me a reason to livethats too much ...
4,293747,27f struggling to find meaning moving forwardI...,suicide,27f struggling to find meaning moving forwardi...


### Tokenizar

In [76]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split(' ',text)
    return tokens
#applying function to the column
data_s['words']= data_s['words'].apply(lambda x: tokenization(x))
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,"[i, want, to, destroy, myselffor, once, everyt..."
1,336321,I kinda got behind schedule with learning for ...,non-suicide,"[i, kinda, got, behind, schedule, with, learni..."
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,"[im, just, not, sure, anymorefirst, and, forem..."
3,303772,please give me a reason to liveThats too much ...,suicide,"[please, give, me, a, reason, to, livethats, t..."
4,293747,27f struggling to find meaning moving forwardI...,suicide,"[27f, struggling, to, find, meaning, moving, f..."


### Remover stopwords

In [77]:
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
data_s['words']= data_s['words'].apply(lambda x: remove_stopwords(x))
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,"[want, destroy, myselffor, everything, startin..."
1,336321,I kinda got behind schedule with learning for ...,non-suicide,"[kinda, got, behind, schedule, learning, next,..."
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,"[im, sure, anymorefirst, foremost, im, brazil,..."
3,303772,please give me a reason to liveThats too much ...,suicide,"[please, give, reason, livethats, much, dont, ..."
4,293747,27f struggling to find meaning moving forwardI...,suicide,"[27f, struggling, find, meaning, moving, forwa..."


### Remover non-ASCII

In [78]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words
data_s['words']= data_s['words'].apply(lambda x: remove_non_ascii(x))
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,"[want, destroy, myselffor, everything, startin..."
1,336321,I kinda got behind schedule with learning for ...,non-suicide,"[kinda, got, behind, schedule, learning, next,..."
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,"[im, sure, anymorefirst, foremost, im, brazil,..."
3,303772,please give me a reason to liveThats too much ...,suicide,"[please, give, reason, livethats, much, dont, ..."
4,293747,27f struggling to find meaning moving forwardI...,suicide,"[27f, struggling, find, meaning, moving, forwa..."


### Stemming

In [79]:
porter_stemmer = PorterStemmer()
#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text

data_s['words']=data_s['words'].apply(lambda x: stemming(x))

### Lemmatization

In [80]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

data_s['words']=data_s['words'].apply(lambda x:lemmatizer(x))

In [81]:
data_s.head()

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,"[want, destroy, myselffor, everyth, start, fee..."
1,336321,I kinda got behind schedule with learning for ...,non-suicide,"[kinda, got, behind, schedul, learn, next, wee..."
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,"[im, sure, anymorefirst, foremost, im, brazil,..."
3,303772,please give me a reason to liveThats too much ...,suicide,"[plea, give, reason, livethat, much, dont, rea..."
4,293747,27f struggling to find meaning moving forwardI...,suicide,"[27f, struggl, find, mean, move, forwardi, adm..."


In [82]:
data_s.shape

(195700, 4)

# Prueba

In [83]:
word2count = {}
def bag_of_words(words):
    for word in words:
        if word == "":
            pass
        elif word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [84]:
data_s['words'].apply(lambda x:bag_of_words(x))

0         None
1         None
2         None
3         None
4         None
          ... 
195695    None
195696    None
195697    None
195698    None
195699    None
Name: words, Length: 195700, dtype: object

In [85]:
print(word2count)



In [90]:
import heapq
freq_words = heapq.nlargest(100, word2count, key=word2count.get)

In [91]:
print(freq_words)

['', 'im', 'dont', 'like', 'want', 'feel', 'get', 'know', 'go', 'life', 'time', 'cant', 'ive', 'think', 'fuck', 'peopl', 'one', 'friend', 'even', 'year', 'realli', 'would', 'make', 'tri', 'thing', 'day', 'filler', 'help', 'live', 'never', 'talk', 'much', 'say', 'suicid', 'love', 'need', 'kill', 'end', 'see', 'got', 'die', 'work', 'thought', 'way', 'school', 'good', 'someon', 'start', 'take', 'could', 'anyth', 'back', 'still', 'someth', 'tell', 'hate', 'didnt', 'anymor', 'depress', 'care', 'alway', 'anyon', 'everyth', 'famili', 'better', 'person', 'noth', 'ill', 'everi', 'look', 'right', 'shit', 'keep', 'post', 'ever', 'last', 'everyon', 'parent', 'it', 'come', 'happi', 'stop', 'bad', 'mom', 'give', 'sinc', 'told', 'ask', 'month', 'girl', 'said', 'that', 'find', 'guy', 'use', 'job', 'point', 'made', 'also', 'doesnt']


### Selección de campos

In [86]:
#data_s['words'] = data_s['words'].apply(lambda x: ' '.join(map(str, x)))
#data_s

Unnamed: 0.1,Unnamed: 0,text,class,words
0,173271,i want to destroy myselffor once everything wa...,suicide,want destroy myselffor everyth start feel okay...
1,336321,I kinda got behind schedule with learning for ...,non-suicide,kinda got behind schedul learn next week testw...
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,im sure anymorefirst foremost im brazil dont j...
3,303772,please give me a reason to liveThats too much ...,suicide,plea give reason livethat much dont reason liv...
4,293747,27f struggling to find meaning moving forwardI...,suicide,27f struggl find mean move forwardi admit bit ...
...,...,...,...,...
195695,248038,Drop some cool new cereal ideas Like what woul...,non-suicide,drop cool new cereal idea like would ideal cereal
195696,216516,Unpopular opinion but cats deserve love and re...,non-suicide,unpopular opinion cat deserv love respect much...
195697,199341,Hey guys :) How yall doin?,non-suicide,hey guy yall doin
195698,145373,uhm I covered my dog in a blanket because the ...,non-suicide,uhm cover dog blanket light wont wake woke ran...


In [87]:
X_data, y_data = data_s['words'],data_s['class']
y_data = (y_data == 'suicide').astype(int)
y_data

0         1
1         0
2         1
3         1
4         1
         ..
195695    0
195696    0
195697    0
195698    0
195699    1
Name: class, Length: 195700, dtype: int32

In [88]:
dummy = CountVectorizer(binary=True)
X_dummy = dummy.fit_transform(X_data)
print(X_dummy.shape)
#X_dummy.toarray()[0]

(195700, 168218)


In [89]:
count = CountVectorizer()
X_count = count.fit_transform(X_data)
print(X_count.shape)

(195700, 168218)
