In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Suicide_Detection.csv', index_col="Unnamed: 0")
df

Unnamed: 0,text,class
2,Ex Wife Threatening SuicideRecently I left my ...,suicide
3,Am I weird I don't get affected by compliments...,non-suicide
4,Finally 2020 is almost over... So I can never ...,non-suicide
8,i need helpjust help me im crying so hard,suicide
9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
...,...,...
348103,If you don't like rock then your not going to ...,non-suicide
348106,You how you can tell i have so many friends an...,non-suicide
348107,pee probably tastes like salty tea😏💦‼️ can som...,non-suicide
348108,The usual stuff you find hereI'm not posting t...,suicide


In [3]:
from sklearn.preprocessing import OneHotEncoder

In [4]:
text = pd.Series(df['text'])
text

2         Ex Wife Threatening SuicideRecently I left my ...
3         Am I weird I don't get affected by compliments...
4         Finally 2020 is almost over... So I can never ...
8                 i need helpjust help me im crying so hard
9         I’m so lostHello, my name is Adam (16) and I’v...
                                ...                        
348103    If you don't like rock then your not going to ...
348106    You how you can tell i have so many friends an...
348107    pee probably tastes like salty tea😏💦‼️ can som...
348108    The usual stuff you find hereI'm not posting t...
348110    I still haven't beaten the first boss in Hollo...
Name: text, Length: 232074, dtype: object

In [5]:
suicide = pd.Series(pd.get_dummies(df['class'], drop_first=True)['suicide'])
suicide

2         1
3         0
4         0
8         1
9         1
         ..
348103    0
348106    0
348107    0
348108    1
348110    0
Name: suicide, Length: 232074, dtype: uint8

In [6]:
df = { 'Text': text, 'suicide': suicide }
df = pd.DataFrame(df)
df = df.reset_index().drop('index', axis = 1)
df

Unnamed: 0,Text,suicide
0,Ex Wife Threatening SuicideRecently I left my ...,1
1,Am I weird I don't get affected by compliments...,0
2,Finally 2020 is almost over... So I can never ...,0
3,i need helpjust help me im crying so hard,1
4,"I’m so lostHello, my name is Adam (16) and I’v...",1
...,...,...
232069,If you don't like rock then your not going to ...,0
232070,You how you can tell i have so many friends an...,0
232071,pee probably tastes like salty tea😏💦‼️ can som...,0
232072,The usual stuff you find hereI'm not posting t...,1


In [7]:
train_data, test_data = train_test_split(df, test_size=0.25, random_state=10)

In [8]:
train_data.shape, test_data.shape

((174055, 2), (58019, 2))

In [9]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/agoyal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/agoyal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/agoyal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/agoyal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
lemmatizer = WordNetLemmatizer()
stop = stopwords.words("english")
punctuations = list(string.punctuation)
stop += punctuations

In [12]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 

In [13]:
def clean(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1])).lower()
            output_words.append(clean_word)
    return output_words

In [14]:
Xtrain_text = np.array(train_data["Text"].astype(str))

In [15]:
Xtrain_text = [clean(word_tokenize(words)) for words in Xtrain_text]

In [16]:
Xtrain_text = [" ".join(words) for words in Xtrain_text]

In [48]:
count_vec = CountVectorizer(max_features = 2000)
temp = count_vec.fit_transform(Xtrain_text)

In [49]:
X_train_features = temp.todense()

In [40]:
Xtest_text = np.array(test_data["Text"].astype(str))

In [41]:
Xtest_text = [clean(word_tokenize(words)) for words in Xtest_text]
Xtest_text = [" ".join(words) for words in Xtest_text]

In [50]:
X_test_features = count_vec.transform(Xtest_text).todense()

In [43]:
X_test_features

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [44]:
np.array(test_data["suicide"])

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [52]:
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score

In [46]:
clf = rf()

In [51]:
clf.fit(X_train_features, train_data['suicide'])

RandomForestClassifier()

In [53]:
y = clf.predict(X_test_features)

In [55]:
accuracy_score(test_data['suicide'], y)

0.895068856753822