In [9]:
import nltk
import pandas as pd
import numpy as np
import re
import emoji

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn import metrics
nltk.download('stopwords')
nltk.download('wordnet')

# Dataset link: https://github.com/ayaanzhaque/SDCNL/tree/main/data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anca.ilicea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anca.ilicea\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
print(train.shape)
print(test.shape)
df_train = train[["selftext", "is_suicide"]]
df_test = test[["selftext", "is_suicide"]]

(1516, 13)
(379, 13)


In [11]:
def remove_stopwords(post):
    stop_words_nltk = set(stopwords.words('english'))
    processed = [word for word in post if not word in stop_words_nltk]

    return processed

In [12]:
def lemmatize(post):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in post]

In [13]:
def preprocess(posts):
    prepocessed_posts = []
    for post in posts:
        post = post.lower()
        post = post.replace("\n", "")
        post = re.sub(r'@\w+ ?', '', post)
        post = re.sub(r'#\w+ ?', '', post)
        post = re.sub(r'http\S+', '', post)
        post = re.sub(r'\d+', '', post)
        post = re.sub(r'[^\w\s]',' ', post)
        post = emoji.get_emoji_regexp().sub(u'', post)

        post = post.split()
        post = remove_stopwords(post)
        post = lemmatize(post)
        
        prepocessed_posts.append(post)
    return prepocessed_posts

preprocessed_train = preprocess(df_train["selftext"])
preprocessed_test = preprocess(df_test["selftext"])

In [14]:
vectorizer = CountVectorizer(lowercase = False)
print(df_train["selftext"])
vect = vectorizer.fit(df_train["selftext"])

bow_train_raw = vect.transform(df_train["selftext"])
bow_test_raw = vect.transform(df_test["selftext"])
vocabulary = vectorizer.get_feature_names()

print('Size of vocabulary for raw train data:', len(vocabulary))

0       Hi I don't really know how to phrase this situ...
1       i have been so depressed these past couple wee...
2       Hi..I don't know where else to go. I am devast...
3       The shit like “it will get better, everyone is...
4       I wish I was prettier. I wish I didn’t feel li...
                              ...                        
1511    Every night it’s “you guys want ice cream?” Af...
1512    I would like to say that I was shook, but I kn...
1513    I can't take this anymore. I've been wanting t...
1514                                            emptypost
1515    I feel like people are controlling every aspec...
Name: selftext, Length: 1516, dtype: object
Size of vocabulary for raw train data: 11287




In [15]:
str_train = [" ".join(text) for text in preprocessed_train]
str_test = [" ".join(text) for text in preprocessed_test]

In [16]:
vectorizer = CountVectorizer(lowercase = True)

vect = vectorizer.fit(str_train)

bow_train = vect.transform(str_train)
bow_test = vect.transform(str_test)

vocabulary = vectorizer.get_feature_names_out()

print('Size of vocabulary for processed train data:', len(vocabulary))

Size of vocabulary for processed train data: 8683


In [23]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(bow_train_raw, df_train["is_suicide"])
pred_labels = neigh.predict(bow_test_raw)
print('Result without processing text:', metrics.accuracy_score(df_test["is_suicide"], pred_labels))

Result without processing text: 0.5197889182058048


In [24]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(bow_train, df_train["is_suicide"])
pred_labels = neigh.predict(bow_test)
print('Result without processing text:', metrics.accuracy_score(df_test["is_suicide"], pred_labels))

Result without processing text: 0.5620052770448549


In [None]:
bestC = 1.0

clf = SVC(kernel='linear', C = bestC)
clf.fit(bow_train_raw, df_train["is_suicide"])

pred_labels = clf.predict(bow_test_raw)
print('Result without processing text:', metrics.accuracy_score(df_test["is_suicide"], pred_labels))

Result without processing text: 0.6437994722955145


In [None]:
bestC = 1.0

clf = SVC(kernel='linear', C = bestC)
clf.fit(bow_train, df_train["is_suicide"])

pred_labels = clf.predict(bow_test)
print('Result with processing text:', metrics.accuracy_score(df_test["is_suicide"], pred_labels))

Result with processing text: 0.6649076517150396
