<a href="https://colab.research.google.com/github/BaglanCV/Predicting-Suicide-Tendency-in-Twitter-Data/blob/main/Predicting_Suicide_Tendency_in_Twitter_Data_with_TFIDF_and_Sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-suicidal-intention-dataset/master/twitter-suicidal_data.csv')
df

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1
...,...,...
9114,have you ever laid on your bed at night and cr...,1
9115,the fault the blame the pain s still there i m...,1
9116,stop asking me to trust you when i m still cou...,1
9117,i never know how to handle sadness crying make...,1


In [3]:
df['intention'].value_counts()

0    5121
1    3998
Name: intention, dtype: int64

In [4]:
df['tweet'][7878]

'don t want to live without teeth don t want to die without bite i never want to say that i regret it'

#Data Preprocessing

In [5]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-wooe0v3m
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-wooe0v3m
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-cp37-none-any.whl size=11743 sha256=85892b2ccea94d435ee9529c2d0bec0e9ff07e2555819dbb028db78b1f4b01ed
  Stored in directory: /tmp/pip-ephem-wheel-cache-6u6jx0bb/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
  Found existing installation: preprocess-kgptalkie 0.1.3
    Uninstalling preprocess-kgptalkie-0.1.3:
      Successfully uninstalled preprocess-kgptalkie-0.1.3
Successfully installed prep

In [6]:
import preprocess_kgptalkie as ps

In [7]:
import re

In [8]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_html_tags(x)
    x = ps.remove_rt(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x
    #x = 'mmmmiiissssss youuuu'
    #x = re.sub("(.)\\1{2,}", "\\1", x)
    #print(x)
    ---
    #miss you

In [9]:
df['tweet'] = df['tweet'].apply(lambda x:get_clean(x))

In [10]:
df.head()

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1


#TFIDF, TRAIN_TEST_SPLIT

In [11]:
import sklearn as sk

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [13]:
tfidf = TfidfVectorizer(max_features = 20000, ngram_range=(1,3), analyzer= 'char') #we'll take into account only top 20000 dictionary words(max_features) 
#ngram_range is unigram, biogram and triagram, it means there are a single word, a combination of two and three words
#analyzer= 'char' -a tokenization of the text data is done character by character, it gives better result than a word analyzer

In [14]:
X = tfidf.fit_transform(df['tweet'])
y = df['intention']

In [15]:
X.shape #9119 raws and each raw has total number of 10404 features

(9119, 10404)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state =0)


In [17]:
clf = LinearSVC()
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [18]:
y_pred = clf.predict(X_test)

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.93      1060
           1       0.91      0.91      0.91       764

    accuracy                           0.92      1824
   macro avg       0.92      0.92      0.92      1824
weighted avg       0.92      0.92      0.92      1824



In [20]:
x = 'no one Cares About me, I will die soon'
x = get_clean(x)
tsl = tfidf.transform([x])
clf.predict(tsl)

array([1])

In [21]:
x = 'I am the happiest person in the world'
x = get_clean(x)
tsl = tfidf.transform([x])
clf.predict(tsl)

array([0])

#Well done!!! You did it))))