# 📌 1. Data import

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv("../data/Suicide_Detection.csv")
df = df.drop(columns=["Unnamed: 0"])
df = df.rename(columns={"text": "Text", "class": "Classification"})
df

Unnamed: 0,Text,Classification
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
...,...,...
232069,If you don't like rock then your not going to ...,non-suicide
232070,You how you can tell i have so many friends an...,non-suicide
232071,pee probably tastes like salty tea😏💦‼️ can som...,non-suicide
232072,The usual stuff you find hereI'm not posting t...,suicide


# 📌 2. Data check

In [2]:
df.isnull().sum()

Text              0
Classification    0
dtype: int64

In [3]:
df.Classification.value_counts()

Classification
suicide        116037
non-suicide    116037
Name: count, dtype: int64

# 📌 3. Processing data

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df.Classification = le.fit_transform(df.Classification)
df.Classification

0         1
1         0
2         0
3         1
4         1
         ..
232069    0
232070    0
232071    0
232072    1
232073    0
Name: Classification, Length: 232074, dtype: int64

# 📌 4.TfidfVectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from joblib import dump, load


df = df.dropna()
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(df["Text"])
vocabulary = vectorizer.vocabulary_
y = df.Classification
X = X.toarray()

# 📌 5.Split data

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


# 📌 6. Model & Prediction

In [7]:
import tensorflow as tf
from sklearn.metrics import accuracy_score


model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(128, activation="gelu", input_shape=(X.shape[1],)),
        tf.keras.layers.Dense(64, activation="gelu"),
        tf.keras.layers.Dense(
            1, activation="sigmoid"
        ), # sigmoid because we have only 2 classes
    ]
)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

model.fit(X_train, y_train, epochs=5, batch_size=32, shuffle=True)
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)
score = accuracy_score(y_pred, y_test)
print(f"--------------------------------------\nAccuracy Score: {score:.2f}")

2023-12-15 11:47:03.460035: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--------------------------------------
Accuracy Score: 0.94


# 📌 7. Save Model

In [8]:
import pickle

pickle.dump(model, open("../model/model.pkl", "wb"))
pickle.dump(vocabulary, open("../model/vectorizer_obj.pkl", "wb"))

# 📌 8. Test Model

In [9]:
THRESHOLD = 0.75

In [10]:
from scipy.sparse import spmatrix

# Load the classifier and vectorizer
classifier = pickle.load(open("../model/model.pkl", "rb"))
vocabulary = pickle.load(open("../model/vectorizer_obj.pkl", "rb"))

vectorizer2 = TfidfVectorizer(vocabulary=vocabulary)

# Define new input
new_input = [
    "I hate my life lmao I hope I die soon or sumn I'm too tired of everything",
    "i wish i wasn’t to lazy for suicide.i can’t handle this life anymore and i want it to end. but that requires the energy i don’t have.. i don’t care.. if laziness could kill me i’d be happy. anyways, i have nothing else to say.",
    "I love dogs, dogs are really nice animals.",
]

# Transform the new input using the loaded vectorizer
to_pred_dense = vectorizer2.fit_transform(new_input).toarray()


# Make predictions on the new input
new_input_pred = classifier.predict(to_pred_dense)

new_input_pred = np.round(new_input_pred)
print("Prediction for the new input:", new_input_pred)

Prediction for the new input: [[1.]
 [1.]
 [0.]]


In [11]:
import re

df_covid = pd.read_csv("../data/covid19_tweets.csv").dropna()

# Assuming you have already loaded the DataFrame df_covid
new_input = df_covid["text"]

# Remove Emails
data = [re.sub("\S*@\S*\s?", "", sent) for sent in new_input]

# Remove Links
data = [re.sub("https?://\S+|www\.\S+", "", sent) for sent in data]

# Remove Hashtags
data = [re.sub("/#\w+\s*/", "", sent) for sent in data]

# Remove new line characters
data = [re.sub("\s+", " ", sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

# Transform the new input using the loaded vectorizer
to_pred_dense = vectorizer2.transform(data).toarray()

# Make predictions on the new input
new_input_pred = classifier.predict(to_pred_dense)


# Add the predictions as a new column to the DataFrame
df_covid["prediction"] = new_input_pred

# Display only rows where the prediction is equal to 1
filtered_df = df_covid[df_covid["prediction"] > THRESHOLD][["text", "prediction"]]

# Print the filtered DataFrame
display([s for s in filtered_df["text"].values if len(s) >= 50])

# Print the number of tweets that were filtered
print("Number of tweets filtered:", len(filtered_df))

KeyboardInterrupt: 

In [None]:
df_covid = pd.read_csv("../data/captions.csv").dropna()

# Assuming you have already loaded the DataFrame df_covid
new_input = df_covid["Caption"]

# Remove Emails
data = [re.sub("\S*@\S*\s?", "", sent) for sent in new_input]

# Remove Links
data = [re.sub("https?://\S+|www\.\S+", "", sent) for sent in data]

# Remove Hashtags
data = [re.sub("/#\w+\s*/", "", sent) for sent in data]

# Remove new line characters
data = [re.sub("\s+", " ", sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

# Transform the new input using the loaded vectorizer
to_pred_dense = vectorizer2.transform(data).toarray()

# Make predictions on the new input
new_input_pred = classifier.predict(to_pred_dense)
new_input_pred = np.round(new_input_pred)

# Add the predictions as a new column to the DataFrame
df_covid["prediction"] = new_input_pred

# Display only rows where the prediction is equal to 1
filtered_df = df_covid[df_covid["prediction"] > THRESHOLD][["Caption", "prediction"]]

# Print the filtered DataFrame
display([s for s in filtered_df["Caption"].values if len(s) >= 50])

# Print the number of tweets that were filtered
print("Number of tweets filtered:", len(filtered_df))



['After a whole summer of being roommates, Anna, Fox and Freddy have gone home and I’m missing them already. I get to live life with the best people. I’m pretty freaking lucky. ',
 'Kids in cages! Sleeping on concrete floors with aluminum blankets! No access to simple dignities! How is this still happening??? It’s absolutely inhumane to treat anyone like this let alone children. I can’t even imagine what they are going through. We need to get this to finally stop! Don’t stay silent on this human rights issue- please call your reps 202.224.3121 ',
 'To see what is happening right now in Alabama, Georgia, Mississippi and several other states in our country is not only deeply upsetting but seems that it can’t possibly be real in 2019. It’s no one’s business what a woman chooses to do with her body. End of story. If you want to help fight this, click on the link in my bio to find out how you can volunteer or make a donation. ',
 'Lots to look forward to in 2019. I can’t wait to share the p

Number of tweets filtered: 1082


In [None]:
df_covid = pd.read_excel(
    "../data/Depression  Anxiety Facebook page Comments Text.xlsx"
).dropna()

# Assuming you have already loaded the DataFrame df_covid
new_input = df_covid["Comments Text"]

# Transform the new input using the loaded vectorizer
to_pred_dense = vectorizer2.transform(new_input).toarray()

# Make predictions on the new input
new_input_pred = classifier.predict(to_pred_dense)


# Add the predictions as a new column to the DataFrame
df_covid["prediction"] = new_input_pred


# Display only rows where the prediction is equal to 1
filtered_df = df_covid[df_covid["prediction"] > THRESHOLD][
    ["Comments Text", "prediction"]
]

# Print the filtered DataFrame
display([s for s in filtered_df["Comments Text"].values if len(s) >= 50])

# Print the number of tweets that were filtered
print("Number of tweets filtered:", len(filtered_df))



['In addition, people with BPD quite often have a weak self concept. They don\'t always know who they really are deep down inside. They may always feel like there\'s something missing or something wrong with them intrinsically. This can cause periods of dissociation and difficulty in maintaining a healthy sense of self esteem. They may have a weak value system (or one that is missing entirely) or find that they have a difficult time adhering to whatever value system they do have. Since there\'s a real issue with trusting and black/white thinking, they don\'t always trust in their own decisions and may do things to get people to "like" them or not leave them that would be against their value system normally, and think that they (themselves) are terrible/bad/evil people for doing it later. They often take their mistakes in life much more seriously than other people. A "normal" person may make the same mistake and feel bad about it, but it\'s unlikely that they will see themselves as terr

Number of tweets filtered: 2053


In [None]:
filtered_df = df_covid[df_covid["prediction"] < THRESHOLD][
    ["Comments Text", "prediction"]
]
display([s for s in filtered_df["Comments Text"].values if len(s) >= 50])

['So, when you ask what the two illnesses are...they\'re similar in that they tend to have moodiness involved, impulsivity and self-damaging behaviors for compensatory measures. Otherwise, they\'re not really that similar, but they do seem to "like" one another and often show up in the same person...making their lives chaotic, difficult, dysfunctional, and intense. \n\nSo, if your friend is concerned that they may have one (or both) of these illnesses, they should not be ashamed. It\'s definitely not their fault. They should, instead, get help from a professional...and INSIST that they are treated appropriately with the correct types of therapy and medication.',
 'Borderline Personality Disorder, like all other personality disorders, is an abnormal way of perceiving and interacting with the world. Think of it as having a dirty filter that the sufferer sees the world through constantly. We do not believe it is caused by chemical imbalances in the brain, but it\'s possible that some of t

In [None]:
{
    "text_list": [
        "I hate my life lmao I hope I die soon or sumn I'm too tired of everything",
        "i wish i wasn’t to lazy for suicide.i can’t handle this life anymore and i want it to end. but that requires the energy i don’t have.. i don’t care.. if laziness could kill me i’d be happy. anyways, i have nothing else to say.",
        "I love dogs, dogs are really nice animals.",
    ],
    "threshold": 0.7,
}