## Library Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import string
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
import plotly.express as plx
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('Suicide ideation.csv')

In [3]:
# Preview the first ten rows to check the column layout.
data.head(10)

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide
5,11,Honetly idkI dont know what im even doing here...,suicide
6,12,[Trigger warning] Excuse for self inflicted bu...,suicide
7,13,It ends tonight.I can’t do it anymore. \nI quit.,suicide
8,16,"Everyone wants to be ""edgy"" and it's making me...",non-suicide
9,18,My life is over at 20 years oldHello all. I am...,suicide


In [4]:
# Encode the label as an integer: 1 for 'suicide', 0 for any other value.
# A vectorized comparison replaces the per-row lambda; the resulting values are the same.
data['target'] = (data['class'] == 'suicide').astype('int64')
# Rename the 'Unnamed: 0' column to 'number' and overwrite it with an
# ordered sequence 0 .. len(data) - 1.
data = data.rename(columns={'Unnamed: 0': 'number'})
data['number'] = range(len(data))

In [5]:
# Preview the first ten rows again, now including the 'number' and 'target' columns.
data.head(10)

Unnamed: 0,number,text,class,target
0,0,Ex Wife Threatening SuicideRecently I left my ...,suicide,1
1,1,Am I weird I don't get affected by compliments...,non-suicide,0
2,2,Finally 2020 is almost over... So I can never ...,non-suicide,0
3,3,i need helpjust help me im crying so hard,suicide,1
4,4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,1
5,5,Honetly idkI dont know what im even doing here...,suicide,1
6,6,[Trigger warning] Excuse for self inflicted bu...,suicide,1
7,7,It ends tonight.I can’t do it anymore. \nI quit.,suicide,1
8,8,"Everyone wants to be ""edgy"" and it's making me...",non-suicide,0
9,9,My life is over at 20 years oldHello all. I am...,suicide,1


In [6]:
# Count rows per class label to check the class balance.
data['class'].value_counts()

class
suicide        116037
non-suicide    116037
Name: count, dtype: int64

In [7]:
# Distinct class labels, taken from the index of the value counts, as a NumPy array.
data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

## Data Cleansing

In [8]:
# version 2
# Stop-word sets keyed by language, filled on first use so the corpus is
# loaded once instead of once per token.
STOPWORDS_BY_LANGUAGE = {}


def preprocess_data(text):
    """Tokenize one document for use as a vectorizer ``analyzer``.

    Every character found in ``string.punctuation`` is dropped, the remainder
    is split on single spaces, and tokens that are English stop words are
    discarded. Matching is case-sensitive and nothing is lowercased, so the
    returned tokens keep their original casing.

    Args:
        text: The raw document string.

    Returns:
        list[str]: The remaining tokens in document order. Because the split
        is on ' ' only, runs of spaces produce empty-string tokens and other
        whitespace (e.g. newlines) stays inside tokens.
    """
    english_stopwords = STOPWORDS_BY_LANGUAGE.get('english')
    if english_stopwords is None:
        # Previously stopwords.words('english') was evaluated for every token
        # and searched as a list; a cached frozenset gives O(1) lookups.
        english_stopwords = frozenset(stopwords.words('english'))
        STOPWORDS_BY_LANGUAGE['english'] = english_stopwords

    text_joined = ''.join(ch for ch in text if ch not in string.punctuation)
    return [word for word in text_joined.split(' ') if word not in english_stopwords]

### Bag of Words model

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the bag-of-words count matrix. preprocess_data is passed as the
# analyzer, so it performs the tokenization of each raw document.
vectorizer = CountVectorizer(analyzer=preprocess_data)
text_vectorizer = vectorizer.fit_transform(data['text'])
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
print(vectorizer.get_feature_names_out())

In [11]:
# Shape of the bag-of-words matrix: (number of documents, vocabulary size).
text_vectorizer.shape

(232074, 454217)

In [12]:
from sklearn.decomposition import TruncatedSVD

# Number of SVD components to keep
target_dimension = 100

# Reduce the bag-of-words matrix with TruncatedSVD. Its default solver is
# randomized, so the seed is fixed to make the reduced features reproducible.
svd = TruncatedSVD(n_components=target_dimension, random_state=10)
text_vectorizer_reduced = svd.fit_transform(text_vectorizer)

# Print the shape
print(text_vectorizer_reduced.shape)

(232074, 100)


In [13]:
# Check the container type of the reduced bag-of-words features.
type(text_vectorizer_reduced)

numpy.ndarray

In [14]:
# Use the integer 'target' column as the label vector and show its shape.
label = data['target']
label.shape

(232074,)

### TF-IDF

In [None]:
# Create the TF-IDF vectorizer; preprocess_data is passed as the analyzer,
# so it performs the tokenization of each raw document.
vectorizer_tf = TfidfVectorizer(analyzer=preprocess_data)
text_vectorizer_tf = vectorizer_tf.fit_transform(data['text'])
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
print(vectorizer_tf.get_feature_names_out())

In [34]:
# Shape of the TF-IDF matrix: (number of documents, vocabulary size).
text_vectorizer_tf.shape

(232074, 454217)

In [35]:
# Number of SVD components to keep
target_dimension = 100

# Reduce the TF-IDF matrix with TruncatedSVD. Its default solver is
# randomized, so the seed is fixed to make the reduced features reproducible.
# Note: this rebinds `svd`, replacing any model previously stored under that name.
svd = TruncatedSVD(n_components=target_dimension, random_state=10)
text_vectorizer_tf_reduced = svd.fit_transform(text_vectorizer_tf)

# Print the shape
print(text_vectorizer_tf_reduced.shape)

(232074, 100)


## Model training and prediction
    · Bag of Words

In [15]:
# Split the reduced bag-of-words features into train and test sets:
# 20% is held out for testing, and random_state fixes the shuffle.
X = text_vectorizer_reduced
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [20]:
import time
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Instantiate the models. The tree and forest learners are randomized, so
# their seeds are fixed to make the scores reproducible between runs.
LR = LogisticRegression(max_iter=5000)
DTC = DecisionTreeClassifier(random_state=10)
RFC = RandomForestClassifier(random_state=10)
NB = GaussianNB()

# Wall-clock seconds spent in cross_val_score, keyed by model name
fit_times = {}

# Per-fold cross-validation scores, keyed by model name
cv_scores = {}

# Run 5-fold cross-validation once per model instead of repeating the same
# stanza four times. perf_counter() is a monotonic clock, so the measured
# duration is not affected by system clock adjustments.
candidates = {
    'LogisticRegression': LR,
    'DecisionTreeClassifier': DTC,
    'RandomForestClassifier': RFC,
    'GaussianNB': NB,
}
for name, estimator in candidates.items():
    start_time = time.perf_counter()
    cv_scores[name] = cross_val_score(estimator, X_train, y_train, cv=5)
    fit_times[name] = time.perf_counter() - start_time

# Print the fit times and cross-validation scores
for model, time_taken in fit_times.items():
    print(f"{model} took {time_taken:.2f} seconds to fit.")

for model, scores in cv_scores.items():
    print(f"{model} cross-validation scores: {scores}")
    print(f"Mean cross-validation score: {scores.mean():.2f}")

LogisticRegression took 503.68 seconds to fit.
DecisionTreeClassifier took 140.80 seconds to fit.
RandomForestClassifier took 812.34 seconds to fit.
GaussianNB took 1.52 seconds to fit.
LogisticRegression cross-validation scores: [0.86394485 0.86362167 0.85966282 0.86119789 0.86501845]
Mean cross-validation score: 0.86
DecisionTreeClassifier cross-validation scores: [0.77458796 0.7721911  0.7688786  0.77146397 0.774932  ]
Mean cross-validation score: 0.77
RandomForestClassifier cross-validation scores: [0.85446515 0.85260692 0.85287623 0.85430357 0.85852791]
Mean cross-validation score: 0.85
GaussianNB cross-validation scores: [0.51656253 0.51677798 0.51699343 0.51745125 0.51733053]
Mean cross-validation score: 0.52


In [21]:
from sklearn.metrics import accuracy_score, classification_report

# (display name, estimator) pairs to evaluate on the held-out test split
models = [
    ('Logistic Regression', LR),
    ('Decision Tree', DTC),
    ('Random Forest', RFC),
    ('Gaussian NB', NB),
]

for model_name, model in models:
    # fit() returns the estimator itself, so training on the training split
    # and predicting on the test split can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    # Compare the predictions with the true test labels
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Print the evaluation metrics
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(report)
    print("\n")


Model: Logistic Regression
Accuracy: 0.8615749219002478
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87     23209
           1       0.92      0.80      0.85     23206

    accuracy                           0.86     46415
   macro avg       0.87      0.86      0.86     46415
weighted avg       0.87      0.86      0.86     46415



Model: Decision Tree
Accuracy: 0.7746202736184423
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.77     23209
           1       0.77      0.78      0.78     23206

    accuracy                           0.77     46415
   macro avg       0.77      0.77      0.77     46415
weighted avg       0.77      0.77      0.77     46415



Model: Random Forest
Accuracy: 0.8561456425724443
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     23209
        

    · TF-IDF

In [37]:
# Split the reduced TF-IDF features into train and test sets (20% test, fixed seed).
# NOTE(review): this rebinds X_train, X_test, y_train and y_test, replacing the
# bag-of-words split; any cell run after this one sees the TF-IDF data.
X_tf = text_vectorizer_tf_reduced
y_tf = label
X_train, X_test, y_train, y_test = train_test_split(X_tf, y_tf, test_size=0.2, random_state=10)

In [38]:
# Instantiate fresh models for the TF-IDF features. The tree and forest
# learners are randomized, so their seeds are fixed to make the scores
# reproducible between runs.
LR = LogisticRegression(max_iter=5000)
DTC = DecisionTreeClassifier(random_state=10)
RFC = RandomForestClassifier(random_state=10)
NB = GaussianNB()

# Wall-clock seconds spent in cross_val_score, and per-fold scores, keyed by model name
fit_times = {}
cv_scores = {}

# Run 5-fold cross-validation once per model instead of repeating the same
# stanza four times. perf_counter() is a monotonic clock, so the measured
# duration is not affected by system clock adjustments.
candidates = {
    'LogisticRegression': LR,
    'DecisionTreeClassifier': DTC,
    'RandomForestClassifier': RFC,
    'GaussianNB': NB,
}
for name, estimator in candidates.items():
    start_time = time.perf_counter()
    cv_scores[name] = cross_val_score(estimator, X_train, y_train, cv=5)
    fit_times[name] = time.perf_counter() - start_time

# Print the fit times and cross-validation scores
for model, time_taken in fit_times.items():
    print(f"{model} using TF-IDF took {time_taken:.2f} seconds to fit.")

for model, scores in cv_scores.items():
    print(f"{model} using TF-IDF cross-validation scores: {scores}")
    print(f"Mean cross-validation score: {scores.mean():.2f}")

LogisticRegression using TF-IDF took 18.72 seconds to fit.
DecisionTreeClassifier using TF-IDF took 162.93 seconds to fit.
RandomForestClassifier using TF-IDF took 837.37 seconds to fit.
GaussianNB using TF-IDF took 1.49 seconds to fit.
LogisticRegression using TF-IDF cross-validation scores: [0.90450285 0.90412582 0.90264462 0.90571475 0.90643936]
Mean cross-validation score: 0.90
DecisionTreeClassifier using TF-IDF cross-validation scores: [0.81266832 0.81124098 0.80919423 0.81398793 0.8143869 ]
Mean cross-validation score: 0.81
RandomForestClassifier using TF-IDF cross-validation scores: [0.88640526 0.88317354 0.88452009 0.88562426 0.88831435]
Mean cross-validation score: 0.89
GaussianNB using TF-IDF cross-validation scores: [0.7755844  0.77065604 0.76281913 0.76968652 0.77372007]
Mean cross-validation score: 0.77


In [39]:
# (display name, estimator) pairs to evaluate on the held-out test split
models = [
    ('Logistic Regression', LR),
    ('Decision Tree', DTC),
    ('Random Forest', RFC),
    ('Gaussian NB', NB),
]

for model_name, model in models:
    # fit() returns the estimator itself, so training on the training split
    # and predicting on the test split can be chained.
    predictions = model.fit(X_train, y_train).predict(X_test)

    # Compare the predictions with the true test labels
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)

    # Print the evaluation metrics
    print(f"Model: {model_name}")
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(report)
    print("\n")


Model: Logistic Regression
Accuracy: 0.9037595604869115
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91     23209
           1       0.92      0.88      0.90     23206

    accuracy                           0.90     46415
   macro avg       0.90      0.90      0.90     46415
weighted avg       0.90      0.90      0.90     46415



Model: Decision Tree
Accuracy: 0.8162447484649359
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82     23209
           1       0.82      0.82      0.82     23206

    accuracy                           0.82     46415
   macro avg       0.82      0.82      0.82     46415
weighted avg       0.82      0.82      0.82     46415



Model: Random Forest
Accuracy: 0.8868900140040935
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     23209
        