In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
data_cleaned = pd.read_csv("../raw_data/Suicide_Detection_cleaned.csv")
data_cleaned

Unnamed: 0,text_cleaned,target
0,ex wife threatening suicide recently left wife...,1
1,weird get affected compliment coming someone k...,0
2,finally almost never hear bad year ever swear ...,0
3,need helpjust help im cry hard,1
4,lost hello name adam struggling year afraid pa...,1
...,...,...
231559,like rock going get anything go http musictast...,0
231560,tell many friend lonely everything deprived pr...,0
231561,pee probably taste like salty tea someone dran...,0
231562,usual stuff find posting sympathy pity know fa...,1


In [5]:
texts = data_cleaned.text_cleaned.tolist()
texts

['ex wife threatening suicide recently left wife good cheated twice lied much decided refuse go back day ago began threatening suicide tirelessly spent paat day talking keep hesitating want believe come back know lot people threaten order get way happens really supposed handle death hand still love wife deal getting cheated constantly feeling insecure worried today may day hope much happen',
 'weird get affected compliment coming someone know irl feel really good internet stranger',
 'finally almost never hear bad year ever swear fucking god annoying',
 'need helpjust help im cry hard',
 'lost hello name adam struggling year afraid past year thought suicide fear anxiety close limit quiet long scared come family feeling year ago losing aunt triggered everyday feeling hopeless lost guilty remorseful thing done life thought like little experienced life time revealed feeling family broke saw cut watching get worried something portrayed average day made feel absolutely dreadful later found 

In [8]:
tf_idf_vectorizer = TfidfVectorizer(max_features=3000)

# Training it on the texts
weighted_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(texts).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

weighted_words

Unnamed: 0,abandon,abandoned,ability,able,abortion,abroad,absolute,absolutely,abt,abuse,...,younger,youre,youth,youtu,youtube,yr,zero,zombie,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119863,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Feature/Target
X = data_cleaned["text_cleaned"]
y = data_cleaned["target"]

# Pipeline vectorizer + Naive Bayes
pipeline_naive_bayes = make_pipeline(
    TfidfVectorizer(max_features=3000),
    MultinomialNB()
)

# Cross-validation
cv_results = cross_validate(pipeline_naive_bayes,
                            X,
                            y,
                            cv = 5,
                            scoring = ["accuracy"],
                            verbose=2)
cv_results


[CV] END .................................................... total time=  23.0s
[CV] END .................................................... total time=  16.3s
[CV] END .................................................... total time=  16.2s
[CV] END .................................................... total time=  16.1s
[CV] END .................................................... total time=  18.0s


{'fit_time': array([19.75059414, 12.81647301, 12.91301608, 12.79738808, 14.71475291]),
 'score_time': array([3.23596787, 3.46778989, 3.26312995, 3.25895333, 3.29296803]),
 'test_accuracy': array([0.88733185, 0.89158552, 0.88890808, 0.8916287 , 0.89082743])}

In [12]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [27]:
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
    'multinomialnb__alpha': [0.1, 1]
}


In [28]:
gridsearch = GridSearchCV(pipeline_naive_bayes,
                          param_grid = param_grid,
                          scoring = "accuracy",
                          cv = 5,
                          n_jobs=-1,
                          verbose=2
                          )

In [29]:
gridsearch.fit(X_train,y_train)

# Best score
print(f"Best Score = {gridsearch.best_score_}")

# Best params
print(f"Best params = {gridsearch.best_params_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 1); total time=  21.9s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 1); total time=  22.0s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 1); total time=  22.2s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 1); total time=  22.4s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 1); total time=  20.9s
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 2); total time= 1.1min
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer__max_features=3000, tfidfvectorizer__ngram_range=(1, 2); total time= 1.1min
[CV] END multinomialnb__alpha=0.1, tfidfvectorizer_