In [1]:
from typing import List, Tuple, Callable, Dict
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import torch.autograd
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, classification_report, confusion_matrix)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
import contractions
import unicodedata
from bs4 import BeautifulSoup
import emoji
import re
import pickle
import os
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

In [2]:
dataset_path = '../data/raw/suicidal_detection/Suicide_Detection.csv'

In [3]:
def lower_sentence(sentence: str) -> str:
    '''
    Lowercase the sentence.
    :param data: The sentence to lowercase.
    :return: The lowercased sentence
    :rtype: str
    '''
    return sentence.lower()

In [4]:
def remove_emails(sentence: str) -> str:
    '''
    Remove emails from the sentence.
    :param sentence: The sentence to remove emails from.
    :type sentence: str
    :return: The sentence without emails.
    :rtype: str
    '''
    return re.sub(r"\S*@\S*\s?", "", sentence)

In [5]:
def remove_nonascii_diacritic(sentence: str) -> str:
    '''

    Remove diacritics from the sentence.

    :param sentence: The sentence to remove diacritics from.

    :type sentence: str

    :return: The sentence without diacritics.

    :rtype: str
    '''

    return unicodedata.normalize("NFKD", sentence).encode("ascii", "ignore").decode("utf-8", "ignore")

In [6]:
def clean_html(sentence: str) -> str:
    '''
    Remove HTML tags from the sentence.
    :param sentence: The sentence to remove HTML tags from.
    :type sentence: str
    :return: The sentence without HTML tags.
    :rtype: str
    '''
    return BeautifulSoup(sentence, "html.parser").get_text()

In [7]:
def replace_repeated_chars(sentence: str) -> str:
    '''
    Replace repeated characters in the sentence.
    :param sentence: The sentence to replace repeated characters in.
    :type sentence: str
    :return: The sentence with replaced repeated characters.
    :rtype: str
    '''
    # Replace consecutive occurrences of ',', '!', '.', and '?' with a single occurrence
    return re.sub(r'([,!?.])\1+', r'\1', sentence)

In [8]:
def translate_emojis_to_text(sentence: str) -> str:
    '''
    Translate emojis in the sentence to text.
    :param sentence: The sentence to translate emojis to text.
    :type sentence: str
    :return: The sentence with translated emojis to text.
    :rtype: str
    '''
    # Translate emojis to text codes
    translated_text = emoji.demojize(sentence)
    # Remove colons from the translated text
    translated_text = re.sub(r':', '', translated_text)
    return translated_text

In [9]:
def expand_sentence(sentence: str) -> str:
    '''
    Expand the contractions in the sentence.
    :param sentence: The sentence to expand contractions in.
    :type sentence: str
    :return: The sentence with expanded contractions.
    :rtype: str
    '''
    return contractions.fix(sentence)

In [10]:
def remove_url(sentence: str) -> str:
    '''
    Remove URLs from the sentence.
    :param sentence: The sentence to remove URLs from.
    :type sentence: str
    :return: The sentence without URLs.
    :rtype: str
    '''
    return re.sub("((http\://|https\://|ftp\://)|(www.))+(([a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(/[a-zA-Z0-9%:/-_\?\.'~]*)?", '', sentence)

In [11]:
def remove_possessives(sentence: str) -> str:
    '''
    Strip possessives from the sentence.
    :param sentence: The sentence to strip possessives from.
    :type sentence: str
    :return: The sentence without possessives.
    :rtype: str
    '''
    # Stripping the possessives
    sentence = sentence.replace("'s", '')
    sentence = sentence.replace('’s', '')
    sentence = sentence.replace('s’', 's')
    sentence = sentence.replace("s'", 's')
    return sentence

In [12]:
def remove_extra_space(sentence: str) -> str:
    '''
    Remove extra spaces from the sentence.
    :param sentence: The sentence to remove extra spaces from.
    :type sentence: str
    :return: The sentence without extra spaces.
    :rtype: str
    '''
    return re.sub(r'\s+', ' ', sentence).strip()

In [13]:
def tokenize_sentence(sentence: str) -> list[str]:
    '''
    Tokenize the sentence.
    :param sentence: The sentence to tokenize.
    :type sentence: str
    :return: The tokenized sentence.
    :rtype: str
    '''
    return nltk.word_tokenize(sentence)

In [14]:
# import nltk
# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
def remove_stop_words(sentence: list[str]) -> list[str]:
    '''
    Remove stop words from the sentence.
    :param sentence: The sentence to remove stop words from.
    :type sentence: list[str]
    :return: The sentence without stop words.
    :rtype: list[str]
    '''
    return [word for word in sentence if word not in stop_words]

In [15]:
lemmatizer = WordNetLemmatizer()
def lemm_sentence(sentence: list[str]) -> list[str]:
    '''
    Lemmatize the sentence.
    :param sentence: The sentence to lemmatize.
    :type sentence: list[str]
    :return: The lemmatized sentence.
    :rtype: list[str]
    '''
    # Perform POS tagging
    pos_tags = pos_tag(sentence)
    # Lemmatize each word based on its POS tag
    lemmatized_words = []
    for word, pos in pos_tags:
        # Map Penn Treebank POS tags to WordNet POS tags
        if pos.startswith('N'):  # Nouns
            pos = 'n'
        elif pos.startswith('V'):  # Verbs
            pos = 'v'
        elif pos.startswith('J'):  # Adjectives
            pos = 'a'
        elif pos.startswith('R'):  # Adverbs
            pos = 'r'
        else:
            pos = 'n'  # Default to noun if POS tag not found

        # Lemmatize the word using the appropriate POS tag
        lemma = lemmatizer.lemmatize(word, pos=pos)
        lemmatized_words.append(lemma)
    return lemmatized_words

In [16]:
def clean_train(line: str) -> list[str]:
    '''
    Clean the line and return it as a list of tokens
    :param line: the line to clean
    :type line: str
    :return: the cleaned line as a list of tokens
    :rtype: list
    '''
    # translate emojis
    line = translate_emojis_to_text(line)
    # lower the line
    line = lower_sentence(line)
    # remove non ascii
    line = remove_nonascii_diacritic(line)
    # remove emails
    line = remove_emails(line)
    # remove html
    line = clean_html(line)
    # remove urls
    line = remove_url(line)
    # replace repeated chars
    line = replace_repeated_chars(line)
    # expand
    line = expand_sentence(line)
    # remove possessives
    line = remove_possessives(line)
    # remove extra spaces
    line = remove_extra_space(line)
    # tekonize
    line = tokenize_sentence(line)
    # remove stopwords
    line = remove_stop_words(line)
    # lemmetization
    line = lemm_sentence(line)
    if len(line) == 0:
        return ['Normal']
    return line

In [17]:
# define the mapping from 's' to 1 and 'u' to 0
forward_label_mapping = {'suicide': 1, 'non-suicide': 0}

# define the reverse mapping from 0 to 'u' and 1 to 's'
reverse_label_mapping = {0: 'non-suicide', 1: 'suicide'}

In [18]:
# Read CSV file into DataFrame
df = pd.read_csv(dataset_path)
# Count unique values in each column
unique_counts = df.nunique()
# Display the count of unique values
print('Number of Unique Values:')
print(unique_counts)
# print the unique values of emotion
print('unique target classes', df['class'].unique())
# get the number of dialogue
num_Dialogues = len(df['class'])
# print the number of dialogues
print('Number of Dialogues: ', num_Dialogues)
print('First 5 elements: ')
print(df.head())

Number of Unique Values:
Unnamed: 0    232074
text          232074
class              2
dtype: int64
unique target classes ['suicide' 'non-suicide']
Number of Dialogues:  232074
First 5 elements: 
   Unnamed: 0                                               text        class
0           2  Ex Wife Threatening SuicideRecently I left my ...      suicide
1           3  Am I weird I don't get affected by compliments...  non-suicide
2           4  Finally 2020 is almost over... So I can never ...  non-suicide
3           8          i need helpjust help me im crying so hard      suicide
4           9  I’m so lostHello, my name is Adam (16) and I’v...      suicide


In [19]:
def read_dataset_suicidal_detection(data_path: str, split_seed: int) -> Tuple[Tuple[List[str], List[int]], Tuple[List[str], List[int]]]:
    '''
    Read the dataset for the suicidal detection task.
    :param data_path: The path to the dataset.
    :type data_path: str
    :param split_seed: The seed to use for splitting the dataset.
    :type split_seed: int
    :return: The training, and test sets.
    :rtype: Tuple[Tuple[List[str], List[int]], Tuple[List[str], List[int]]
    '''
    data = pd.read_csv(data_path)
    # extract the dialogues and their corresponding labels
    dialogues = data['text'].tolist()
    labels = data['class'].apply(lambda x: forward_label_mapping[x]).tolist()

    # split the data into training and temporary sets (70% training, 30% temporary)
    dialogues_train, dialogues_test, labels_train, labels_test = train_test_split(
        dialogues, labels, test_size=0.3, random_state=split_seed)


    return (dialogues_train, labels_train),  (dialogues_test, labels_test)

In [20]:
split_seed = 10
# read the data set
(dialogues_train, labels_train),  (dialogues_test,labels_test) = read_dataset_suicidal_detection(dataset_path, split_seed)
# print the number of dialogues in each set and percentage of total data
print(
    f"Number of dialogues in training set: {len(dialogues_train)} ({len(dialogues_train) / num_Dialogues:.2f}%)")
print(
    f"Number of dialogues in test set: {len(dialogues_test)} ({len(dialogues_test) / num_Dialogues:.2f}%)")

Number of dialogues in training set: 162451 (0.70%)
Number of dialogues in test set: 69623 (0.30%)


In [30]:
reverse_label_mapping = {0: 'non-suicide', 1: 'suicide'}

def save_data_to_csv(data: Tuple[List[str], List[int]], file_path: str):
        '''
        Save the data to a CSV file.
        :param data: The data to save.
        :type data: Tuple[List[str], List[int]]
        :param file_path: The path to the CSV file.
        :type file_path: str
        '''
        df = pd.DataFrame({'text': data[0], 'class': data[1]})
        df['class'] = df['class'].apply(lambda x: reverse_label_mapping[x])
        df.to_csv(file_path, index=False)

In [22]:
def clean_data(data: List[str]) -> List[List[str]]:
    '''
    Clean the data.
    :param data: The data to clean.
    :type data: List[str]
    :return: The cleaned data.
    :rtype: List[List[str]]
    '''
    cleaned_data = []
    for line in tqdm(data):
        cleaned_line = clean_train(line)
        cleaned_data.append(cleaned_line)
    return cleaned_data

In [23]:
dialogues_train_proccessed = clean_data(dialogues_train)
dialogues_test_proccessed = clean_data(dialogues_test)

  return BeautifulSoup(sentence, "html.parser").get_text()
100%|██████████| 162451/162451 [13:08<00:00, 205.94it/s] 
100%|██████████| 69623/69623 [06:26<00:00, 180.34it/s] 


In [24]:
print(dialogues_train_proccessed[0])
print(dialogues_train_proccessed[1])

['stupid', 'fuck', 'dumb', 'dude', '.', 'literally', 'thought', 'la', 'vega', 'new', 'vega', '7', 'year', '.', 'go', 'die']
['blame', 'familymy', 'family', 'lose', 'sister', 'law', 'suicide', '.', 'brother', 'best', 'friend', 'since', 'young', '(', 'know', '10+', 'year', ')', 'grow', 'old', 'become', 'couple', 'recently', 'child', 'turn', '1.', 'inseparable', '.', 'live', 'together', 'come', 'live', 'awhile', '.', 'become', 'close', 'used', 'see', 'together', 'everyday', '.', 'someone', 'always', 'smile', ',', 'love', 'son', 'brother', 'much', '.', 'recently', 'move', 'guess', 'something', 'happen', 'longer', 'together', '.', 'brother', 'expose', 'someone', 'covid-19', 'quarantine', '2', 'week', '.', 'go', 'house', 'want', 'get', 'son', 'sick', '.', 'make', 'post', 'know', 'father', 'choose', 'see', 'kid', '.', 'go', 'downhill', '.', 'know', 'together', 'post', 'would', 'share', 'fb', '.', 'sure', 'reason', ',', 'still', '.', 'however', ',', 'within', 'last', '2-3', 'day', 'suicide', '

In [31]:
flattened_X_train = [' '.join(tokens) for tokens in dialogues_train_proccessed]
flattened_X_test = [' '.join(tokens) for tokens in dialogues_test_proccessed]

save_data_to_csv((flattened_X_train, labels_train), '../data/processed/suicidal_detection_train.csv')
save_data_to_csv((flattened_X_test, labels_test), '../data/processed/suicidal_detection_test.csv')

In [25]:
print(flattened_X_train[0])
print(flattened_X_train[1])

stupid fuck dumb dude . literally thought la vega new vega 7 year . go die
blame familymy family lose sister law suicide . brother best friend since young ( know 10+ year ) grow old become couple recently child turn 1. inseparable . live together come live awhile . become close used see together everyday . someone always smile , love son brother much . recently move guess something happen longer together . brother expose someone covid-19 quarantine 2 week . go house want get son sick . make post know father choose see kid . go downhill . know together post would share fb . sure reason , still . however , within last 2-3 day suicide act weird , come 12am hold son could barely stand . next day mom take son guessing notice something right . come back house crash car say due someone chasing . say fine bumper . leave come back someone ( sure yet ) ? bring steal brother car night crash front light broken . say steal car ask show car . say remember . mom sit ask wrong , could tell u anything 

In [26]:
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.linear_model import LogisticRegression
# # Compute class weights

# pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])
# pipe_lr.fit(flattened_X_train,labels_train)

# y_pred = pipe_lr.predict(flattened_X_test)
# report = classification_report(labels_test, y_pred)
# print(report)
X_train = flattened_X_train
X_test = flattened_X_test

y_train = labels_train
y_test = labels_test


In [27]:
# # Example of a new sentence
# new_sentence = "suicide"

# preprocessed_tokens = clean_train(new_sentence)
# preprocessed_text = ' '.join(preprocessed_tokens)

# predicted_label = pipe_lr.predict([preprocessed_text])

# print("Predicted label:", reverse_label_mapping[predicted_label[0]])


In [28]:
# pip install gensim

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from tqdm import tqdm
from typing import List

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Define classifiers
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'Logistic Regression': LogisticRegression()
}

# Define vectorizers
vectorizers = {
    # 'Bag of Words': CountVectorizer(),
    # 'Unigram': CountVectorizer(ngram_range=(1, 1)),
    # 'N-Gram': CountVectorizer(ngram_range=(1, 2)),
    # 'CharLevel': CountVectorizer(analyzer='char'),
    'Count Vectorizer': CountVectorizer(),
    'tf-idf': TfidfVectorizer()
}

# Prepare results storage
results = []

# Evaluate all combinations
for vec_name, vectorizer in vectorizers.items():
    for clf_name, clf in classifiers.items():
        steps = [('vectorizer', vectorizer)]
        
        if clf_name in ['SVM', 'KNN']:
            steps.append(('svd', TruncatedSVD(n_components=100)))
        
        steps.append(('classifier', clf))
        
        # Create a pipeline
        pipe = Pipeline(steps)
        
        # Fit the model
        pipe.fit(X_train, y_train_enc)

        # Predict
        y_pred = pipe.predict(X_test)

        # Evaluate
        precision, recall, _, _ = precision_recall_fscore_support(y_test_enc, y_pred, average='weighted')
        
        # Store results
        results.append((vec_name, clf_name, precision, recall))

        # Print classification report
        print(f"Feature: {vec_name}, Classifier: {clf_name}")
        print(classification_report(y_test_enc, y_pred))
        print()

# Find the best combination
best_combination = max(results, key=lambda x: (x[2], x[3]))  # Based on precision and recall
print(f"Best combination: Feature: {best_combination[0]}, Classifier: {best_combination[1]}, Precision: {best_combination[2]}, Recall: {best_combination[3]}")


Feature: Count Vectorizer, Classifier: Naive Bayes
              precision    recall  f1-score   support

           0       0.96      0.82      0.89     34954
           1       0.84      0.97      0.90     34669

    accuracy                           0.89     69623
   macro avg       0.90      0.90      0.89     69623
weighted avg       0.90      0.89      0.89     69623


Feature: Count Vectorizer, Classifier: SVM
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     34954
           1       0.92      0.84      0.88     34669

    accuracy                           0.88     69623
   macro avg       0.89      0.88      0.88     69623
weighted avg       0.89      0.88      0.88     69623


Feature: Count Vectorizer, Classifier: KNN
              precision    recall  f1-score   support

           0       0.76      0.92      0.83     34954
           1       0.90      0.72      0.80     34669

    accuracy                           0.82

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature: Count Vectorizer, Classifier: Logistic Regression
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     34954
           1       0.95      0.91      0.93     34669

    accuracy                           0.93     69623
   macro avg       0.93      0.93      0.93     69623
weighted avg       0.93      0.93      0.93     69623


Feature: tf-idf, Classifier: Naive Bayes
              precision    recall  f1-score   support

           0       0.97      0.79      0.87     34954
           1       0.82      0.98      0.89     34669

    accuracy                           0.88     69623
   macro avg       0.89      0.88      0.88     69623
weighted avg       0.89      0.88      0.88     69623


Feature: tf-idf, Classifier: SVM
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     34954
           1       0.93      0.91      0.92     34669

    accuracy                           0.92    

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from tqdm import tqdm
from typing import List

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Define classifiers
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'Logistic Regression': LogisticRegression()
}

# Define vectorizers
vectorizers = {
    'Bag of Words': CountVectorizer(),
    'Unigram': CountVectorizer(ngram_range=(1, 1)),
    # 'N-Gram': CountVectorizer(ngram_range=(1, 2)),
    'CharLevel': CountVectorizer(analyzer='char'),
    'Count Vectorizer': CountVectorizer(),
    'tf-idf': TfidfVectorizer()
}

# Prepare results storage
results = []

# Evaluate all combinations
for vec_name, vectorizer in vectorizers.items():
    for clf_name, clf in classifiers.items():
        steps = [('vectorizer', vectorizer)]
        
        if clf_name in ['SVM', 'KNN']:
            steps.append(('svd', TruncatedSVD(n_components=100)))
        
        steps.append(('classifier', clf))
        
        # Create a pipeline
        pipe = Pipeline(steps)
        
        # Fit the model
        pipe.fit(X_train, y_train_enc)

        # Predict
        y_pred = pipe.predict(X_test)

        # Evaluate
        precision, recall, _, _ = precision_recall_fscore_support(y_test_enc, y_pred, average='weighted')
        
        # Store results
        results.append((vec_name, clf_name, precision, recall))

        # Print classification report
        print(f"Feature: {vec_name}, Classifier: {clf_name}")
        print(classification_report(y_test_enc, y_pred))
        print()

# Find the best combination
best_combination = max(results, key=lambda x: (x[2], x[3]))  # Based on precision and recall
print(f"Best combination: Feature: {best_combination[0]}, Classifier: {best_combination[1]}, Precision: {best_combination[2]}, Recall: {best_combination[3]}")


Feature: Bag of Words, Classifier: Naive Bayes
              precision    recall  f1-score   support

           0       0.96      0.82      0.89     34954
           1       0.84      0.97      0.90     34669

    accuracy                           0.89     69623
   macro avg       0.90      0.90      0.89     69623
weighted avg       0.90      0.89      0.89     69623


Feature: Bag of Words, Classifier: SVM
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     34954
           1       0.92      0.84      0.88     34669

    accuracy                           0.88     69623
   macro avg       0.89      0.88      0.88     69623
weighted avg       0.89      0.88      0.88     69623


Feature: Bag of Words, Classifier: KNN
              precision    recall  f1-score   support

           0       0.76      0.92      0.83     34954
           1       0.90      0.71      0.79     34669

    accuracy                           0.82     69623
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature: Bag of Words, Classifier: Logistic Regression
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     34954
           1       0.95      0.91      0.93     34669

    accuracy                           0.93     69623
   macro avg       0.93      0.93      0.93     69623
weighted avg       0.93      0.93      0.93     69623


Feature: Unigram, Classifier: Naive Bayes
              precision    recall  f1-score   support

           0       0.96      0.82      0.89     34954
           1       0.84      0.97      0.90     34669

    accuracy                           0.89     69623
   macro avg       0.90      0.90      0.89     69623
weighted avg       0.90      0.89      0.89     69623


Feature: Unigram, Classifier: SVM
              precision    recall  f1-score   support

           0       0.85      0.93      0.89     34954
           1       0.92      0.84      0.88     34669

    accuracy                           0.88     6

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Feature: Unigram, Classifier: Logistic Regression
              precision    recall  f1-score   support

           0       0.91      0.95      0.93     34954
           1       0.95      0.91      0.93     34669

    accuracy                           0.93     69623
   macro avg       0.93      0.93      0.93     69623
weighted avg       0.93      0.93      0.93     69623


Feature: CharLevel, Classifier: Naive Bayes
              precision    recall  f1-score   support

           0       0.85      0.38      0.52     34954
           1       0.60      0.93      0.73     34669

    accuracy                           0.65     69623
   macro avg       0.72      0.66      0.63     69623
weighted avg       0.72      0.65      0.63     69623




ValueError: n_components(100) must be <= n_features(89).