In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
import pandas as pd
import utils as utils

In [2]:
train_path = '../data/train.csv'

In [11]:
df = pd.read_csv(train_path)
df['label'] = (df['label'] == 'self.SuicideWatch').astype(int)
df.describe()

Unnamed: 0,label
count,45706.0
mean,0.187459
std,0.390284
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [13]:
df_majority = df[df['label'] == 0]
df_minority = df[df['label'] == 1]

In [19]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_majority_downsampled = resample(df_majority,
                                    replace=False,
                                    n_samples=len(df_minority),
                                    random_state=42)

In [20]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled = df_upsampled.sample(frac=1, random_state=42)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(frac=1, random_state=42)

In [21]:
print(df_upsampled['label'].value_counts())
print(df_downsampled['label'].value_counts())

label
1    37138
0    37138
Name: count, dtype: int64
label
0    8568
1    8568
Name: count, dtype: int64


In [22]:
tfidf_upsampled = TfidfVectorizer(max_features=10000)
X_upsampled = tfidf_upsampled.fit_transform(df_upsampled['text'])

tfidf_downsampled = TfidfVectorizer(max_features=10000)
X_downsampled = tfidf_downsampled.fit_transform(df_downsampled['text'])

In [23]:
X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, df_upsampled['label'], test_size=0.2, random_state=42)

X_train_downsampled, X_test_downsampled, y_train_downsampled, y_test_downsampled = train_test_split(X_downsampled, df_downsampled['label'], test_size=0.2, random_state=42)

In [24]:
model_upsampled = LogisticRegression()
model_upsampled.fit(X_train_upsampled, y_train_upsampled)
y_pred_upsampled = model_upsampled.predict(X_test_upsampled)

model_downsampled = LogisticRegression()
model_downsampled.fit(X_train_downsampled, y_train_downsampled)
y_pred_downsampled = model_downsampled.predict(X_test_downsampled)

In [25]:
def get_prfa(y_true, y_pred):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    return precision, recall, f1, accuracy

In [26]:
precision_upsampled, recall_upsampled, f1_upsampled, accuracy_upsampled = get_prfa(y_test_upsampled, y_pred_upsampled)
precision_downsampled, recall_downsampled, f1_downsampled, accuracy_downsampled = get_prfa(y_test_downsampled, y_pred_downsampled)
print('Upsampled metrics:')
print('Precision:', precision_upsampled)
print('Recall:', recall_upsampled)
print('F1:', f1_upsampled)
print('Accuracy:', accuracy_upsampled)
print('Downsampled metrics:')
print('Precision:', precision_downsampled)
print('Recall:', recall_downsampled)
print('F1:', f1_downsampled)
print('Accuracy:', accuracy_downsampled)

Upsampled metrics:
Precision: 0.8446550816219063
Recall: 0.8636424821644905
F1: 0.8540432612312812
Accuracy: 0.8523828756058158
Downsampled metrics:
Precision: 0.8019976498237368
Recall: 0.8024691358024691
F1: 0.8022333235380547
Accuracy: 0.8036756126021003
