In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/data.csv')

In [3]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
df.label.value_counts(normalize=True)

label
0    0.929854
1    0.070146
Name: proportion, dtype: float64

In [5]:
# Stratified sampling
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.25, stratify=df.label, random_state=239)
train.label.value_counts(), test.label.value_counts()

(label
 0    22290
 1     1681
 Name: count, dtype: int64,
 label
 0    7430
 1     561
 Name: count, dtype: int64)

In [6]:
class_weights = dict(1 / train.label.value_counts(normalize=True))
class_weights

{0: 1.0754149842978913, 1: 14.259964306960144}

In [7]:
# Save the train and test data
train.to_csv('../data/train_split.csv', index=False)
test.to_csv('../data/test_split.csv', index=False)

In [8]:
# Remove stopwords
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_text)

def remove_symbols(text):
    return re.sub(r'[^\w\s]', '', text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_single_characters(text):
    return re.sub(r'\b[a-zA-Z]\b', '', text)

def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text)

def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def remove_mentions(text):
    return re.sub(r'@\S+', '', text)

def remove_hashtags(text):
    return re.sub(r'#\S+', '', text)

def remove_emojis(text):
    return re.sub(r'\\x\S+', '', text)

def clean_text(text):
    text = text.lower()
    text = remove_symbols(text)
    text = remove_numbers(text)
    text = remove_single_characters(text)
    text = remove_html_tags(text)
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_emojis(text)
    text = remove_multiple_spaces(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielto1404/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
train['tweet'] = train['tweet'].apply(clean_text)
test['tweet'] = test['tweet'].apply(clean_text)

In [10]:
# TF-IDF with Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

tfidf = TfidfVectorizer(stop_words="english", max_features=20_000)
lr = LogisticRegression(penalty="l2", class_weight=class_weights, max_iter=100_000)
pipe = Pipeline([('tfidf', tfidf), ('lr', lr)])

pipe.fit(train.tweet, train.label)
pred = pipe.predict(test.tweet)

In [11]:
lr.coef_.shape

(1, 20000)

In [12]:
# Train classification report
print(classification_report(train.label, pipe.predict(train.tweet)))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99     22290
           1       0.74      1.00      0.85      1681

    accuracy                           0.98     23971
   macro avg       0.87      0.98      0.92     23971
weighted avg       0.98      0.98      0.98     23971



In [13]:
# Test classification report
print(classification_report(test.label, pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97      7430
           1       0.55      0.77      0.64       561

    accuracy                           0.94      7991
   macro avg       0.77      0.86      0.80      7991
weighted avg       0.95      0.94      0.94      7991



In [14]:
# Save the model
import pickle
import os

os.makedirs('../models', exist_ok=True)

with open('../models/tfidf_lr.pkl', 'wb') as f:
    pickle.dump(pipe, f)