In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('../data/train.csv', encoding='latin-1')
test = pd.read_csv('../data/test.csv', encoding='latin-1')

In [3]:
train.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [4]:
train.Sentiment.value_counts()

Sentiment
1    56457
0    43532
Name: count, dtype: int64

In [5]:
# Stratified sampling
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train, test_size=0.25, stratify=train.Sentiment, random_state=239)
train.Sentiment.value_counts(), valid.Sentiment.value_counts()

(Sentiment
 1    42342
 0    32649
 Name: count, dtype: int64,
 Sentiment
 1    14115
 0    10883
 Name: count, dtype: int64)

In [6]:
class_weights = dict(1 / train.Sentiment.value_counts(normalize=True))
class_weights

{1: 1.77107836191016, 0: 2.2968850500781035}

In [7]:
# Remove stopwords
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_text)

def remove_symbols(text):
    return re.sub(r'[^\w\s]', '', text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_single_characters(text):
    return re.sub(r'\b[a-zA-Z]\b', '', text)

def remove_multiple_spaces(text):
    return re.sub(r'\s+', ' ', text)

def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def remove_mentions(text):
    return re.sub(r'@\S+', '', text)

def remove_hashtags(text):
    return re.sub(r'#\S+', '', text)

def remove_emojis(text):
    return re.sub(r'\\x\S+', '', text)

def clean_text(text):
    text = text.lower()
    text = remove_symbols(text)
    text = remove_numbers(text)
    text = remove_single_characters(text)
    text = remove_html_tags(text)
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_emojis(text)
    text = remove_multiple_spaces(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielto1404/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
train['tweet'] = train['SentimentText'].apply(clean_text)
valid['tweet'] = valid['SentimentText'].apply(clean_text)
test['tweet'] = test['SentimentText'].apply(clean_text)

In [9]:
# TF-IDF with Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

tfidf = TfidfVectorizer(stop_words="english", max_features=20_000)
lr = LogisticRegression(penalty="l2", class_weight=class_weights, max_iter=100_000)
pipe = Pipeline([('tfidf', tfidf), ('lr', lr)])

pipe.fit(train.tweet, train.Sentiment)

In [10]:
# Train classification report
print(classification_report(train.Sentiment, pipe.predict(train.tweet)))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80     32649
           1       0.85      0.83      0.84     42342

    accuracy                           0.83     74991
   macro avg       0.82      0.82      0.82     74991
weighted avg       0.83      0.83      0.83     74991



In [11]:
# Valid classification report
print(classification_report(valid.Sentiment, pipe.predict(valid.tweet)))

              precision    recall  f1-score   support

           0       0.70      0.73      0.71     10883
           1       0.78      0.76      0.77     14115

    accuracy                           0.74     24998
   macro avg       0.74      0.74      0.74     24998
weighted avg       0.75      0.74      0.75     24998

