In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\conno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The following notebook compares three appraoches for sentiment analysis, and also demonstrates the effect that feature engineering can have on overlal model performance.

In [None]:
## load kaggle dataset, filtering by only positive and negative tweets

kaggle_data = pd.read_csv("twitter_training.csv", names = ["Num", "Category", "Sentiment", "Tweet Text"]).drop(["Num", "Category"], axis=1)
kaggle_data = kaggle_data[(kaggle_data["Sentiment"] == "Positive") |\
                          (kaggle_data["Sentiment"] == "Negative")]


In [None]:
## load github dataset

github_url = 'https://raw.githubusercontent.com/surge-ai/stock-sentiment/refs/heads/main/sentiment.csv'
data = pd.read_csv(github_url, engine = "python", encoding = "unicode_escape")

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [None]:
## optional - generate syntehtic dataset using 

In [None]:
## concatenate datasets

X = pd.concat([data["Tweet Text"].to_frame(), kaggle_data["Tweet Text"].to_frame()], axis=0)
y = pd.concat([data["Sentiment"], kaggle_data['Sentiment']])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train.reset_index(drop=True)
X_test.reset_index(drop=True)
y_train.reset_index(drop=True)
y_test.reset_index(drop=True)

0       Negative
1       Negative
2       Positive
3       Positive
4       Positive
          ...   
8770    Negative
8771    Negative
8772    Negative
8773    Positive
8774    Positive
Name: Sentiment, Length: 8775, dtype: object

In [None]:
## change labels to 1 and 0, checking for any errors

classify = lambda s: 1 if s == "Positive" else 0 if s == "Negative" else -1
                     
y_test = y_test.apply(classify)
y_train = y_train.apply(classify)

if -1 in y_train.unique() or -1 in y_test.unique():
    print("Error: Unexpected value found in column labeled 'Sentiment'")

In [None]:
## counts of each class

y_train.value_counts()

Sentiment
0    18189
1    16910
Name: count, dtype: int64

In [92]:
X_train

Unnamed: 0,Tweet Text
41132,4 любимые игры всем время?... Max Payne 2. Fif...
54799,Dead of the Night and Ancient Evil are both at...
860,borderlands 3 sucks why are yall giving money ...
53209,Red Dead Redemption - With the litter twitch.t...
20502,Love to see this FeelsStrongMan
...,...
10505,a't wait!
18326,This incredible first look at the PS5 graphics...
64958,Hollywood is going to fend off this man. How m...
618,Super tempted to get a simple black heart tatt...


In [99]:
## helper function for finance/tweet specific language feature engineering

def count_word(word_list: list, sentence: str, aggregator = "freq"):
    
    """Counts the sum of the number of occurences of all the words in word_list

    Args:
        word_list (list): List of words to be counted.
        sentence (str): String being checked for occurences of words in word_list.

    Returns:
        int: Total number of occurences of all words in word_list.
    """
    total = 0
    
    for word in word_list:
        
        total += sentence.lower().count(word.lower())
        
    if aggregator == "count":
    
        return total
    
    elif aggregator == "freq":
    
        return total / (len(sentence.split()) + 1) ## adding plus one removes the possibility of a divide by zero error


def feature_engineering(df: pd.DataFrame, word_lists: dict, aggregator: str = "freq", inplace=False):
    """ Runs a simple feature engineering pipeline on passages of text, by counting occurences of words within defined, named lists

    Args:
        df (pd.DataFrame): Dataframe with column labeled "Tweet Text" containing tweets.
        word_lists (_type_): dictionary with keys representing the names of  important word lists (i.e. finance slang, tech slang, or profanity)
        aggregator (str, optional): Whether to calculate the count or frequency of terms in the term list. Defaults to "freq".
        inplace (bool, optional): Whether to modify df inplace or return a transformed dataframe. Defaults to False.

    Returns:
        df: transformed dataframe
    """
    for label, word_list in word_lists.items():
        
        df[label] = df["Tweet Text"].apply(lambda s: count_word(word_list, sentence=str(s), aggregator=aggregator))
      
    if not inplace:
        
        return df

In [100]:
## Track any lists of terms which may be useful, including names, slang, financial terms, and profanity

critical_word_lists = {
    "WSB Positive Slang":["moon", "tendies", "yolo", "cannot go tits up", "green", "We Like the Stock"],
    "Finance Positive Slang":["bull", "unicorn", "in the black", "growth", "potential", "skyrocket", "rallies", "up", "rally", "gain"],
    "Finance Negative Slang":["bear", "dying", "in the red", "slump", "fall", "drop", "plummet", "death", "loss", "down"],
    "Violence":['kill', 'murder', 'dead', "beat up"],
    "Profanity":["fuck", "shit", "ass", "crap", "suck", "jerk", "bitch"],
    "Stock Market People":["Elon", "Trump", "Musk", "Donald", "Gates", "Buffet", "Icahn", "Nadella", "Zuck"]
}

X_train = feature_engineering(X_train, critical_word_lists)
X_test = feature_engineering(X_test, critical_word_lists)

In [102]:
## Apply other feature engineering - take count of potentially important items i.e. all caps words, number of words,etc.
count_tokens = lambda s: len(str(s).split())
count_caps = lambda s: np.sum([word.isupper() for word in str(s).split()])
count_stocks = lambda s: len(re.findall(r'[$][A-Za-z][\S]*', str(s)))
    
X_train["Num Tokens"] = X_train["Tweet Text"].apply(count_tokens)
X_train["Num Caps"]  = X_train["Tweet Text"].apply(count_caps)
X_train["Num Stocks"] = X_train["Tweet Text"].apply(count_stocks)

X_test["Num Tokens"]  = X_test["Tweet Text"].apply(count_tokens)
X_test["Num Caps"]  = X_test["Tweet Text"].apply(count_caps)
X_test["Num Stocks"]  = X_test["Tweet Text"].apply(count_stocks)

In [103]:
X_train

Unnamed: 0,Tweet Text,WSB Positive Slang,Finance Positive Slang,Finance Negative Slang,Violence,Profanity,Stock Market People,Num Tokens,Num Caps,Num Stocks
41132,4 любимые игры всем время?... Max Payne 2. Fif...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,17,0.0,0
54799,Dead of the Night and Ancient Evil are both at...,0.0,0.000000,0.0,0.080000,0.000000,0.000000,24,0.0,0
860,borderlands 3 sucks why are yall giving money ...,0.0,0.000000,0.0,0.000000,0.043478,0.000000,22,0.0,0
53209,Red Dead Redemption - With the litter twitch.t...,0.0,0.000000,0.0,0.090909,0.000000,0.000000,10,0.0,0
20502,Love to see this FeelsStrongMan,0.0,0.000000,0.0,0.000000,0.000000,0.000000,5,0.0,0
...,...,...,...,...,...,...,...,...,...,...
10505,a't wait!,0.0,0.000000,0.0,0.000000,0.000000,0.000000,2,0.0,0
18326,This incredible first look at the PS5 graphics...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,37,3.0,0
64958,Hollywood is going to fend off this man. How m...,0.0,0.000000,0.0,0.000000,0.000000,0.066667,14,0.0,0
618,Super tempted to get a simple black heart tatt...,0.0,0.018182,0.0,0.000000,0.000000,0.000000,54,4.0,0


In [104]:
""" 
Format tweet by removing stopwords, puncutation, and urls, convert all to lowercase - 
leaving hashtags and mentions for now since they may convey significance as tokens.
"""

stop_words = set(stopwords.words('english'))

def format_tweet(tweet):
    tweet = re.sub(r'http\S+', '', str(tweet)) # remove URLs
    tweet = re.sub(r'[^\w\s]', '', tweet) #remove punctuation
    tweet = ' '.join(word for word in tweet.split() if word.lower() not in stop_words) #remove stopwords
    tweet = tweet.strip().lower() #strip and lowercase
    return tweet


X_train["Tweet Text"] = X_train["Tweet Text"].apply(format_tweet)
X_test["Tweet Text"] = X_test["Tweet Text"].apply(format_tweet)

Method 1: TF-IDF and TSVD

In [105]:
MAX_FEATURES = 200

train_tfidf = TfidfVectorizer(max_features=MAX_FEATURES, sublinear_tf = True)
test_tfidf = TfidfVectorizer(max_features=MAX_FEATURES, sublinear_tf = True)

X_train_encoded = train_tfidf.fit_transform(X_train["Tweet Text"].values.astype('U'))
X_test_encoded = train_tfidf.transform(X_test["Tweet Text"].values.astype('U'))

In [106]:
## apply dimensionality reduction - TSVD is useful for applying to sparse matrices
INPUT_DIM = 20

train_tsvd = TruncatedSVD(n_components = INPUT_DIM)

X_train_encoded = train_tsvd.fit_transform(X_train_encoded)
X_test_encoded = train_tsvd.transform(X_test_encoded)



In [107]:
## concatenate count features with text generated features

X_train_enc_fe = np.concatenate([X_train_encoded, X_train.loc[:, X_train.columns != 'Tweet Text'].to_numpy()], axis=1)
X_test_enc_fe = np.concatenate([X_test_encoded, X_test.loc[:, X_test.columns != 'Tweet Text'].to_numpy()], axis=1)

In [108]:
model_1 = XGBClassifier(enable_categorical = True,
                      objective = "binary:logistic", 
                      scale_pos_weight=0.56,
                      subsample=0.5)

clf_1 = GridSearchCV(model_1, 
                     param_grid={"gamma":[0.001, 0.01, 0.1],
                                 "max_depth":[4, 7, 10]},
                     cv=4)

clf_1.fit(X_train_encoded, y_train)

model_1_fe = XGBClassifier(enable_categorical = True,
                           objective = "binary:logistic", 
                           scale_pos_weight=0.56,
                           subsample=0.5)

clf_1_fe = GridSearchCV(model_1_fe, 
                   param_grid={"gamma":[0.001, 0.01, 0.1],
                               "max_depth":[4, 7, 10]},
                   cv=4)

clf_1_fe.fit(X_train_enc_fe, y_train)


In [119]:
%%time

y_pred_1 = clf_1.best_estimator_.predict(X_test_encoded)
y_pred_1_fe = clf_1_fe.best_estimator_.predict(X_test_enc_fe)
y_pred_1_prob = clf_1.best_estimator_.predict_proba(X_test_encoded)
y_pred_1_fe_prob = clf_1_fe.best_estimator_.predict_proba(X_test_enc_fe)

CPU times: total: 312 ms
Wall time: 67.9 ms


In [120]:
cm1 = confusion_matrix(y_test, y_pred_1)
cm1_fe = confusion_matrix(y_test, y_pred_1_fe)

In [121]:
acc_1 = accuracy_score(y_test, y_pred_1)
acc_1_fe = accuracy_score(y_test, y_pred_1_fe)


In [122]:
print(f"Model Accuracy Without Feature Engieering: {acc_1}")
print(f"Model Accuracy With Feature Engieering: {acc_1_fe}")

Model Accuracy Without Feature Engieering: 0.7911111111111111
Model Accuracy With Feature Engieering: 0.8144729344729345


In [123]:
auc_1 = fbeta_score(y_test, y_pred_1, beta=0.5)
auc_1_fe = fbeta_score(y_test, y_pred_1_fe, beta=0.5)

Method 2: Word2Vec and Classifier


In [125]:
!pip install gensim



In [126]:
import gensim.downloader as api

## Glove-twitter-25 is a pre-trained word vectorizer that is trained on Twitter data, and outputs a 25-dimensional embedding
glove_25 = api.load('glove-twitter-25')

In [None]:
def vectorize_sentence(sentence):
    
    """Create a length 25 semantic vector for "sentence" by taking the average of word vectors in the sentence.
    If no words in the sentence are contained within the glove-twitter-25 word vectorizer, return a 
    vector of 25 zeros"""
    
    vector = np.zeros(25)
    
    for word in sentence.split():
    
        if word in glove_25.index_to_key:
            
            vector += glove_25[word]
            
    if np.sum(vector) == 0:
        
        return vector
    
    else:
        vector = vector * (1/np.sum(vector))
        
        return vector
        

X_train_embedded = np.stack(X_train["Tweet Text"].apply(vectorize_sentence))
X_test_embedded = np.stack(X_test["Tweet Text"].apply(vectorize_sentence))

In [None]:
## defining training and test sets which used feature engineering

X_train_emb_fe = np.concatenate([X_train_embedded, X_train.loc[:, X_train.columns != 'Tweet Text'].to_numpy()], axis=1)
X_test_emb_fe = np.concatenate([X_test_embedded, X_test.loc[:, X_test.columns != 'Tweet Text'].to_numpy()], axis=1)

In [None]:
model_2 = XGBClassifier(enable_categorical = True,
                        objective = "binary:logistic", 
                        scale_pos_weight=0.56,
                        subsample=0.5)

clf_2 = GridSearchCV(model_2, 
                     param_grid={"gamma":[0.001, 0.01, 0.1],
                                 "max_depth":[4, 7, 10],
                                 "min_child_weight":[1, 2, 4]},
                     cv=4)

clf_2.fit(X_train_embedded, y_train)

model_2_fe = XGBClassifier(enable_categorical = True,
                           objective = "binary:logistic", 
                           scale_pos_weight=0.56,
                           subsample=0.5)

clf_2_fe = GridSearchCV(model_2_fe, 
                        param_grid={"gamma":[0.001, 0.01, 0.1],
                                    "max_depth":[4, 7, 10],
                                    "min_child_weight":[1, 2, 4]},
                        cv=4)

clf_2_fe.fit(X_train_emb_fe, y_train)

In [None]:
%%time
y_pred_2 = clf_2.predict(X_test_embedded)
y_pred_2_fe = clf_2_fe.predict(X_test_emb_fe)
y_pred_2_prob = clf_2.predict_proba(X_test_embedded)
y_pred_2_fe_prob = clf_2_fe.predict_proba(X_test_emb_fe)

In [None]:
confusion_matrix(y_test, y_pred_2)

array([[20, 16],
       [11, 53]], dtype=int64)

In [None]:
acc_2 = accuracy_score(y_test, y_pred_2)
acc_2_fe = accuracy_score(y_test, y_pred_2_prob)

0.73

In [None]:
auc_2 = auc(y_test, y_pred_2_prob, beta=0.5)
auc_2_fe = auc(y_test, y_pred_2_fe_prob, beta=0.5)

0.7794117647058824

Method 3: Pre-trained Bert Classifier

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.48.1-py3-none-any.whl (9.7 MB)
   ---------------------------------------- 0.0/9.7 MB ? eta -:--:--
   ------------------------ --------------- 6.0/9.7 MB 40.9 MB/s eta 0:00:01
   ---------------------------------------- 9.7/9.7 MB 35.3 MB/s eta 0:00:00
Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl (303 kB)
Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 2.4/2.4 MB 68.6 MB/s eta 0:00:00
Installing collected packages: safetensors, tokenizers, transformers
Successfully installed safetensors

In [135]:
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch

In [198]:
## define tokenizer and model from transformers package

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")
model = AutoModelForSequenceClassification.from_pretrained("lxyuan/distilbert-base-multilingual-cased-sentiments-student")

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

In [117]:
from transformers import pipeline

distilled_student_sentiment_classifier = pipeline(
    model=model, 
    return_all_scores=False
)

NameError: name 'model' is not defined

In [None]:
predictions = distilled_student_sentiment_classifier(X_test["Tweet Text"].to_list())


NameError: name 'distilled_student_sentiment_classifier' is not defined

In [None]:
predictions

In [None]:
y_pred_3 = [0 if predictions[i]["label"] == "negative"\
            else 1 if predictions[i]["label"] == "positive"\
            else -1 for i in range(len(predictions))]

In [None]:
accuracy_score(y_pred_3, y_test)

0.83

In [None]:
Final_Results = pd.DataFrame({"Methods":["Method 1 w/0 F.E.",
                                        "Method 1 w/ F.E.",
                                        "Method 2 w/0 F.E.",
                                        "Method 2 w/ F.E.",
                                        "Method 3"],
                              "Accuracy":[acc_1,
                                          acc_1_fe,
                                          acc_2,
                                          acc_2_fe,
                                          acc_3],
                              "AUC": [auc_1,
                                      auc_1_fe,
                                      auc_2,
                                      auc_2_fe,
                                      auc_3]})