In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./data/tweets.csv", encoding='latin')
df['Date'] = pd.to_datetime(df['Date'], format='%a %b %d %H:%M:%S PDT %Y')
df_shuffled = df.sample(frac=1, random_state=2115).reset_index(drop=True)

train_size = int(len(df_shuffled) * 0.7)
df = df_shuffled[:train_size]
test_set = df_shuffled[train_size:]

In [3]:
df_0 = df[df["Target"] == 0].sample(frac=1, random_state=2115).reset_index(drop=True)[:50000]
df_1 = df[df["Target"] == 4].sample(frac=1, random_state=2115).reset_index(drop=True)[:50000]

df = pd.concat([df_0, df_1], ignore_index=True).sample(frac=1, random_state=2115).reset_index(drop=True)

In [4]:
try:
    df.drop(['ID', 'flag'], axis=1, inplace=True)
except KeyError:
    pass
df['Weekday'] = df['Date'].dt.weekday
df['Time'] = df['Date'].dt.time
df['Full_date'] = df['Date'].dt.date

df['Target'] = df['Target'].map({0: 0, 4: 1})
df['Length'] = df['Text'].apply(len)

import re


def count_hashtags(text):
    return len([c for c in text if c == '#'])


def count_mentions(text):
    return len([c for c in text if c == '@'])


def count_exclamation_marks(text):
    return len([c for c in text if c == '!'])


def detect_emoticons(text):
    emoticon_pattern = r'(:\)|:\(|;\)|:D|:P|:\||:\-\)|:\-\(|;\-\)|:‑D|:‑P|:‑\||<3)'
    return re.findall(emoticon_pattern, text)


df['Hashtags'] = df['Text'].apply(count_hashtags)
df['HasHashtags'] = df['Hashtags'] > 0
values_df = pd.DataFrame()
values_df['Hashtags'] = df['Hashtags'].value_counts()

df['Mentions'] = df['Text'].apply(count_mentions)
df['HasMentions'] = df['Mentions'] > 0
values_df['Mentions'] = df['Mentions'].value_counts()

df['ExclamationMarks'] = df['Text'].apply(count_exclamation_marks)
df['HasExclamationMarks'] = df['ExclamationMarks'] > 0
values_df['ExclamationMarks'] = df['ExclamationMarks'].value_counts()

df['Emoticons'] = df['Text'].apply(detect_emoticons)
emoticons = df[['Target', 'Emoticons']].explode('Emoticons').dropna().groupby(['Emoticons']).agg(
    ['mean', 'count']).reset_index()

In [5]:
import textstat

df['FRE'] = df['Text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['GFI'] = df['Text'].apply(lambda x: textstat.gunning_fog(x))

In [6]:
from textblob import TextBlob


def analyze_textblob_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity


df[['Polarity_TB', 'Subjectivity_TB']] = df['Text'].apply(lambda tweet: pd.Series(analyze_textblob_sentiment(tweet)))

In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()


def analyze_vader_sentiment(tweet):
    scores = sia.polarity_scores(tweet)
    return scores['pos'], scores['neu'], scores['neg'], scores['compound']


df[['Positive_VADER', 'Neutral_VADER', 'Negative_VADER', 'Compound_VADER']] = df['Text'].apply(
    lambda tweet: pd.Series(analyze_vader_sentiment(tweet)))


In [8]:
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,HasExclamationMarks,Emoticons,FRE,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,False,[],89.45,2.92,-0.181818,0.727273,0.0,0.738,0.262,-0.6597
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,False,[],84.17,6.6,-0.15,0.45,0.086,0.65,0.265,-0.7378
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,True,[],89.75,2.8,0.9,0.725,0.622,0.378,0.0,0.8488
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,False,[],75.2,5.2,0.25,0.25,0.0,1.0,0.0,0.0
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,False,[],33.58,6.6,0.0,0.15,0.0,1.0,0.0,0.0


In [9]:
df['Hour'] = df['Date'].dt.hour
df['skewed_hour_dist'] = df['Hour'].apply(lambda x: (16 - x) / 16 if x < 16 else (x - 16) / 8)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,FRE,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,89.45,2.92,-0.181818,0.727273,0.0,0.738,0.262,-0.6597,2,0.875
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,84.17,6.6,-0.15,0.45,0.086,0.65,0.265,-0.7378,4,0.75
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,89.75,2.8,0.9,0.725,0.622,0.378,0.0,0.8488,22,0.75
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,75.2,5.2,0.25,0.25,0.0,1.0,0.0,0.0,22,0.75
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,33.58,6.6,0.0,0.15,0.0,1.0,0.0,0.0,4,0.75


In [10]:
#It is kind of forced but we can try doing similar thing with Weekday
df['skewed_week_dist'] = df['Weekday'].apply(lambda x: (2 - x) / 2 if x < 2 else (x - 2) / 4)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,2.92,-0.181818,0.727273,0.0,0.738,0.262,-0.6597,2,0.875,0.5
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,6.6,-0.15,0.45,0.086,0.65,0.265,-0.7378,4,0.75,0.0
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,2.8,0.9,0.725,0.622,0.378,0.0,0.8488,22,0.75,0.5
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,5.2,0.25,0.25,0.0,1.0,0.0,0.0,22,0.75,1.0
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,6.6,0.0,0.15,0.0,1.0,0.0,0.0,4,0.75,1.0


In [11]:
#We have seen that after certain date (2009-05-29 07:33:45) sentiment of all tweets is negative. Of course it is a feature
#strictly specific to our data and according to common sense shouldn't be used in the model if we would like to predict the sentiment
#of any text. However if testing is going to be done on our set alone it is a meaningful piece of information.
y = df[df['Target'] == 1]['Full_date'].max()
df['is_after_certain_day'] = df['Full_date'].apply(lambda x: 0 if x <= y else 1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,-0.181818,0.727273,0.0,0.738,0.262,-0.6597,2,0.875,0.5,0
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,-0.15,0.45,0.086,0.65,0.265,-0.7378,4,0.75,0.0,1
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,0.9,0.725,0.622,0.378,0.0,0.8488,22,0.75,0.5,0
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,0.25,0.25,0.0,1.0,0.0,0.0,22,0.75,1.0,0
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,0.0,0.15,0.0,1.0,0.0,0.0,4,0.75,1.0,0


In [12]:
#Thaks to one of correlation matrices from EDA (or to having at least 1 brain cell) we know that the number of hashtags, mentions or
#exclamation marks is highly correlated with its binary counterpart quantifier. From the same analysis we see that their correlation with 
# target is marginal however mentions and exclamation marks seem to have more to say than hashtags, that is why we 
#will leave out hashtags but not the other two.
def has_men(text):
    for c in text:
        if c == '@':
            return 1
    return 0


def has_exc(text):
    for c in text:
        if c == '!':
            return 1
        return 0


df['has_mentions'] = df['Text'].apply(has_men)
df['has_exclamation_marks'] = df['Text'].apply(has_exc)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,0.0,0.738,0.262,-0.6597,2,0.875,0.5,0,0,0
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,0.086,0.65,0.265,-0.7378,4,0.75,0.0,1,0,0
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,0.622,0.378,0.0,0.8488,22,0.75,0.5,0,1,0
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,0.0,1.0,0.0,0.0,22,0.75,1.0,0,0,0
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,0.0,1.0,0.0,0.0,4,0.75,1.0,0,1,0


In [13]:
#And now for the main course tf-idf, we will set a limit o a 100 words so as not to everything too high in computation cost.
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = df['Text'].tolist()

#We shall also remove certain english stop words because they are very frequent but do not carry much sentiment.
vectoriser = TfidfVectorizer(max_features=50, lowercase=True, stop_words='english')
tfidf_matrix = vectoriser.fit_transform(text_data)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectoriser.get_feature_names_out())
# df = pd.concat([df, tfidf_df], axis=1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,0.0,0.738,0.262,-0.6597,2,0.875,0.5,0,0,0
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,0.086,0.65,0.265,-0.7378,4,0.75,0.0,1,0,0
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,0.622,0.378,0.0,0.8488,22,0.75,0.5,0,1,0
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,0.0,1.0,0.0,0.0,22,0.75,1.0,0,0,0
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,0.0,1.0,0.0,0.0,4,0.75,1.0,0,1,0


In [14]:
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

custom_stopwords = set(stopwords.words('english')) - {'no', 'not', 'nor', "isn't", "wasn't", "aren't", "don't",
                                                      "didn't", "cannot", "couldn't", "shouldn't"}


def preprocess_tweet(tweet):
    tweet = str(tweet).lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = emoji.demojize(tweet, delimiters=("", " "))
    tokens = word_tokenize(tweet)
    tokens = [word for word in tokens if word not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


df['ProcessedText'] = df['Text'].apply(preprocess_tweet)

In [15]:
from gensim.models import Word2Vec

texts = df['ProcessedText'].tolist()

tokenized_texts = [text.split() for text in texts]

word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)


def tweet_vector(tweet, model):
    words = tweet.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(word_vectors, axis=0)


df["w2v_feature"] = df['ProcessedText'].apply(lambda tweet: tweet_vector(tweet, word2vec_model))

In [16]:
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks,ProcessedText,w2v_feature
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,0.262,-0.6597,2,0.875,0.5,0,0,0,sorry icon messed wanted new one display nothi...,"[-0.28894585, 0.097029716, 0.17234235, 0.12410..."
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,0.265,-0.7378,4,0.75,0.0,1,0,0,meanwhile fuck supposed get 3 assignment two a...,"[-0.7367761, 0.49975547, 0.0742093, -0.0368966..."
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,0.0,0.8488,22,0.75,0.5,0,1,0,great disown lol,"[-0.21341069, 0.23222475, -0.02306, -0.4226969..."
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,0.0,0.0,22,0.75,1.0,0,0,0,quot aware outcome action eric quot,"[-0.1330542, -0.055925637, 0.23729782, 0.40381..."
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,0.0,0.0,4,0.75,1.0,0,1,0,ahahahaha really moment,"[0.07720827, 0.18317954, 0.26769403, 0.3113639..."


In [17]:
import nltk
from nltk.corpus import stopwords
import tiktoken
import numpy as np

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
stop_words = set(stopwords.words('english'))


def get_weighted_embedding(text):
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    embeddings = []
    for word, tag in tagged_tokens:
        if word in stop_words:
            continue
        embedding = encoding.encode(word)
        weight = 1.5 if tag.startswith('JJ') else 1.0
        embeddings.append(np.average(embedding) * weight)

    if len(embeddings) == 0:
        return 0
    weighted_embeddings = np.average(embeddings, axis=0)
    return weighted_embeddings


df["tt_embedding_weighted"] = np.array([get_weighted_embedding(text) for text in texts])

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = df['ProcessedText']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
tfidf_array = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()


def top_n_words(row, feature_names, n=3):
    top_n_idx = np.argsort(row)[-n:]
    top_n_values = [feature_names[i] for i in top_n_idx]
    return top_n_values


df[["Word_1", "Word_2", "Word_3"]] = [top_n_words(row, feature_names) for row in tfidf_array]
df

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks,ProcessedText,w2v_feature,tt_embedding_weighted,Word_1,Word_2,Word_3
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,0.50,0,0,0,sorry icon messed wanted new one display nothi...,"[-0.28894585, 0.097029716, 0.17234235, 0.12410...",30152.458333,nothing,give,wanted
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,0.00,1,0,0,meanwhile fuck supposed get 3 assignment two a...,"[-0.7367761, 0.49975547, 0.0742093, -0.0368966...",21376.866667,fuck,fail,supposed
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,0.50,0,1,0,great disown lol,"[-0.21341069, 0.23222475, -0.02306, -0.4226969...",45415.916667,yup,lol,great
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,1.00,0,0,0,quot aware outcome action eric quot,"[-0.1330542, -0.055925637, 0.23729782, 0.40381...",34647.000000,follower,yup,quot
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,1.00,0,1,0,ahahahaha really moment,"[0.07720827, 0.18317954, 0.26769403, 0.3113639...",36777.777778,yup,really,moment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,2009-04-07 05:57:16,AbigailBC,I'm back from library,1,05:57:16,2009-04-07,22,0,False,...,0.50,0,0,0,back library,"[-0.73960435, 0.6841266, 0.47910297, 0.0788341...",10000.500000,followfriday,yup,back
99996,1,2009-05-02 05:22:07,spike_marty,Wow what a morning! Had to take in starter pl...,5,05:22:07,2009-05-02,138,0,False,...,0.75,0,0,0,wow morning take starter plant last nite put m...,"[-0.42833695, 0.40740305, 0.13170674, -0.02426...",36034.666667,put,cold,nite
99997,1,2009-05-22 03:23:30,LTaguba,Here's the plan: GSP 2 ATL 2 DC (pick up mom) ...,4,03:23:30,2009-05-22,137,0,False,...,0.50,0,0,0,plan gsp 2 atl 2 dc pick mom 2 paris florence ...,"[-0.19445801, 0.35274148, 0.0698057, 0.0923885...",25803.011364,meet,plan,pick
99998,1,2009-05-29 00:34:02,petpostproject,@petxpert Pearl is doing great! She's fitting ...,4,00:34:02,2009-05-29,56,0,False,...,0.50,0,1,0,pearl great fitting right,"[-0.24424782, 0.10225247, 0.17117104, -0.17396...",20320.625000,yup,great,right


In [19]:
def get_tiktoken_embedding(text):
    embeddings = encoding.encode(text)
    if len(embeddings) == 0:
        return 0, 0, 0
    return np.mean(embeddings, axis=0), np.max(embeddings, axis=0), np.min(embeddings, axis=0)


for i in range(1, 4):
    words = df[f"Word_{i}"]
    df[[f"Word_{i}_embedding_mean", f"Word_{i}_embedding_max", f"Word_{i}_embedding_min"]] = np.array(
        [get_tiktoken_embedding(word) for word in words])

In [20]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(tfidf_matrix)
topic_distribution = lda.transform(tfidf_matrix)
dominant_topic = np.argmax(topic_distribution, axis=1)
df['DominantTopic'] = dominant_topic

In [21]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


def generate_sentence_embeddings(texts, model):
    return model.encode(texts, batch_size=32, show_progress_bar=True)


texts = df['ProcessedText'].tolist()
embeddings = generate_sentence_embeddings(texts, model)
embeddings_list = embeddings.tolist()
df['sentence_embeddings'] = embeddings_list
for i in range(len(embeddings[0])):
    df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])


  torch.utils._pytree._register_pytree_node(


Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].

In [22]:
import spacy

nlp = spacy.load("en_core_web_sm")


def get_named_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


df['NamedEntities'] = df['Text'].apply(get_named_entities)
all_labels = set([label for sublist in df['NamedEntities'] for _, label in sublist])

for label in all_labels:
    df[f'Count_{label}'] = 0


def update_label_counts(row):
    label_counts = {label: 0 for label in all_labels}

    for _, label in row['NamedEntities']:
        label_counts[label] += 1

    for label, count in label_counts.items():
        row[f'Count_{label}'] = count

    return row


df = df.apply(update_label_counts, axis=1)

  df['NamedEntities'] = df['Text'].apply(get_named_entities)
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0


In [23]:
for i in range(word2vec_model.vector_size):
    df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])

df.drop(['w2v_feature', 'NamedEntities'], axis=1, inplace=True)

  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i

In [24]:
df.to_csv("./data/tweets_features.csv", index=False)

In [25]:
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,w2v_90,w2v_91,w2v_92,w2v_93,w2v_94,w2v_95,w2v_96,w2v_97,w2v_98,w2v_99
0,0,2009-05-22 02:49:37,cheezburger,I'm sorry if my icon is messed up. Wanted a ne...,4,02:49:37,2009-05-22,111,0,False,...,0.576463,0.272287,-0.270659,0.398829,0.80697,0.539171,0.261823,-0.655965,0.179224,-0.13259
1,0,2009-06-03 04:27:27,teammartin,"Meanwhile, how the FUCK am I supposed to get 3...",2,04:27:27,2009-06-03,131,0,False,...,0.400095,0.262307,0.052329,0.443299,0.895985,0.565586,0.221301,-0.797784,0.020691,0.002357
2,0,2009-05-26 22:33:50,Roxie22,Great! @Daizz26 will disown me too!? LOL,1,22:33:50,2009-05-26,41,0,False,...,0.460558,0.378846,-0.171764,0.227248,0.303033,0.620672,0.146584,-0.61591,0.112132,-0.088989
3,0,2009-05-03 22:45:38,InFaMoUsHeRo,&quot;You have to be aware of the outcome of y...,6,22:45:38,2009-05-03,72,0,False,...,0.535103,0.073591,-0.307203,0.651506,0.309582,0.018715,-0.205674,-0.59533,0.333111,0.00145
4,1,2009-05-18 04:18:44,optimismlover,@musicjunkie11 ahahahaha really now? after a f...,0,04:18:44,2009-05-18,57,0,False,...,0.484578,0.324513,-0.259205,0.183831,0.745856,0.448028,0.359533,-0.555516,-0.176974,-0.091381
