In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./data/tweets.csv", encoding='latin')
df['Date'] = pd.to_datetime(df['Date'], format='%a %b %d %H:%M:%S PDT %Y')
df = df[df['Date'] < pd.Timestamp('2009-05-28')]
# df = df.head(50)

try:
    df.drop(['ID', 'flag'], axis=1, inplace=True)
except KeyError:
    pass
df['Weekday'] = df['Date'].dt.weekday
df['Time'] = df['Date'].dt.time
df['Full_date'] = df['Date'].dt.date

df['Target'] = df['Target'].map({0: 0, 4: 1})
df['Length'] = df['Text'].apply(len)

import re


def count_hashtags(text):
    return len([c for c in text if c == '#'])


def count_mentions(text):
    return len([c for c in text if c == '@'])


def count_exclamation_marks(text):
    return len([c for c in text if c == '!'])


def detect_emoticons(text):
    emoticon_pattern = r'(:\)|:\(|;\)|:D|:P|:\||:\-\)|:\-\(|;\-\)|:‑D|:‑P|:‑\||<3)'
    return re.findall(emoticon_pattern, text)


df['Hashtags'] = df['Text'].apply(count_hashtags)
df['HasHashtags'] = df['Hashtags'] > 0
values_df = pd.DataFrame()
values_df['Hashtags'] = df['Hashtags'].value_counts()

df['Mentions'] = df['Text'].apply(count_mentions)
df['HasMentions'] = df['Mentions'] > 0
values_df['Mentions'] = df['Mentions'].value_counts()

df['ExclamationMarks'] = df['Text'].apply(count_exclamation_marks)
df['HasExclamationMarks'] = df['ExclamationMarks'] > 0
values_df['ExclamationMarks'] = df['ExclamationMarks'].value_counts()

df['Emoticons'] = df['Text'].apply(detect_emoticons)
emoticons = df[['Target', 'Emoticons']].explode('Emoticons').dropna().groupby(['Emoticons']).agg(
    ['mean', 'count']).reset_index()

In [3]:
import textstat
df['FRE'] = df['Text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['GFI'] = df['Text'].apply(lambda x: textstat.gunning_fog(x))

In [4]:
from textblob import TextBlob

def analyze_textblob_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity

df[['Polarity_TB', 'Subjectivity_TB']] = df['Text'].apply(lambda tweet: pd.Series(analyze_textblob_sentiment(tweet)))

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def analyze_vader_sentiment(tweet):
    scores = sia.polarity_scores(tweet)
    return scores['pos'], scores['neu'], scores['neg'], scores['compound']

df[['Positive_VADER', 'Neutral_VADER', 'Negative_VADER', 'Compound_VADER']] = df['Text'].apply(lambda tweet: pd.Series(analyze_vader_sentiment(tweet)))


In [6]:
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,HasExclamationMarks,Emoticons,FRE,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,111,0,False,...,True,[],86.2,4.2,0.0,0.0,0.0,0.697,0.303,-0.75
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,89,0,False,...,False,[],104.64,3.6,0.5,0.5,0.167,0.833,0.0,0.4939
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,47,0,False,...,False,[],112.09,4.0,0.2,0.4,0.179,0.5,0.321,-0.25
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,111,0,False,...,False,[],89.75,6.61,-0.625,1.0,0.0,0.759,0.241,-0.6597
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,29,0,False,...,False,[],117.16,2.0,0.2,0.4,0.0,1.0,0.0,0.0


In [7]:
df.to_csv("./Data/tweets_features.csv", index=False)

In [8]:
df['Hour'] = df['Date'].dt.hour
df['skewed_hour_dist'] = df['Hour'].apply(lambda x: (16 - x) / 16 if x < 16 else (x - 16) / 8)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,FRE,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,111,0,False,...,86.2,4.2,0.0,0.0,0.0,0.697,0.303,-0.75,22,0.75
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,89,0,False,...,104.64,3.6,0.5,0.5,0.167,0.833,0.0,0.4939,22,0.75
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,47,0,False,...,112.09,4.0,0.2,0.4,0.179,0.5,0.321,-0.25,22,0.75
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,111,0,False,...,89.75,6.61,-0.625,1.0,0.0,0.759,0.241,-0.6597,22,0.75
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,29,0,False,...,117.16,2.0,0.2,0.4,0.0,1.0,0.0,0.0,22,0.75


In [9]:
#It is kind of forced but we can try doing similar thing with Weekday
df['skewed_week_dist'] = df['Weekday'].apply(lambda x: (2 - x) / 2 if x < 2 else (x - 2) / 4)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,111,0,False,...,4.2,0.0,0.0,0.0,0.697,0.303,-0.75,22,0.75,1.0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,89,0,False,...,3.6,0.5,0.5,0.167,0.833,0.0,0.4939,22,0.75,1.0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,47,0,False,...,4.0,0.2,0.4,0.179,0.5,0.321,-0.25,22,0.75,1.0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,111,0,False,...,6.61,-0.625,1.0,0.0,0.759,0.241,-0.6597,22,0.75,1.0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,29,0,False,...,2.0,0.2,0.4,0.0,1.0,0.0,0.0,22,0.75,1.0


In [10]:
#We have seen that after certain date (2009-05-29 07:33:45) sentiment of all tweets is negative. Of course it is a feature
#strictly specific to our data and according to common sense shouldn't be used in the model if we would like to predict the sentiment
#of any text. However if testing is going to be done on our set alone it is a meaningful piece of information.
y = df[df['Target'] == 1]['Full_date'].max()
df['is_after_certain_day'] = df['Full_date'].apply(lambda x: 0 if x <= y else 1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,111,0,False,...,0.0,0.0,0.0,0.697,0.303,-0.75,22,0.75,1.0,0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,89,0,False,...,0.5,0.5,0.167,0.833,0.0,0.4939,22,0.75,1.0,0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,47,0,False,...,0.2,0.4,0.179,0.5,0.321,-0.25,22,0.75,1.0,0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,111,0,False,...,-0.625,1.0,0.0,0.759,0.241,-0.6597,22,0.75,1.0,0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,29,0,False,...,0.2,0.4,0.0,1.0,0.0,0.0,22,0.75,1.0,0


In [11]:
#Thaks to one of correlation matrices from EDA (or to having at least 1 brain cell) we know that the number of hashtags, mentions or
#exclamation marks is highly correlated with its binary counterpart quantifier. From the same analysis we see that their correlation with 
# target is marginal however mentions and exclamation marks seem to have more to say than hashtags, that is why we 
#will leave out hashtags but not the other two.
def has_men(text):
    for c in text:
        if c == '@':
            return 1
    return 0


def has_exc(text):
    for c in text:
        if c == '!':
            return 1
        return 0


df['has_mentions'] = df['Text'].apply(has_men)
df['has_exclamation_marks'] = df['Text'].apply(has_exc)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,111,0,False,...,0.0,0.697,0.303,-0.75,22,0.75,1.0,0,0,0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,89,0,False,...,0.167,0.833,0.0,0.4939,22,0.75,1.0,0,1,0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,47,0,False,...,0.179,0.5,0.321,-0.25,22,0.75,1.0,0,0,0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,111,0,False,...,0.0,0.759,0.241,-0.6597,22,0.75,1.0,0,1,0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,29,0,False,...,0.0,1.0,0.0,0.0,22,0.75,1.0,0,1,0


In [12]:
#And now for the main course tf-idf, we will set a limit o a 100 words so as not to everything too high in computation cost.
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = df['Text'].tolist()

#We shall also remove certain english stop words because they are very frequent but do not carry much sentiment.
vectoriser = TfidfVectorizer(max_features=100, lowercase=True, stop_words='english')
tfidf_matrix = vectoriser.fit_transform(text_data)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectoriser.get_feature_names_out())
df_tfidf = pd.concat([df, tfidf_df], axis=1)
df_tfidf.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,watching,way,week,weekend,wish,work,working,yay,yeah,yes
0,0.0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0.0,22:19:49,2009-04-06,111.0,0.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0.0,22:19:53,2009-04-06,89.0,0.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0.0,22:19:57,2009-04-06,47.0,0.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0.0,22:19:57,2009-04-06,111.0,0.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0.0,22:20:00,2009-04-06,29.0,0.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

custom_stopwords = set(stopwords.words('english')) - {'no', 'not', 'nor', "isn't", "wasn't", "aren't", "don't",
                                                      "didn't", "cannot", "couldn't", "shouldn't"}


def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = emoji.demojize(tweet, delimiters=("", " "))
    tokens = word_tokenize(tweet)
    tokens = [word for word in tokens if word not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


df['ProcessedText'] = df['Text'].apply(preprocess_tweet)

In [14]:
from gensim.models import Word2Vec

texts = df['ProcessedText'].tolist()

tokenized_texts = [text.split() for text in texts]

word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)


def tweet_vector(tweet, model):
    words = tweet.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(word_vectors, axis=0)


df["w2v"] = np.array([np.mean(tweet_vector(tweet, word2vec_model)) for tweet in texts])

In [15]:
import tiktoken
import numpy as np

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def get_tiktoken_embedding(text):
    embeddings = encoding.encode(text)
    if len(embeddings) == 0:
        return 0, 0, 0
    return np.mean(embeddings, axis=0), np.max(embeddings, axis=0), np.min(embeddings, axis=0)


df[["tt_embedding_mean", "tt_embedding_max", "tt_embedding_min"]] = np.array(
    [get_tiktoken_embedding(text) for text in texts])

In [16]:
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


def get_weighted_embedding(text):
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    embeddings = []
    weights = []
    for word, tag in tagged_tokens:
        if word in stop_words:
            continue
        embedding = encoding.encode(word)
        weight = 1.5 if tag.startswith('JJ') else 1.0
        embeddings.append(np.average(embedding) * weight)

    if len(embeddings) == 0:
        return 0
    weighted_embeddings = np.average(embeddings, axis=0)
    return weighted_embeddings


df["tt_embedding_weighted"] = np.array([get_weighted_embedding(text) for text in texts])

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = df['ProcessedText']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
tfidf_array = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()
def top_n_words(row, feature_names, n=3):
    top_n_idx = np.argsort(row)[-n:]
    top_n_values = [feature_names[i] for i in top_n_idx]
    return top_n_values
df[["Word_1", "Word_2", "Word_3"]] = [top_n_words(row, feature_names) for row in tfidf_array]

In [18]:
def get_tiktoken_embedding(text):
    embeddings = encoding.encode(text)
    if len(embeddings) == 0:
        return 0, 0, 0
    return np.mean(embeddings, axis=0), np.max(embeddings, axis=0), np.min(embeddings, axis=0)

for i in range(1, 4):
    words = df[f"Word_{i}"]
    df[[f"Word_{i}_embedding_mean", f"Word_{i}_embedding_max", f"Word_{i}_embedding_min"]] = np.array(
        [get_tiktoken_embedding(word) for word in words])

In [19]:
df

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Word_3,Word_1_embedding_mean,Word_1_embedding_max,Word_1_embedding_min,Word_2_embedding_mean,Word_2_embedding_max,Word_2_embedding_min,Word_3_embedding_mean,Word_3_embedding_max,Word_3_embedding_min
0,0,2009-04-06 22:19:49,scotthamilton,is upset that he can't update his Facebook by ...,0,22:19:49,2009-04-06,111,0,False,...,result,21617.0,21617.0,21617.0,603.000000,751.0,455.0,1407.0,1407.0,1407.0
1,0,2009-04-06 22:19:53,mattycus,@Kenichan I dived many times for the ball. Man...,0,22:19:53,2009-04-06,89,0,False,...,50,6766.0,6766.0,6766.0,4047.000000,4047.0,4047.0,1135.0,1135.0,1135.0
2,0,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire,0,22:19:57,2009-04-06,47,0,False,...,fire,67733.0,67733.0,67733.0,2664.000000,2664.0,2664.0,11029.0,11029.0,11029.0
3,0,2009-04-06 22:19:57,Karoli,"@nationwideclass no, it's not behaving at all....",0,22:19:57,2009-04-06,111,0,False,...,mad,2201.0,2201.0,2201.0,4151.000000,4151.0,4151.0,20920.0,20920.0,20920.0
4,0,2009-04-06 22:20:00,joy_wolf,@Kwesidei not the whole crew,0,22:20:00,2009-04-06,29,0,False,...,whole,271.5,455.0,88.0,1962.000000,1962.0,1962.0,67733.0,67733.0,67733.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033282,1,2009-05-27 07:27:38,LizAnjos,"@NintendoRed sometimes, you just have to let go!",2,07:27:38,2009-05-27,49,0,False,...,sometimes,3427.0,3427.0,3427.0,1169.000000,1169.0,1169.0,57753.0,57753.0,57753.0
1033283,1,2009-05-27 07:27:38,TiernanDouieb,@misswiz ah. Perhaps point her in the directio...,2,07:27:38,2009-05-27,64,0,False,...,point,606.0,606.0,606.0,1494.000000,1494.0,1494.0,2837.0,2837.0,2837.0
1033284,1,2009-05-27 07:27:38,ExpertDater,@SingleInThe604 Nice to meet you u too! Im gla...,2,07:27:38,2009-05-27,70,0,False,...,article,64510.0,64510.0,64510.0,1591.333333,4215.0,268.0,7203.0,7203.0,7203.0
1033285,1,2009-05-27 07:27:38,bpende,"@ChrisCavs : I love you, man... you making me ...",2,07:27:38,2009-05-27,71,0,False,...,making,58234.0,58234.0,58234.0,1543.000000,1543.0,1543.0,28936.0,28936.0,28936.0


In [20]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(tfidf_matrix)
topic_distribution = lda.transform(tfidf_matrix)
dominant_topic = np.argmax(topic_distribution, axis=1)
df['DominantTopic'] = dominant_topic

In [27]:
df.to_csv("./Data/tweets_features.csv", index=False)