In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [55]:
df = pd.read_csv("./data/tweets.csv", encoding='latin')
df['Date'] = pd.to_datetime(df['Date'], format='%a %b %d %H:%M:%S PDT %Y')
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

train_size = int(len(df_shuffled) * 0.7)
df = df_shuffled[:train_size]
test_set = df_shuffled[train_size:]

In [56]:
df = df[:10000]

In [57]:
try:
    df.drop(['ID', 'flag'], axis=1, inplace=True)
except KeyError:
    pass
df['Weekday'] = df['Date'].dt.weekday
df['Time'] = df['Date'].dt.time
df['Full_date'] = df['Date'].dt.date

df['Target'] = df['Target'].map({0: 0, 4: 1})
df['Length'] = df['Text'].apply(len)

import re

def count_hashtags(text):
    return len([c for c in text if c == '#'])


def count_mentions(text):
    return len([c for c in text if c == '@'])


def count_exclamation_marks(text):
    return len([c for c in text if c == '!'])


def detect_emoticons(text):
    emoticon_pattern = r'(:\)|:\(|;\)|:D|:P|:\||:\-\)|:\-\(|;\-\)|:‑D|:‑P|:‑\||<3)'
    return re.findall(emoticon_pattern, text)


df['Hashtags'] = df['Text'].apply(count_hashtags)
df['HasHashtags'] = df['Hashtags'] > 0
values_df = pd.DataFrame()
values_df['Hashtags'] = df['Hashtags'].value_counts()

df['Mentions'] = df['Text'].apply(count_mentions)
df['HasMentions'] = df['Mentions'] > 0
values_df['Mentions'] = df['Mentions'].value_counts()

df['ExclamationMarks'] = df['Text'].apply(count_exclamation_marks)
df['HasExclamationMarks'] = df['ExclamationMarks'] > 0
values_df['ExclamationMarks'] = df['ExclamationMarks'].value_counts()

df['Emoticons'] = df['Text'].apply(detect_emoticons)
emoticons = df[['Target', 'Emoticons']].explode('Emoticons').dropna().groupby(['Emoticons']).agg(
    ['mean', 'count']).reset_index()

In [58]:
import textstat
df['FRE'] = df['Text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['GFI'] = df['Text'].apply(lambda x: textstat.gunning_fog(x))

In [59]:
from textblob import TextBlob

def analyze_textblob_sentiment(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity

df[['Polarity_TB', 'Subjectivity_TB']] = df['Text'].apply(lambda tweet: pd.Series(analyze_textblob_sentiment(tweet)))

In [60]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def analyze_vader_sentiment(tweet):
    scores = sia.polarity_scores(tweet)
    return scores['pos'], scores['neu'], scores['neg'], scores['compound']

df[['Positive_VADER', 'Neutral_VADER', 'Negative_VADER', 'Compound_VADER']] = df['Text'].apply(lambda tweet: pd.Series(analyze_vader_sentiment(tweet)))


In [61]:
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,HasExclamationMarks,Emoticons,FRE,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,False,[],82.81,2.2,0.0,0.0,0.0,1.0,0.0,0.0
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,False,[],96.69,3.4,0.5,0.5,0.29,0.71,0.0,0.5994
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,True,[],100.24,2.0,0.75,0.9,0.506,0.494,0.0,0.6239
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,False,[],95.17,4.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,True,[],102.61,4.4,0.0,0.066667,0.0,1.0,0.0,0.0


In [62]:
df.to_csv("./Data/tweets_features.csv", index=False)

In [63]:
df['Hour'] = df['Date'].dt.hour
df['skewed_hour_dist'] = df['Hour'].apply(lambda x: (16 - x) / 16 if x < 16 else (x - 16) / 8)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,FRE,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,82.81,2.2,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,96.69,3.4,0.5,0.5,0.29,0.71,0.0,0.5994,2,0.875
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,100.24,2.0,0.75,0.9,0.506,0.494,0.0,0.6239,7,0.5625
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,95.17,4.0,0.0,0.0,0.0,1.0,0.0,0.0,2,0.875
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,102.61,4.4,0.0,0.066667,0.0,1.0,0.0,0.0,13,0.1875


In [64]:
#It is kind of forced but we can try doing similar thing with Weekday
df['skewed_week_dist'] = df['Weekday'].apply(lambda x: (2 - x) / 2 if x < 2 else (x - 2) / 4)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,GFI,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,2.2,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0,0.25
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,3.4,0.5,0.5,0.29,0.71,0.0,0.5994,2,0.875,0.25
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,2.0,0.75,0.9,0.506,0.494,0.0,0.6239,7,0.5625,1.0
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,4.0,0.0,0.0,0.0,1.0,0.0,0.0,2,0.875,0.25
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,4.4,0.0,0.066667,0.0,1.0,0.0,0.0,13,0.1875,0.5


In [65]:
#We have seen that after certain date (2009-05-29 07:33:45) sentiment of all tweets is negative. Of course it is a feature
#strictly specific to our data and according to common sense shouldn't be used in the model if we would like to predict the sentiment
#of any text. However if testing is going to be done on our set alone it is a meaningful piece of information.
y = df[df['Target'] == 1]['Full_date'].max()
df['is_after_certain_day'] = df['Full_date'].apply(lambda x: 0 if x <= y else 1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Polarity_TB,Subjectivity_TB,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0,0.25,1
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,0.5,0.5,0.29,0.71,0.0,0.5994,2,0.875,0.25,0
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,0.75,0.9,0.506,0.494,0.0,0.6239,7,0.5625,1.0,0
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,0.0,0.0,0.0,1.0,0.0,0.0,2,0.875,0.25,1
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,0.0,0.066667,0.0,1.0,0.0,0.0,13,0.1875,0.5,1


In [66]:
#Thaks to one of correlation matrices from EDA (or to having at least 1 brain cell) we know that the number of hashtags, mentions or
#exclamation marks is highly correlated with its binary counterpart quantifier. From the same analysis we see that their correlation with 
# target is marginal however mentions and exclamation marks seem to have more to say than hashtags, that is why we 
#will leave out hashtags but not the other two.
def has_men(text):
    for c in text:
        if c == '@':
            return 1
    return 0


def has_exc(text):
    for c in text:
        if c == '!':
            return 1
        return 0


df['has_mentions'] = df['Text'].apply(has_men)
df['has_exclamation_marks'] = df['Text'].apply(has_exc)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,Positive_VADER,Neutral_VADER,Negative_VADER,Compound_VADER,Hour,skewed_hour_dist,skewed_week_dist,is_after_certain_day,has_mentions,has_exclamation_marks
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,0.0,1.0,0.0,0.0,0,1.0,0.25,1,1,0
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,0.29,0.71,0.0,0.5994,2,0.875,0.25,0,1,0
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,0.506,0.494,0.0,0.6239,7,0.5625,1.0,0,0,0
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,0.0,1.0,0.0,0.0,2,0.875,0.25,1,0,0
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,0.0,1.0,0.0,0.0,13,0.1875,0.5,1,0,0


In [67]:
#And now for the main course tf-idf, we will set a limit o a 100 words so as not to everything too high in computation cost.
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = df['Text'].tolist()

#We shall also remove certain english stop words because they are very frequent but do not carry much sentiment.
vectoriser = TfidfVectorizer(max_features=50, lowercase=True, stop_words='english')
tfidf_matrix = vectoriser.fit_transform(text_data)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectoriser.get_feature_names_out())
df = pd.concat([df, tfidf_df], axis=1)
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,time,today,tomorrow,tonight,twitter,ve,want,week,wish,work
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,0.402267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.745327
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

custom_stopwords = set(stopwords.words('english')) - {'no', 'not', 'nor', "isn't", "wasn't", "aren't", "don't",
                                                      "didn't", "cannot", "couldn't", "shouldn't"}


def preprocess_tweet(tweet):
    tweet = str(tweet).lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = emoji.demojize(tweet, delimiters=("", " "))
    tokens = word_tokenize(tweet)
    tokens = [word for word in tokens if word not in custom_stopwords]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


df['ProcessedText'] = df['Text'].apply(preprocess_tweet)

In [69]:
from gensim.models import Word2Vec

texts = df['ProcessedText'].tolist()

tokenized_texts = [text.split() for text in texts]

word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)


def tweet_vector(tweet, model):
    words = tweet.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(word_vectors, axis=0)

df["w2v_feature"] = df['ProcessedText'].apply(lambda tweet: tweet_vector(tweet, word2vec_model))

In [70]:
df.head()

Unnamed: 0,Target,Date,User,Text,Weekday,Time,Full_date,Length,Hashtags,HasHashtags,...,tomorrow,tonight,twitter,ve,want,week,wish,work,ProcessedText,w2v_feature
0,0,2009-06-25 00:52:47,timofuchs,@grazzly maaaaaate will you be at Glasto? Not ...,3,00:52:47,2009-06-25,67,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,maaaaaate glasto not going blissfeilds,"[-0.105313756, 0.282348, 0.008500764, -0.01027..."
1,1,2009-05-14 02:06:48,markdavidson,@Monkeylover35 Yeah. I had to get away for a b...,3,02:06:48,2009-05-14,93,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,yeah get away bit time time healthy,"[-0.17041533, 0.4474091, 0.003736068, -0.02707..."
2,1,2009-05-04 07:14:48,Maekii,The World is just amazing!,0,07:14:48,2009-05-04,27,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,world amazing,"[-0.0858928, 0.23000425, -0.00084036443, -0.01..."
3,0,2009-06-25 02:35:25,MissElisee,Just picked my sister up from work. she's al...,3,02:35:25,2009-06-25,100,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.745327,picked sister work always grumpz work think ne...,"[-0.1504961, 0.40778694, 0.0033369728, -0.0226..."
4,0,2009-06-19 13:40:45,MissK20,I don't think changing my last name is meant t...,4,13:40:45,2009-06-19,52,0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,think changing last name meant,"[-0.11695038, 0.322793, 0.001300167, -0.020137..."


In [71]:
import nltk
from nltk.corpus import stopwords
import tiktoken
import numpy as np
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
stop_words = set(stopwords.words('english'))


def get_weighted_embedding(text):
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    embeddings = []
    for word, tag in tagged_tokens:
        if word in stop_words:
            continue
        embedding = encoding.encode(word)
        weight = 1.5 if tag.startswith('JJ') else 1.0
        embeddings.append(np.average(embedding) * weight)

    if len(embeddings) == 0:
        return 0
    weighted_embeddings = np.average(embeddings, axis=0)
    return weighted_embeddings


df["tt_embedding_weighted"] = np.array([get_weighted_embedding(text) for text in texts])

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = df['ProcessedText']
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
tfidf_array = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()
def top_n_words(row, feature_names, n=3):
    top_n_idx = np.argsort(row)[-n:]
    top_n_values = [feature_names[i] for i in top_n_idx]
    return top_n_values
df[["Word_1", "Word_2", "Word_3"]] = [top_n_words(row, feature_names) for row in tfidf_array]

In [73]:
def get_tiktoken_embedding(text):
    embeddings = encoding.encode(text)
    if len(embeddings) == 0:
        return 0, 0, 0
    return np.mean(embeddings, axis=0), np.max(embeddings, axis=0), np.min(embeddings, axis=0)

for i in range(1, 4):
    words = df[f"Word_{i}"]
    df[[f"Word_{i}_embedding_mean", f"Word_{i}_embedding_max", f"Word_{i}_embedding_min"]] = np.array(
        [get_tiktoken_embedding(word) for word in words])

In [74]:
from sklearn.decomposition import LatentDirichletAllocation


n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(tfidf_matrix)
topic_distribution = lda.transform(tfidf_matrix)
dominant_topic = np.argmax(topic_distribution, axis=1)
df['DominantTopic'] = dominant_topic

In [76]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_sentence_embeddings(texts, model):
    return model.encode(texts, batch_size=32, show_progress_bar=True)

texts = df['ProcessedText'].tolist()
embeddings = generate_sentence_embeddings(texts, model)
embeddings_list = embeddings.tolist()
df['sentence_embeddings'] = embeddings_list
for i in range(len(embeddings[0])):
    df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].apply(lambda x: x[i])
  df[f'embedding_{i}'] = df['sentence_embeddings'].

In [78]:
import spacy

nlp = spacy.load("en_core_web_sm")

def get_named_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['NamedEntities'] = df['Text'].apply(get_named_entities)
all_labels = set([label for sublist in df['NamedEntities'] for _, label in sublist])

for label in all_labels:
    df[f'Count_{label}'] = 0

def update_label_counts(row):
    label_counts = {label: 0 for label in all_labels}

    for _, label in row['NamedEntities']:
        label_counts[label] += 1

    for label, count in label_counts.items():
        row[f'Count_{label}'] = count

    return row

df = df.apply(update_label_counts, axis=1)

  df['NamedEntities'] = df['Text'].apply(get_named_entities)
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0
  df[f'Count_{label}'] = 0


In [80]:
for i in range(word2vec_model.vector_size):
    df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
    
df.drop(['w2v_feature', 'NamedEntities'], axis=1, inplace=True)

  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i])
  df[f'w2v_{i}'] = df['w2v_feature'].apply(lambda x: x[i

In [81]:
df.to_csv("./Data/tweets_features.csv", index=False)