In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import re
import math
import pickle
import ast
import matplotlib.pyplot as plt
import preprocessor as p
from sentence_transformers import SentenceTransformer

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
df = pd.read_csv('data/twitter_vaccination_dataset/master.csv')
df['conversation_id'] = df['conversation_id'].astype(int)
df['user_id'] = df['user_id'].astype(int)
df['id'] = df['id'].astype(int)

conversations1 = df[df['conversation_id']
                   .map(df['conversation_id']
                   .value_counts()) >= 5000]['conversation_id'].unique()

conversations2 = df[df['conversation_id']
                   .map(df['conversation_id']
                   .value_counts()) >= 5]['conversation_id'].unique()

conversations = set(conversations2) - set(conversations1)
df_replies = df.query('conversation_id in @conversations')

  df = pd.read_csv('data/twitter_vaccination_dataset/master.csv')


In [4]:
def preprocess(text):
    if 'RT' in text:
        return None
    cleaned_text = p.clean(text)
    cleaned_text = ' '.join(re.findall('[a-z]+', cleaned_text, flags=re.IGNORECASE)).lower()
    if cleaned_text == '':
        return None
    return cleaned_text

In [8]:
cleaned_tweets = [preprocess(tweet) for tweet in tqdm(df_replies['tweet'])]
df_replies['cleaned_tweet'] = cleaned_tweets
df_replies = df_replies.dropna(subset=['cleaned_tweet'])
df_replies['reply_to'] = df_replies['reply_to'].apply(ast.literal_eval)
df_replies = df_replies.reset_index(drop=True)

df_replies['embeddings'] = None
embeddings = model.encode(df_replies['cleaned_tweet'].values)

for i in tqdm(df_replies.index):
    df_replies['embeddings'][i] = embeddings[i]

with open('data/df_replies.pickle', 'wb') as f:
    pickle.dump(df_replies[['conversation_id',
                            'user_id',
                            'username',
                            'tweet',
                            'mentions',
                            'replies_count',
                            'retweets_count',
                            'likes_count',
                            'hashtags',
                            'reply_to',
                            'cleaned_tweet',
                            'embeddings'
                            ]], f)
df_replies

  0%|          | 0/1351128 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_replies['embeddings'][i] = embeddings[i]


Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,cleaned_tweet,embeddings
0,1194918582,1193730372,1.234260e+12,10/02/2009,12:08:53,E. Africa Standard Time,15820442,tomwhild,Tom Whild,,...,,,,,,,"[{'user_id': '15820442', 'username': 'tomwhild...",,amen stephen vaccination is so important i hat...,"[-0.0054706745, 0.09844217, 0.075048484, 0.013..."
1,1194814680,1193730372,1.234250e+12,10/02/2009,11:04:52,E. Africa Standard Time,2276201,sevitz,Citizen Sev 🤷🏻‍♂️,,...,,,,,,,"[{'user_id': '2276201', 'username': 'sevitz'},...",,vaccination scares are so bloody irresponsible...,"[0.04750795, 0.03788834, 0.0015141966, 0.04019..."
2,1194173719,1193730372,1.234230e+12,10/02/2009,05:45:13,E. Africa Standard Time,12488312,elbisivni,Charles 🇳🇱🇸🇷🇦🇺,,...,,,,,,,"[{'user_id': '12488312', 'username': 'elbisivn...",,this chap needs our support vaccination scares...,"[-0.0070269336, 0.043089196, 0.014950992, -0.0..."
3,1193850980,1193730372,1.234230e+12,10/02/2009,03:53:17,E. Africa Standard Time,7461972,atariageguy,atariageguy,,...,,,,,,,"[{'user_id': '7461972', 'username': 'atariageg...",,yes this is infuriating especially now that we...,"[-0.022819221, 0.02312067, 0.025248187, -0.012..."
4,1193784560,1193730372,1.234230e+12,10/02/2009,03:27:44,E. Africa Standard Time,15022458,weaselbacon,Carolyn Hastings,,...,,,,,,,"[{'user_id': '15022458', 'username': 'weaselba...",,thanks for that vaccination is one of my pet i...,"[-0.014918114, 0.041756548, 0.0010046107, -9.9..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1351123,1200000000000000000,964000000000000000,1.570000e+12,25/11/2019,11:27:11,E. Africa Standard Time,766000000000000000,pancaraccia,Mimosa,,...,,,,,,,"[{'user_id': '765943276377628673', 'username':...",,le virus de la variole se transmet exclusiveme...,"[0.03683498, 0.06865424, -0.013673961, -0.0267..."
1351124,1200000000000000000,941000000000000000,1.570000e+12,25/11/2019,05:26:07,E. Africa Standard Time,849805975,realchaim_rubin,Chaim❌Rubin He/Him,,...,,,,,,,"[{'user_id': '849805975', 'username': 'realCha...",,calling it mutilation is not convincing just m...,"[-0.0052796206, 0.015638433, 0.009222518, -0.0..."
1351125,1200000000000000000,961000000000000000,1.570000e+12,26/11/2019,15:31:47,E. Africa Standard Time,743000000000000000,suraiyahuss,Suraiya DM 🚫,,...,,,,,,,"[{'user_id': '742765890735869954', 'username':...",,in the english doctor edward jenner develops t...,"[0.007919503, 0.08331335, 0.031874426, -0.0441..."
1351126,1200000000000000000,941000000000000000,1.570000e+12,27/11/2019,01:54:30,E. Africa Standard Time,976000000000000000,dkingpower7,ProtectChildren,,...,,,,,,,"[{'user_id': '976478598117851136', 'username':...",,no circumcision is nothing like vaccination on...,"[0.07487217, 0.09335318, -0.043020695, 0.03961..."
