In [None]:
# import all the necessary libraries to run the code
from gensim.corpora.dictionary import Dictionary
from IPython.display import clear_output
from gensim.test.utils import datapath
import pyLDAvis.gensim as gensimvis
from gensim.models import LdaModel
import gensim.corpora as corpora
from pymongo import MongoClient
import matplotlib.pyplot as plt
from operator import itemgetter
from pprint import pprint
import pandas as pd
import datetime
import pyLDAvis
import requests
import pymongo
import gensim
import tweepy
import string
import numpy
import spacy
import emoji
import nltk
import json
import time
import os
import re

In [None]:
# MongoDB connection plus one handle per collection accessed through `dbname`
dbname = 'vaccinitaly'

client = MongoClient('localhost', 27017)
_database = client[dbname]

tweet_collection = _database['labeled_tweets']
complete_collection = _database['vaccines_tweets']
no_vax_users = _database['no_vax_users']
no_vax_users_cleaned = _database['no_vax_users_cleaned']
last_100_no_vax_tweets_cleaned = _database['last_100_no_vax_tweets_per_user_cleaned']

<h3>1.1 - Retrieve the users associated with the classified tweets without users</h3>

In [None]:
def update_tweet_user(tweet_id, main_collection, user):
    """Write ``user`` into the ``user`` field of the document whose ``id`` is ``tweet_id``.

    Performs a single ``update_one`` with ``$set`` on ``main_collection``.
    """
    main_collection.update_one({"id": tweet_id}, {"$set": {"user": user}})

In [None]:
# labeled tweets matching {"user": None}, and how many there are (for the progress display)
_missing_user_filter = {"user": None}
all_labeled_tweets = tweet_collection.find(_missing_user_filter, no_cursor_timeout = True)
all_labeled_tweets_count = tweet_collection.count_documents(_missing_user_filter)

In [None]:
# retrieve users from the complete collection to complete the labeled collection
tweet_processed_count = 0
try:
    for tweet in all_labeled_tweets:
        complete_tweet = complete_collection.find_one({"id": tweet['id']})

        # find_one returns None when the tweet is absent from the complete collection;
        # such tweets are left untouched
        if complete_tweet is not None:
            update_tweet_user(tweet['id'], tweet_collection, complete_tweet['user'])

        tweet_processed_count += 1
        print("Pre-Processed ({:.2f} %) -".format(float(tweet_processed_count)/float(all_labeled_tweets_count) * 100), tweet_processed_count,"/", all_labeled_tweets_count)
        clear_output(wait=True)
finally:
    # the cursor was opened with no_cursor_timeout=True, so it must be closed
    # explicitly or it stays allocated on the server
    all_labeled_tweets.close()

<h3>1.2 - Collect from Twitter the users associated with the <b>classified</b> tweets without users</h3>

In [None]:
# Twitter credentials are read from the environment so that secrets are not saved
# inside the notebook; every value falls back to "" (the previous placeholder).
auth = tweepy.OAuthHandler(
    os.environ.get("TWITTER_CONSUMER_KEY", ""),
    os.environ.get("TWITTER_CONSUMER_SECRET", ""),
)
auth.set_access_token(
    os.environ.get("TWITTER_ACCESS_TOKEN", ""),
    os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", ""),
)
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [None]:
def update_user_look(tweet_id, main_collection, bool_):
    """Set ``looked_for_user`` to ``bool_`` on the document whose ``id`` is ``tweet_id``."""
    flag_update = {"$set": {"looked_for_user": bool_}}
    main_collection.update_one({"id": tweet_id}, flag_update)

In [None]:
# every labeled tweet, with the total used by the progress display
_every_labeled_tweet = {}
labeled_tweets = tweet_collection.find(_every_labeled_tweet, no_cursor_timeout = True)
labeled_tweets_count = tweet_collection.count_documents(_every_labeled_tweet)

In [None]:
# initialise "looked_for_user": True when the tweet already carries a user, False otherwise
processed_count = 0
for tweet in labeled_tweets:
    user_is_known = 'user' in tweet and tweet['user'] != None
    update_user_look(tweet['id'], tweet_collection, True if user_is_known else False)

    processed_count += 1
    done_pct = float(processed_count)/float(labeled_tweets_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), processed_count,"/", labeled_tweets_count)
    clear_output(wait=True)

In [None]:
def save_tweet_user(tweet_id, main_collection, user):
    """Store ``user`` on the tweet ``tweet_id`` and flag it as already looked for.

    Both fields are written by one ``update_one`` call with ``$set``.
    """
    fields = {"user": user, "looked_for_user": True}
    main_collection.update_one({"id": tweet_id}, {"$set": fields})

In [None]:
def collect_missing_users():
    """Fetch from Twitter the author of each labeled tweet that still lacks one.

    Iterates the labeled tweets matching ``{"user": None, "looked_for_user": False}``.
    For each tweet ``api.get_status`` is called: on success the user JSON is saved
    (which also sets ``looked_for_user``); on failure the tweet is only flagged as
    looked for, so it is not requested again. Progress is printed after every tweet.
    """
    pending_filter = {"user": None, "looked_for_user": False}
    labeled_tweets_without_user = tweet_collection.find(pending_filter, no_cursor_timeout = True)
    labeled_tweets_without_user_count = tweet_collection.count_documents(pending_filter)

    processed_count = 0

    try:
        for tweet in labeled_tweets_without_user:
            try:
                retr_tweet = api.get_status(tweet['id'])
                save_tweet_user(tweet['id'], tweet_collection, retr_tweet.user._json)
            except Exception:
                # best effort: a tweet that cannot be retrieved is just marked as looked for.
                # Exception (instead of a bare except) lets KeyboardInterrupt stop the loop.
                update_user_look(tweet['id'], tweet_collection, True)

            processed_count += 1
            print("Processed ({:.2f} %) -".format(float(processed_count)/float(labeled_tweets_without_user_count) * 100), processed_count,"/", labeled_tweets_without_user_count)
            clear_output(wait=True)
    finally:
        # cursors opened with no_cursor_timeout=True have to be closed explicitly
        labeled_tweets_without_user.close()

In [None]:
# run the collection; if it fails, report the error and retry once
# (the retry re-queries the tweets that are still pending)
try:
    collect_missing_users()
except Exception as error:
    # Exception instead of a bare except: interrupting the kernel must not trigger the retry
    print("collect_missing_users failed, retrying once:", repr(error))
    collect_missing_users()

<h3>1.3 - Build the collection of no-vax users</h3>

In [None]:
def update_user_no_vax_percentage(user_id, main_collection, no_vax_percentage):
    """Set ``no_vax_percentage`` on the document whose ``id`` is ``user_id``."""
    main_collection.update_one(
        {"id": user_id},
        {"$set": {"no_vax_percentage": no_vax_percentage}},
    )

In [None]:
# labeled tweets whose "user" is not null, and their number
_with_user_filter = {"user": {"$ne": None}}
all_labeled_tweets = tweet_collection.find(_with_user_filter)
all_labeled_tweets_count = tweet_collection.count_documents(_with_user_filter)

In [None]:
# golden hashtags
neutral = ["vaccini", "vaccino", "vaccinazioni", "vaccinazione", "novaccinoainovax", "vaccinocovid", "vaccinoanticovid"]
pro_vax = ["vaccinarsi", "vaccinerai", "vaccinare", "vaccineremo", "vacciniamoci", "vaccinerete", "vaccinareh24", "vaccinerò"]
no_vax = ["iononmivaccino", "iononmivaccinero", "iononsonounacavia"]

no_vax_limit = 0.8
tweet_processed_count = 0
no_vax_user_count = 0
no_vax_users_detected = []
analysed_users = []
# set mirror of analysed_users: constant-time membership test instead of
# scanning the list once per tweet
_analysed_user_ids = set()


def _has_no_vax_majority(labeled_tweet):
    """Return True when the majority of pred1..pred5 scores exceed ``no_vax_limit``.

    With five predictions the threshold ``round(5/2, 0)`` is 2.0, so at least
    three scores have to be above the limit.
    """
    all_pred = [labeled_tweet['pred1'], labeled_tweet['pred2'], labeled_tweet['pred3'], labeled_tweet['pred4'], labeled_tweet['pred5']]
    votes = sum(float(pred) > no_vax_limit for pred in all_pred)
    return votes > round(len(all_pred)/2, 0)


for tweet in all_labeled_tweets:
    user_id = tweet['user']['id']

    # ignore if already analysed
    if user_id in _analysed_user_ids:
        continue

    # the tweet counts as no-vax when most prediction scores are above no_vax_limit
    if _has_no_vax_majority(tweet):
        # add the user to the already analysed users
        analysed_users.append(user_id)
        _analysed_user_ids.add(user_id)

        no_vax_users.insert_one(tweet['user'])

        count_all = 0
        count_no_vax = 0

        # go through all the labeled tweets of this user
        for user_tweet in tweet_collection.find({'user.id': user_id}):
            # only tweets with at least one hashtag take part in the percentage
            if len(user_tweet['hashtags']) > 0:
                count_all += 1

                # no-vax when it carries a golden no-vax hashtag or the predictions say so
                has_golden_hashtag = any(no_vax_hash in user_tweet['hashtags'] for no_vax_hash in no_vax)
                if has_golden_hashtag or _has_no_vax_majority(user_tweet):
                    count_no_vax += 1

        # compute "no-vaxity" percentage (0 when no tweet with hashtags was found)
        no_vax_perc = 0
        if count_all != 0:
            no_vax_perc = round(float(count_no_vax)/float(count_all),2)

        # update user
        update_user_no_vax_percentage(user_id, no_vax_users, no_vax_perc)

    # count processed tweets (tweets of already analysed users are not counted)
    tweet_processed_count += 1
    print("Pre-Processed ({:.2f} %) -".format(float(tweet_processed_count)/float(all_labeled_tweets_count) * 100), tweet_processed_count,"/", all_labeled_tweets_count)
    clear_output(wait=True)

<h3>2 - Collect the last 100 tweets shared by the users classified as no-vax</h3>

In [None]:
# setup content for tweet collection
# the bearer token comes from the environment so it is not stored in the notebook;
# it falls back to "" (the previous placeholder)
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")
field_list = "attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld"
search_url = "https://api.twitter.com/2/tweets/search/all"

# number of tweets requested per user; also part of the collection name
tweet_limit = 100
# dbname is reused here so the database name is defined in one place only
last_100_no_vax_tweets = client[dbname]['last_' + str(tweet_limit) + '_no_vax_tweets_per_user']

In [None]:
def create_headers(bearer_token):
    """Return the HTTP headers dict carrying ``bearer_token`` as a Bearer authorization."""
    return {"Authorization": f"Bearer {bearer_token}"}

In [None]:
def connect_to_endpoint(search_url, headers, params=None):
    """Send a GET request and return ``(parsed JSON body, response headers)``.

    Args:
        search_url: URL to request.
        headers: HTTP headers sent with the request.
        params: optional query parameters. Defaults to None so that callers whose
            URL already contains the query string can omit it.
    """
    response = requests.request("GET", search_url, headers=headers, params=params)

    return response.json(), response.headers

In [None]:
def mark_as_processed(tweet_id, main_collection):
    """Set ``processed`` to True on the document whose ``id`` is ``tweet_id``."""
    main_collection.update_one({"id": tweet_id}, {"$set": {"processed": True}})

In [None]:
# all the detected no-vax users and their number
_every_user = {}
user_count = no_vax_users.count_documents(_every_user)
user_ids = no_vax_users.find(_every_user, no_cursor_timeout = True)

In [None]:
total_collected = 0
processed_count = 0

headers = create_headers(bearer_token)

# collect last 100 tweets for users
for user in user_ids:
    current_collected = 0
    processed = False

    if 'processed' not in user or not user['processed']:

        if last_100_no_vax_tweets.count_documents({"author_id": str(user['id'])}) == 0:
            processed = True
            from_ = 'from: {}'.format(user['id'])
            query_params = {'query': from_, 'max_results': tweet_limit, 'tweet.fields': field_list}

            json_response, response_headers = connect_to_endpoint(search_url, headers, query_params)
            remaining = response_headers.get('x-rate-limit-remaining')

            if "status" not in json_response and remaining is not None and int(remaining) > 0:
                # a response without "meta"/"result_count" is treated as "no tweets"
                if json_response.get('meta', {}).get('result_count', 0) > 0:

                    for el in json_response["data"]:
                        last_100_no_vax_tweets.insert_one(el)
                        current_collected += 1
                        total_collected += 1

                # flag the user only once the request has actually succeeded, so a
                # rate-limited or failed request is picked up again on the next run
                mark_as_processed(user['id'], no_vax_users)
                print("Collected Tweets =", current_collected,"Total Tweets =", total_collected)
            elif remaining is not None:
                # rate limited: wait until the window resets (never a negative time)
                reset_at = response_headers.get('x-rate-limit-reset')
                sleep_time = 0
                if reset_at is not None:
                    sleep_time = max(int(reset_at) - int(time.time()), 0)
                print("Sleeping for",sleep_time + 1,"s")
                time.sleep(sleep_time + 1)
                print("I'm Awake")
        else:
            # tweets of this user are already stored: nothing to request
            mark_as_processed(user['id'], no_vax_users)

    processed_count += 1
    print("Processed ({:.2f} %) -".format(float(processed_count)/float(user_count) * 100), processed_count,"/", user_count)

    # pause one second after every request that was sent
    if processed:
        time.sleep(1)

    clear_output(wait=True)

<h3>2.1 - Collect the original tweets of the retweets of the users classified as no-vax</h3>

In [None]:
def create_url(ids):
    """Build the tweets lookup URL for ``ids`` (a comma-separated string of tweet ids).

    The requested tweet fields are taken from the module-level ``field_list``.
    """
    query_string = "&".join(("ids=" + ids, "tweet.fields=" + field_list))
    return "https://api.twitter.com/2/tweets?" + query_string

In [None]:
def add_collected_original(tweet_id, main_collection, original_tweet):
    """Store ``original_tweet`` in the ``original_tweet`` field of the document ``tweet_id``."""
    main_collection.update_one(
        {"id": tweet_id},
        {"$set": {"original_tweet": original_tweet}},
    )

In [None]:
def mark_suspended_account_tweet(tweet_id, main_collection):
    """Set ``suspended`` to True on the document whose ``id`` is ``tweet_id``."""
    main_collection.update_one({"id": tweet_id}, {"$set": {"suspended": True}})

In [None]:
def mark_deleted_account_tweet(tweet_id, main_collection):
    """Set ``deleted`` to True on the document whose ``id`` is ``tweet_id``."""
    main_collection.update_one({"id": tweet_id}, {"$set": {"deleted": True}})

In [None]:
# collected tweets that reference another tweet, and their number
_has_reference_filter = {"referenced_tweets": {"$ne": None}}
retweets = last_100_no_vax_tweets.find(_has_reference_filter, no_cursor_timeout = True)
retweets_count = last_100_no_vax_tweets.count_documents(_has_reference_filter)

In [None]:
headers = create_headers(bearer_token)

request_ids = ""
processed_count = 0
ids_count = 0
mappings = []


def _lookup_batch(batch_ids, batch_mappings):
    """Request the tweets in ``batch_ids`` and attach each one to the retweets waiting for it.

    ``batch_ids`` is the comma-separated id string sent to the API and
    ``batch_mappings`` the list of ``{'retweet_id', 'original_tweet_id', ...}`` dicts
    describing which retweet asked for which original.

    Returns True when the response was usable. Otherwise, when rate-limit headers
    are present, sleeps until the window resets and returns False.
    """
    # params is passed explicitly (None): the query string is already part of the URL
    json_response, response_headers = connect_to_endpoint(create_url(batch_ids), headers, None)
    remaining = response_headers.get('x-rate-limit-remaining')

    if "status" in json_response or remaining is None or int(remaining) <= 0:
        if remaining is not None:
            reset_at = response_headers.get('x-rate-limit-reset')
            sleep_time = 0
            if reset_at is not None:
                sleep_time = max(int(reset_at) - int(time.time()), 0)
            print("Sleeping for",sleep_time + 1,"s")
            time.sleep(sleep_time + 1)
            print("I'm Awake")
        return False

    # original tweet id -> ids of the retweets that reference it
    waiting = {}
    for mapping in batch_mappings:
        waiting.setdefault(mapping['original_tweet_id'], []).append(mapping['retweet_id'])

    for el in json_response.get("data") or []:
        # pop: each original is attached once even if the API returns it twice
        for retweet_id in waiting.pop(el['id'], []):
            add_collected_original(retweet_id, last_100_no_vax_tweets, el)

    for err in json_response.get("errors") or []:
        # NOTE(review): err['value'] looks like the id that was requested (the original
        # tweet), while the markers match documents by their own "id" — confirm whether
        # the referencing retweets should be marked instead.
        # suspended account
        if err['title'] == "Authorization Error":
            mark_suspended_account_tweet(err['value'], last_100_no_vax_tweets)

        # deleted account
        if err['title'] == "Not Found Error":
            mark_deleted_account_tweet(err['value'], last_100_no_vax_tweets)

    return True


# collect the original tweet from the referenced tweet
try:
    for rt in retweets:
        processed = False

        if 'original_tweet' not in rt or not rt['original_tweet']:

            if request_ids != "":
                request_ids += ","

            request_ids += rt['referenced_tweets'][0]['id']
            ids_count += 1

            mappings.append({'retweet_id': rt['id'], 'original_tweet_id': rt['referenced_tweets'][0]['id'], 'elaborated': 0})

        # the lookup is sent in batches of 100 ids
        if ids_count == 100:
            # a rate-limited attempt has already slept until the reset: retry once
            processed = _lookup_batch(request_ids, mappings) or _lookup_batch(request_ids, mappings)

            # start a new batch in any case; retweets of a failed batch keep no
            # "original_tweet" and are therefore picked up again on the next run
            ids_count = 0
            request_ids = ""
            mappings = []

        processed_count += 1
        print("Processed ({:.2f} %) -".format(float(processed_count)/float(retweets_count) * 100), processed_count,"/", retweets_count)

        if processed:
            time.sleep(1)

        clear_output(wait=True)

    # the last batch normally holds fewer than 100 ids: send it as well
    if ids_count > 0:
        _lookup_batch(request_ids, mappings) or _lookup_batch(request_ids, mappings)
        ids_count = 0
        request_ids = ""
        mappings = []
finally:
    # cursors opened with no_cursor_timeout=True have to be closed explicitly
    retweets.close()

<h3>3 - Integrate the bot score associated with the users from botometer</h3>

In [None]:
def is_bot_update(author_id, bot_score, main_collection):
    """Store ``bot_score`` and the derived ``is_bot`` flag on the document ``author_id``.

    ``is_bot`` is True when ``bot_score`` is greater than or equal to 0.5.
    """
    flagged_as_bot = True if bot_score >= 0.5 else False
    bot_fields = {"user_bot_score": bot_score, "is_bot": flagged_as_bot}
    main_collection.update_one({"id": author_id}, {"$set": bot_fields})

In [None]:
# CSV with the bot scores: the first row is the header, the columns of interest
# are user_id and user_bot_score
file_name = 'no_vax_users_bot_score.csv'
csv = pd.read_csv(file_name, header=0)
df = pd.DataFrame(csv, columns=['user_id', 'user_bot_score'])

# total number of no-vax users, used by the progress display
user_count = no_vax_users.count_documents({})

In [None]:
# write the bot score of every CSV row onto the matching no-vax user
processed_users = 0
for row in csv.itertuples():
    row_user_id = int(row.user_id)
    row_score = float(row.user_bot_score)
    is_bot_update(row_user_id, row_score, no_vax_users)
    processed_users += 1

    done_pct = float(processed_users)/float(user_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), processed_users,"/", user_count)
    clear_output(wait=True)

<h3>4 - Define whether a user is a no-vax or not depending on the collected content</h3>

In [None]:
def update_collected_tweet_count(author_id, count, main_collection):
    """Set ``collected_tweet_count`` to ``count`` on the document whose ``id`` is ``author_id``."""
    main_collection.update_one(
        {"id": author_id},
        {"$set": {"collected_tweet_count": count}},
    )

In [None]:
# consider all the no-vax users for which I collected the data
no_vax_users_retr = no_vax_users.find({})
user_count = no_vax_users.count_documents({})

In [None]:
processed_users = 0
no_vax_tweets_authors = []

# store, for each supposed no-vax user, how many of their tweets were collected
for user in no_vax_users_retr:
    author_filter = {"author_id": str(user['id'])}
    count_tweets = last_100_no_vax_tweets.count_documents(author_filter)

    # remember the users that have at least one collected tweet
    if count_tweets > 0:
        no_vax_tweets_authors.append(user)

    update_collected_tweet_count(user['id'], count_tweets, no_vax_users)

    processed_users += 1
    done_pct = float(processed_users)/float(user_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), processed_users,"/", user_count)
    clear_output(wait=True)

In [None]:
def set_tweet_frequency(author_id, main_collection, freq):
    """Set ``max_tweet_frequency_per_minute`` to ``freq`` on the document ``author_id``."""
    main_collection.update_one(
        {"id": author_id},
        {"$set": {"max_tweet_frequency_per_minute": freq}},
    )

In [None]:
# fresh cursor over every no-vax user (the previous one has been consumed) and the total
_every_user = {}
no_vax_users_retr = no_vax_users.find(_every_user)
user_count = no_vax_users.count_documents(_every_user)

In [None]:
def _max_tweets_per_minute(created_at_values):
    """Return the largest number of tweets found in a single one-minute window.

    ``created_at_values`` are strings in the "%Y-%m-%dT%H:%M:%S.%fZ" format, ordered
    as returned by the query (most recent first). Windows are built greedily: the
    first remaining timestamp closes a window that starts one minute before it;
    every timestamp inside the window is counted and discarded, then the next
    window is built from what is left. Returns 0 for an empty input.
    """
    # parse every timestamp once instead of on each comparison
    remaining = [datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ") for value in created_at_values]
    one_minute = datetime.timedelta(minutes = 1)
    best = 0

    while remaining:
        window_end = remaining[0]
        window_start = window_end - one_minute
        outside = [ts for ts in remaining if not (window_start <= ts <= window_end)]
        best = max(best, len(remaining) - len(outside))
        remaining = outside

    return best


# compute the sharing frequency for each user
processed_users = 0
for user in no_vax_users_retr:
    if user['collected_tweet_count'] > 0:
        # get all the original tweets of the user (retweets are ignored), most recent first
        user_tweets = last_100_no_vax_tweets.find({"author_id": str(user['id']), "referenced_tweets": {"$eq": None}}).sort("created_at", pymongo.DESCENDING)
        created_at_values = [tweet['created_at'] for tweet in user_tweets]

        # users with only retweets get no frequency
        if len(created_at_values) > 0:
            set_tweet_frequency(user['id'], no_vax_users, _max_tweets_per_minute(created_at_values))

    processed_users += 1
    # user_count is the total computed together with the cursor; the previous name
    # (empty_no_vax_users_count) was never defined and raised NameError
    print("Processed ({:.2f} %) -".format(float(processed_users)/float(user_count) * 100), processed_users,"/", user_count)
    clear_output(wait=True)

In [None]:
def ignore_user(author_id, main_collection, boolean):
    """Set ``ignored`` to ``boolean`` on the document whose ``id`` is ``author_id``."""
    main_collection.update_one({"id": author_id}, {"$set": {"ignored": boolean}})

In [None]:
def ignore_user_tweet(tweet_id, main_collection):
    """Set ``ignored`` to True on the document whose ``id`` is ``tweet_id``."""
    main_collection.update_one({"id": tweet_id}, {"$set": {"ignored": True}})

In [None]:
all_no_vax_users = no_vax_users.find({})

# a user is ignored when at least one of these conditions holds
_ignore_filter = {
    "$or": [
        {"collected_tweet_count": 0},
        {"verified": True},
        {"is_bot": True},
        {"max_tweet_frequency_per_minute": {"$gt": 2}},
        {"no_vax_percentage": {"$lt": 0.6}},
    ]
}
to_be_ignored_users = no_vax_users.find(_ignore_filter)
to_be_ignored_users_count = no_vax_users.count_documents(_ignore_filter)

In [None]:
# A user is ignored when it has no shared tweets, is marked as bot, is verified,
# tweets more than 2 times per minute or has a no-vax percentage below 0.6.
processed_users = 0

# first reset the flag on every user
for user in all_no_vax_users:
    ignore_user(user['id'], no_vax_users, False)

# then flag the matching users together with all of their collected tweets
for user in to_be_ignored_users:
    ignore_user(user['id'], no_vax_users, True)

    ignored_author_tweets = last_100_no_vax_tweets.find({"author_id": str(user['id'])})
    for user_tweet in ignored_author_tweets:
        ignore_user_tweet(user_tweet['id'], last_100_no_vax_tweets)

    processed_users += 1
    done_pct = float(processed_users)/float(to_be_ignored_users_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), processed_users,"/", to_be_ignored_users_count)
    clear_output(wait=True)

<h3>5 - Data cleaning step for topic analysis</h3>

In [None]:
# download the NLTK "stopwords" resource before reading the Italian list from it
nltk.download('stopwords')
# Italian stopwords; text_cleaner filters its tokens against this list
stopword = nltk.corpus.stopwords.words('italian')

In [None]:
# users whose "ignored" flag is False, and their number
_kept_user_filter = {"ignored": False}
no_vax_user_keep = no_vax_users.find(_kept_user_filter)
no_vax_user_keep_count = no_vax_users.count_documents(_kept_user_filter)

In [None]:
# copy the kept users and all of their collected tweets into the "cleaned"
# collections, to simplify the analysis
processed_count = 0
for nvuk in no_vax_user_keep:
    no_vax_users_cleaned.insert_one(nvuk)

    kept_author_filter = {"author_id": str(nvuk['id'])}
    for tw in last_100_no_vax_tweets.find(kept_author_filter):
        last_100_no_vax_tweets_cleaned.insert_one(tw)

    processed_count += 1
    done_pct = float(processed_count)/float(no_vax_user_keep_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), processed_count,"/", no_vax_user_keep_count)
    clear_output(wait=True)

In [None]:
def text_cleaner(tweet):
    """Extract mentions, URLs, emojis and hashtags from ``tweet`` and build cleaned text.

    Args:
        tweet: raw tweet text.

    Returns:
        A tuple of eight elements, in this order:
        mentions (``@name``), URLs, emoji characters, hashtags (``#tag``),
        tokens of the text without hashtags, digits and stopwords,
        tokens of the text with hashtag words kept (stopwords removed),
        lowercase punctuation-free text without hashtags,
        lowercase punctuation-free text with hashtag words kept.
    """
    mention_pattern = "@[A-Za-z0-9]+"
    url_pattern = r"(?:\@|http?\://|https?\://|www)\S+"
    hashtag_pattern = "#[^ ]*"

    # remove quotes (mentions)
    quotes = re.findall(mention_pattern, tweet)
    tweet = re.sub(mention_pattern, "", tweet)

    # remove http links
    urls = re.findall(url_pattern, tweet)
    tweet = re.sub(url_pattern, "", tweet)

    # remove emojis
    # NOTE(review): emoji.UNICODE_EMOJI appears to exist only in older releases of
    # the emoji package — confirm the installed version.
    emojis = [c for c in tweet if c in emoji.UNICODE_EMOJI['en'] or c in emoji.UNICODE_EMOJI['it']]
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI['en'] and c not in emoji.UNICODE_EMOJI['it'])

    # drop everything from the start of a word up to its (last) apostrophe, e.g.
    # "l'amore" -> "amore". The whitespace before the word is kept: removing it
    # too glued the previous word to the next one ("ciao l'amore" -> "ciaoamore").
    tweet = re.sub(r"\S*'", "", tweet)

    # remove punctuation including hashtag sign, lowercase and collapse whitespace
    tweet_with_hash_text = ''.join(c.lower() for c in tweet if c not in string.punctuation)
    tweet_with_hash_text = ' '.join(tweet_with_hash_text.split())

    # tokenize and remove stopwords (hashtag words are still part of the text)
    token_text_with_hash_no_stop = [
        word for word in re.split(r'\W+', tweet_with_hash_text)
        if word not in stopword and word != ''
    ]

    # remove hashtags
    hashtags = re.findall(hashtag_pattern, tweet)
    tweet = re.sub(hashtag_pattern, "", tweet).strip()

    # same normalisation on the text without hashtags
    no_punct_tweet = ''.join(c.lower() for c in tweet if c not in string.punctuation)
    no_punct_tweet = ' '.join(no_punct_tweet.split())

    # drop digits, tokenize and remove stopwords
    text_rc = re.sub('[0-9]+', '', no_punct_tweet.lower())
    token_text = [word for word in re.split(r'\W+', text_rc) if word not in stopword and word != '']

    return quotes, urls, emojis, hashtags, token_text, token_text_with_hash_no_stop, no_punct_tweet, tweet_with_hash_text

In [None]:
def update_tweet(tweet_id, main_collection, extracted_quotes, extracted_urls, extracted_emojis, extracted_hashtags, extracted_token_text_no_hashtags, extracted_token_text_with_hashtags, extracted_no_punct_tweet, extracted_tweet_with_hash_text):
    """Store the results of the text cleaning on the document whose ``id`` is ``tweet_id``.

    Every ``extracted_*`` argument is written to a field of the document and
    ``has_text_processed`` is set to True, all in a single ``update_one`` call.
    """
    cleaned_fields = dict(
        extracted_quotes=extracted_quotes,
        extracted_urls=extracted_urls,
        extracted_emojis=extracted_emojis,
        extracted_hashtags=extracted_hashtags,
        extracted_token_text_no_hashtags=extracted_token_text_no_hashtags,
        extracted_token_text_with_hashtags=extracted_token_text_with_hashtags,
        extracted_clean_tweet_no_hashtags=extracted_no_punct_tweet,
        extracted_clean_tweet_with_hashtags=extracted_tweet_with_hash_text,
        has_text_processed=True,
    )
    main_collection.update_one({"id": tweet_id}, {"$set": cleaned_fields})

In [None]:
def add_has_text_processed(tweet_id, main_collection):
    """Set ``has_text_processed`` to False on the document whose ``id`` is ``tweet_id``."""
    main_collection.update_one({"id": tweet_id}, {"$set": {"has_text_processed": False}})

In [None]:
# every tweet of the cleaned collection, with the total for the progress display
_every_tweet = {}
all_tweets = last_100_no_vax_tweets_cleaned.find(_every_tweet, no_cursor_timeout = True)
all_tweets_count = last_100_no_vax_tweets_cleaned.count_documents(_every_tweet)

In [None]:
# add the "has_text_processed" field (False) to every cleaned tweet
pre_tweet_processed_count = 0
for tweet in all_tweets:
    add_has_text_processed(tweet['id'], last_100_no_vax_tweets_cleaned)

    pre_tweet_processed_count += 1
    done_pct = float(pre_tweet_processed_count)/float(all_tweets_count) * 100
    print("Pre-Processed ({:.2f} %) -".format(done_pct), pre_tweet_processed_count,"/", all_tweets_count)
    clear_output(wait=True)

In [None]:
# cleaned tweets whose text has not been processed yet, and their number
all_tweets_count = last_100_no_vax_tweets_cleaned.count_documents({"has_text_processed": False})
all_tweets = last_100_no_vax_tweets_cleaned.find(
    {"has_text_processed": {"$eq": False}},
    no_cursor_timeout = True,
)

In [None]:
# data cleaning for each tweet: retweets are cleaned from the text of their
# original tweet (and skipped when it was not collected), other tweets from their own text
tweet_processed_count = 0
for tweet in all_tweets:
    is_retweet = 'referenced_tweets' in tweet and tweet['referenced_tweets'][0]['type'] == 'retweeted'

    if is_retweet:
        has_text = 'original_tweet' in tweet
        text_holder = tweet['original_tweet'] if has_text else None
    else:
        has_text = True
        text_holder = tweet

    if has_text:
        cleaned_fields = text_cleaner(text_holder['text'])
        update_tweet(tweet['id'], last_100_no_vax_tweets_cleaned, *cleaned_fields)

    tweet_processed_count += 1
    done_pct = float(tweet_processed_count)/float(all_tweets_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), tweet_processed_count,"/", all_tweets_count)
    clear_output(wait=True)

<h5><b>Before running the following cells, make sure that the previous step has completed successfully</b></h5>

In [None]:
# tweets still flagged as not processed after the cleaning step, and their number
all_tweets_count = last_100_no_vax_tweets_cleaned.count_documents({"has_text_processed": False})
all_tweets = last_100_no_vax_tweets_cleaned.find(
    {"has_text_processed": {"$eq": False}},
    no_cursor_timeout = True,
)

In [None]:
# delete, one by one, the tweets that could not be processed
tweet_processed_count = 0
for tweet in all_tweets:
    unprocessed_filter = {"id": tweet["id"]}
    last_100_no_vax_tweets_cleaned.delete_one(unprocessed_filter)

    tweet_processed_count += 1
    done_pct = float(tweet_processed_count)/float(all_tweets_count) * 100
    print("Processed ({:.2f} %) -".format(done_pct), tweet_processed_count,"/", all_tweets_count)
    clear_output(wait=True)

<h5><b>-----------------------------------------------------------------------------------------------------------------------------</b></h5>

In [None]:
# spaCy pipeline "it_core_news_lg"; used further down to read the part-of-speech
# tag and the lemma of every token
nlp = spacy.load("it_core_news_lg")

In [None]:
def reset_has_text_processed(tweet_id, main_collection):
    """Put ``has_text_processed`` back to False on the document whose ``id`` is ``tweet_id``."""
    reset_update = {"$set": {"has_text_processed": False}}
    main_collection.update_one({"id": tweet_id}, reset_update)

In [None]:
# every cleaned tweet (all of them get their flag reset) and the total
_every_tweet = {}
to_be_reset_tweets = last_100_no_vax_tweets_cleaned.find(_every_tweet)
to_be_reset_tweets_count = last_100_no_vax_tweets_cleaned.count_documents(_every_tweet)

In [None]:
# reset the text processed field
processed_tweets = 0
for tweet in to_be_reset_tweets:
    # the flag is reset in the same collection the tweets are read from; the name
    # used before (new_tweets_cleaned_collection) was never defined and raised NameError
    reset_has_text_processed(tweet['id'], last_100_no_vax_tweets_cleaned)

    processed_tweets += 1
    print("Processed ({:.2f} %) -".format(float(processed_tweets)/float(to_be_reset_tweets_count) * 100), processed_tweets,"/", to_be_reset_tweets_count)
    clear_output(wait=True)

In [None]:
def set_useful_tokens_for_ta(tweet_id, main_collection, tokens_for_topic_analysis_no_verbs, tokens_for_topic_analysis):
    """Store the topic-analysis tokens of one tweet and flag it as processed.

    Both token sequences are de-duplicated through a ``set`` before being
    stored, so the order of the stored lists is not guaranteed.

    Args:
        tweet_id: value of the ``id`` field of the tweet to update.
        main_collection: collection object exposing ``update_one``.
        tokens_for_topic_analysis_no_verbs: tokens stored under
            ``tokens_for_topic_analysis_no_verbs``.
        tokens_for_topic_analysis: tokens stored under
            ``tokens_for_topic_analysis``.
    """
    unique_no_verbs = list(set(tokens_for_topic_analysis_no_verbs))
    unique_tokens = list(set(tokens_for_topic_analysis))

    main_collection.update_one(
        {"id": tweet_id},
        {
            "$set": {
                "tokens_for_topic_analysis_no_verbs": unique_no_verbs,
                "tokens_for_topic_analysis": unique_tokens,
                "has_text_processed": True,
            }
        },
    )

In [None]:
# select the tweets still flagged as not text-processed and count them
# (the count is used for the progress report of the extraction loop)
all_tweets = last_100_no_vax_tweets_cleaned.find({"has_text_processed": {"$eq": False}})
all_tweets_count = last_100_no_vax_tweets_cleaned.count_documents({"has_text_processed": {"$eq": False}})
# tokens whose lemma is in this list are skipped by the extraction loop;
# the last entry starts with an escaped zero-width joiner (U+200D)
# NOTE(review): that entry seems to be followed by an invisible character — confirm before editing it
excluded_words = ['cose', 'cosa', 'merda', 'cazzo', 'coglioni', 'culo', '\u200d️']

In [None]:
# extract the words for the topic analysis: for every tweet build two lists,
# one with nouns and proper nouns only and one that also holds verb lemmas
processed_tweets = 0
for processed_tweets, tweet in enumerate(all_tweets, start=1):
    token_array = []
    token_w_verbs_array = []
    doc = nlp(tweet['extracted_clean_tweet_no_hashtags'])

    for token in doc:
        # skip one-character tokens and tokens whose lemma is excluded
        if len(token.text) <= 1 or token.lemma_ in excluded_words:
            continue

        if token.pos_ in ("PROPN", "NOUN"):
            # nouns keep their surface form and go in both lists
            token_array.append(token.text)
            token_w_verbs_array.append(token.text)
        elif token.pos_ == "VERB" and token.lemma_ not in ("essere", "avere"):
            # verbs are stored as lemma, leaving out "essere" and "avere"
            token_w_verbs_array.append(token.lemma_)

    set_useful_tokens_for_ta(tweet['id'], last_100_no_vax_tweets_cleaned, token_array, token_w_verbs_array)

    print(
        "Processed ({:.2f} %) -".format(processed_tweets / all_tweets_count * 100),
        processed_tweets,
        "/",
        all_tweets_count,
    )
    clear_output(wait=True)

<h3>6 - Topic analysis</h3>

In [None]:
# Select the tweets flagged as text-processed, plus their number
# (used for the progress report of the collection loop).
tweets_for_topic_analysis = last_100_no_vax_tweets_cleaned.find(
    filter={"has_text_processed": {"$eq": True}},
    no_cursor_timeout=True,
)
tweets_for_topic_analysis_count = last_100_no_vax_tweets_cleaned.count_documents(
    filter={"has_text_processed": {"$eq": True}}
)

In [None]:
texts = []
processed_tweets = 0

# collect the token lists of the tweets that have more than two tokens
# and whose language is italian
for processed_tweets, tweet in enumerate(tweets_for_topic_analysis, start=1):
    if len(tweet['tokens_for_topic_analysis_no_verbs']) > 2:
        if tweet['lang'] == "it":
            texts.append(tweet['tokens_for_topic_analysis_no_verbs'])

    print(
        "Processed ({:.2f} %) -".format(processed_tweets / tweets_for_topic_analysis_count * 100),
        processed_tweets,
        "/",
        tweets_for_topic_analysis_count,
    )
    clear_output(wait=True)

In [None]:
# prepare the corpus: build the token dictionary from the collected texts,
# then convert every token list into its bag-of-words form
id2word = corpora.Dictionary(documents=texts)
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# topic analysis with varying number of topics: one LDA model is trained and
# scored for every value of range(lower_range, upper_range, step), which means
# that upper_range itself is not included
lower_range, upper_range, step = 3, 30, 1
coherence_values = []
model_list = []

# options shared by every model of the sweep
lda_options = dict(corpus=corpus, id2word=id2word, random_state=1)

for n_t in range(lower_range, upper_range, step):
    print("Topics", n_t)

    model = gensim.models.LdaMulticore(num_topics=n_t, **lda_options)
    model_list.append(model)

    # c_v coherence of the model just trained, computed on the same texts
    coherencemodel = gensim.models.CoherenceModel(
        model=model,
        texts=texts,
        coherence='c_v',
    )
    coherence_values.append(coherencemodel.get_coherence())

    clear_output(wait=True)

In [None]:
# plot the coherence score obtained for each number of topics
x = range(lower_range, upper_range, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# the labels must be a sequence of strings: a bare string is iterated one
# character at a time, so the legend entry would read just "c"
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# folder where the LDA models and the coherence scores are written
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere
folder = "C:/Users/Andrea/Desktop/Vaccinitaly Analysis/models/all_lda_3_to_30/"
# model files are named <file_name><number of topics><extension>
file_name = "lda_"
extension = ".model"

In [None]:
# store all the models and the coherence scores

# make sure the destination folder exists before writing into it
os.makedirs(folder, exist_ok=True)

# name every model file after the number of topics it was trained with,
# following the same range used by the training loop
for i, lda_model in zip(range(lower_range, upper_range, step), model_list):
    temp_file = datapath(folder + file_name + str(i) + extension)
    lda_model.save(temp_file)

# one coherence score per line, in the same order as the models;
# the context manager closes the file even if a write fails
with open(folder + "coherence_scores_3_to_30.txt", "w") as f:
    for c_v in coherence_values:
        f.write(str(c_v) + "\n")

<h3>6.1 - Best model in Topic analysis</h3>

<h5><b>Before running the following steps, configure them according to the previous results</b></h5>

In [None]:
# enable pyLDAvis output inside the notebook before any visualisation is displayed
pyLDAvis.enable_notebook()

In [None]:
# setup the model to consider: number of topics of the saved LDA model
# that is loaded and analysed in the following cells
n_topic = 16

# setup the number of tweets to display to help with topic naming
# (at most this many representative tweets are kept for each topic)
MAX_TWEETS = 5

# setup the number of words to print for each topic
n_words_per_topic = 15

In [None]:
# get the model with the desired amount of topics
# (n_topic is an int, so it has to be converted before building the file name)
model_path = datapath(folder + file_name + str(n_topic) + extension)
best_model = LdaModel.load(model_path)

# with formatted=False every topic is (topic_id, [(word, weight), ...]);
# only the words are kept
x = best_model.show_topics(num_topics=n_topic, num_words=n_words_per_topic, formatted=False)
topics_words = [(topic_id, [word for word, _ in word_weights]) for topic_id, word_weights in x]

# print the top n_words_per_topic words
for topic_id, words in topics_words:
    print(topic_id, words)

In [None]:
# reload the data if needed, i.e. when the dictionary or the corpus are not in
# memory (for instance after a kernel restart); the names are looked up through
# globals() so that a missing variable does not raise a NameError
# NOTE(review): the rebuilt dictionary must map tokens to the same ids the saved
# model was trained with — confirm, or consider using the model's own id2word
if globals().get("id2word") is None or globals().get("corpus") is None:
    tweets_for_topic_analysis = last_100_no_vax_tweets_cleaned.find({"has_text_processed": {"$eq": True}}, no_cursor_timeout = True)
    tweets_for_topic_analysis_count = last_100_no_vax_tweets_cleaned.count_documents({"has_text_processed": {"$eq": True}})

    texts = []
    processed_tweets = 0

    # collect the tweets with more than two tokens and whose language is italian
    for tweet in tweets_for_topic_analysis:
        if len(tweet['tokens_for_topic_analysis_no_verbs']) > 2 and tweet['lang'] == "it":
            texts.append(tweet['tokens_for_topic_analysis_no_verbs'])

        processed_tweets += 1
        print("Processed ({:.2f} %) -".format(float(processed_tweets)/float(tweets_for_topic_analysis_count) * 100), processed_tweets,"/", tweets_for_topic_analysis_count)
        clear_output(wait=True)

    # build the dictionary and the corpus only once the texts have been collected
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# display the topics on graph: prepare the visualisation data from the chosen
# model, the corpus and the dictionary, then render it as the cell output
vis_data = gensimvis.prepare(
    topic_model=best_model,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.display(vis_data)

In [None]:
def update_topic(tweet_id, main_collection, topic):
    """Store the topic assigned to one tweet.

    Issues a single ``update_one`` on ``main_collection`` that sets the
    ``topic`` field of the document whose ``id`` equals ``tweet_id``.

    Args:
        tweet_id: value of the ``id`` field of the tweet to update.
        main_collection: collection object exposing ``update_one``.
        topic: value written to the ``topic`` field, stored as given.
    """
    main_collection.update_one(
        {"id": tweet_id},
        {"$set": {"topic": topic}},
    )

In [None]:
# Select every tweet of the cleaned collection, plus their number
# (used for the progress report of the topic-assignment loop).
tweets_wo_topic = last_100_no_vax_tweets_cleaned.find(
    filter={},
    no_cursor_timeout=True,
)
tweets_wo_topic_count = last_100_no_vax_tweets_cleaned.count_documents(filter={})

In [None]:
# assign to every tweet its most probable topic as [topic id, probability],
# or ["Undefined", "Undefined"] when no dominant topic can be determined
processed_tweets = 0
for tw_t in tweets_wo_topic:
    assigned_topic = ["Undefined", "Undefined"]

    # the language is read from the tweet being processed (tw_t), not from a
    # variable left over by a previous loop
    if len(tw_t['tokens_for_topic_analysis_no_verbs']) > 0 and tw_t['lang'] == "it":
        tw_t_id = id2word.doc2bow(tw_t['tokens_for_topic_analysis_no_verbs'])
        doc_topics = best_model.get_document_topics(tw_t_id)

        # a dominant topic exists only when the returned probabilities are not
        # all equal; empty and single-entry results stay "Undefined"
        # NOTE(review): a single returned topic is also left "Undefined" — confirm this is intended
        if any(prob != doc_topics[0][1] for _, prob in doc_topics):
            val = max(doc_topics, key=itemgetter(1))
            assigned_topic = [val[0], float(val[1])]

    # write to the same collection the tweets were read from, which is also
    # the one queried afterwards for the "topic" field
    update_topic(tw_t['id'], last_100_no_vax_tweets_cleaned, assigned_topic)

    processed_tweets += 1
    print("Processed ({:.2f} %) -".format(float(processed_tweets)/float(tweets_wo_topic_count) * 100), processed_tweets,"/", tweets_wo_topic_count)
    clear_output(wait=True)

In [None]:
# Select the tweets whose first "topic" element is not "Undefined", plus their
# number (used for the progress report of the loop that consumes the cursor).
tweets_collection_topic = last_100_no_vax_tweets_cleaned.find(
    filter={"topic.0": {"$ne": "Undefined"}},
    no_cursor_timeout=True,
)
tweets_collection_topic_count = last_100_no_vax_tweets_cleaned.count_documents(
    filter={"topic.0": {"$ne": "Undefined"}}
)

In [None]:
# find the most representative tweets per topic: top_elements[k] holds up to
# MAX_TWEETS entries [topic, text] for topic k, where topic is [id, probability]
top_elements = [[] for _ in range(n_topic)]
processed_tweets = 0

# find the top MAX_TWEETS representative tweets for each topic
for tw_top in tweets_collection_topic:
    if tw_top["lang"] == "it":
        topic_tweets = top_elements[tw_top["topic"][0]]
        tweet_text = tw_top["extracted_clean_tweet_no_hashtags"]

        # skip duplicates: the stored second item is the tweet text, so the
        # comparison has to be made on the text as well
        if tweet_text not in [el[1] for el in topic_tweets]:
            topic_tweets.append([tw_top["topic"], tweet_text])

            if len(topic_tweets) > MAX_TWEETS:
                # keep only the MAX_TWEETS entries with the highest probability
                topic_tweets.sort(key=lambda el: el[0][1], reverse=True)
                del topic_tweets[MAX_TWEETS:]

    processed_tweets += 1
    print("Processed ({:.2f} %) -".format(float(processed_tweets)/float(tweets_collection_topic_count) * 100), processed_tweets,"/", tweets_collection_topic_count)
    clear_output(wait=True)

In [None]:
# print, one topic per line, the most representative tweets collected above
# to help with naming the topics
for topic_tweets in top_elements:
    print(topic_tweets)

<h1>Outcomes</h1>
<h4>Topic 0 - Vaccini e Società</h4>
<b>Words: </b><span>['persone', 'anni', 'scuola', 'ragazzi', 'vita', 'casa', 'servizio', 'euro']</span>
<h4>Topic 1 - Felicissima Sera - Pio e Amedeo (Aprile 2021)</h4>
<b>Words: </b><span>['bisogno', 'faccia', 'pio', 'amedeo', 'anni', 'francesco', 'casa', 'mondo', 'pensiero']</span>
<h4>Topic 2 - Coronavirus</h4>
<b>Words: </b><span>['maggio', 'covid', 'aprile', 'italia', 'morti', 'ore', 'giorno', 'libertà', 'dati', 'casi']</span>
<h4>Topic 3 - Fedez Contro Rai</h4>
<b>Words: </b><span>['rai', 'sera', 'fedez', 'sistema', 'censura', 'nomi', 'palco', 'telefonata', 'tempo', 'canale']</span>
<h4>Topic 4 - Scandalo Grillo</h4>
<b>Words: </b><span>['figlio', 'grillo', 'gente', 'conte', 'ragazza', 'anni', 'padre', 'parola', 'draghi']</span>
<h4>Topic 5 - Altro</h4>
<b>Words: </b><span>['giorno', 'mondo', 'mamma', 'settimana', 'vaccino', 'mare', 'compleanno', 'anno', 'raga', 'anni']</span>
<h4>Topic 6 - Repubblica e Società</h4>
<b>Words: </b><span>['presidente', 'anni', 'prof', 'repubblica', 'solidarietà', 'bocca', 'piacere', 'profilo']</span>
<h4>Topic 7 - Lega Salvini - Destra</h4>
<b>Words: </b><span>['salvini', 'piazza', 'milano', 'forza', 'lega', 'canzone', 'claudio', 'meloni', 'italia']</span>
<h4>Topic 8 - Conflitto Israele</h4>
<b>Words: </b><span>['israele', 'palestinesi', 'articolo', 'guerra', 'bambini', 'parte', 'gaza', 'pace', 'giornalisti']</span>
<h4>Topic 9 - Calcio Italiano</h4>
<b>Words: </b><span>['punto', 'juve', 'squadra', 'anni', 'partita', 'milan', 'calcio', 'champions', 'napoli', 'gol']</span>
<h4>Topic 10 - Legge DDL Zan</h4>
<b>Words: </b><span>['persona', 'ddl', 'soldi', 'legge', 'zan', 'speranza', 'parte', 'anni', 'ministro']</span>
<h4>Topic 11 - Politica</h4>
<b>Words: </b><span>['testa', 'governo', 'anni', 'draghi', 'giulia', 'giorni', 'processo', 'paese', 'consiglio', 'salvini']</span>
<h4>Topic 12 - Elezioni Sindaco Roma</h4>
<b>Words: </b><span>['roma', 'pd', 'programma', 'raggi', 'sindaco', 'lavoro', 'anni', 'maestro', 'm5s', 'lavoratori']</span>
<h4>Topic 13 - Renzi - Italiaviva</h4>
<b>Words: </b><span>['renzi', 'diretta', 'servizi', 'minuti', 'matteo', 'video', 'incontro', 'anno', 'report', 'attività']</span>
<h4>Topic 14 - Festa della Donna</h4>
<b>Words: </b><span>['grazie', 'giornata', 'vita', 'foto', 'donna', 'casa', 'anni', 'parte', 'cuore', 'genere']</span>
<h4>Topic 15 - Attività Sociali - Riaperture</h4>
<b>Words: </b><span>['anni', 'calcio', 'italia', 'libro', 'maggio', 'concerto', 'anno', 'club']</span>