# Libraries

In [1]:
!pip install -U feel-it

Collecting feel-it
  Downloading feel_it-1.0.3-py2.py3-none-any.whl (5.8 kB)
Collecting transformers==4.3.3
  Downloading transformers-4.3.3-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 13.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 73.1 MB/s 
Installing collected packages: tokenizers, sacremoses, transformers, feel-it
Successfully installed feel-it-1.0.3 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.3.3


In [2]:
!pip install tweepy --upgrade

In [3]:
import itertools
import os
import re
import string
import time
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import tweepy
from matplotlib import pyplot as plt
from numpy.core.multiarray import result_type

from feel_it import SentimentClassifier

# Nltk Packages
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer

tweepy.__version__ # need version 4.5.0 (the download cell calls api.search_tweets)

'3.10.0'

In [4]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')
nltk.download('stopwords')      # read by stopwords.words('italian') in the pre-processing section
nltk.download('vader_lexicon')  # presumably for the analyzer built in sentiment_VADER -- confirm
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
# Mount Google Drive: every CSV in this notebook is read from / written to
# /content/gdrive/MyDrive/SMA_project.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


# Functions

In [7]:
def token_to_phrase(token_list):
    """Rebuild one string per tokenised phrase.

    Every token is prefixed with a single blank before being concatenated, so
    a non-empty phrase always starts with a leading space and an empty token
    list maps to the empty string.

    Parameters
    ----------
    token_list : iterable of iterables of str
        One inner iterable of tokens per phrase.

    Returns
    -------
    list of str
        The re-assembled phrases, in input order.
    """
    return ["".join(" " + word for word in tokens) for tokens in token_list]

In [8]:
def get_counter(df, lower = True):
  """Count how often each token occurs across a collection of token lists.

  Parameters
  ----------
  df : iterable of iterables of str
      One inner iterable of tokens per document.
  lower : bool, default True
      When equal to True the tokens are lower-cased before counting.

  Returns
  -------
  collections.Counter
      Token -> number of occurrences.
  """
  tokens = itertools.chain.from_iterable(df)
  # Equality test (not truthiness) on purpose: only True/1 switch lower-casing on.
  if lower == True:
    tokens = (token.lower() for token in tokens)
  return Counter(tokens)

In [9]:
def preprocessing(doc_text, count_lower = True, stop_words = None):
  """Clean a collection of tweets and summarise their tokens.

  For every text the function records whether it is a retweet, strips HTML
  tags, @mentions, URLs, ASCII punctuation and every word containing a digit,
  tokenises the remainder with ``TweetTokenizer`` and drops stop words
  (compared in lower case; the kept tokens retain their original case).

  Parameters
  ----------
  doc_text : iterable of str
      Raw tweet texts.
  count_lower : bool, default True
      Forwarded to ``get_counter``: lower-case tokens before counting.
  stop_words : iterable of str, optional
      Tokens to discard. Defaults to the module-level ``stop`` collection,
      which must exist when the function is *called*.

  Returns
  -------
  phrase_list : list of str
      Cleaned texts rebuilt by ``token_to_phrase`` (each starts with a blank).
  retweet : list of int
      1 when the raw text starts with ``'RT '``, 0 otherwise.
  count : collections.Counter
      Occurrences of the surviving tokens.
  """
  if stop_words is None:
    stop_words = stop                                                           #-- Notebook-level stop-word collection
  stop_words = set(stop_words)                                                  #-- Constant-time membership tests

  tokenizer = TweetTokenizer()                                                  #-- Tokenizer
  drop_punctuation = str.maketrans('', '', string.punctuation)                  #-- Built once, reused for every tweet

  retweet = []
  tokenized = []                                                                #-- One token list per tweet

  for phrase in doc_text:

    retweet.append(1 if phrase[0:3] == 'RT ' else 0)                            #-- 1 if retweet, 0 otherwise

    phrase = re.sub(r'<[^>]+>','', phrase)                                      #-- Remove HTML tags
    phrase = re.sub(r'(?:@[\w_]+)','', phrase)                                  #-- Remove mentions
    # phrase = re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",'', phrase)              #-- Remove hashtags (disabled)
    phrase = re.sub(r"http\S+", '', phrase)                                     #-- Remove URLs

    # NOTE(review): punctuation is deleted, not replaced by a blank, so words
    # joined only by punctuation get merged ("LEGA...SIETE" -> "LEGASIETE").
    phrase = phrase.translate(drop_punctuation)                                 #-- Remove punctuation
    # Raw string: "\S" / "\d" are invalid escapes in a plain string literal.
    # No digit survives this step, so the former extra r'\d+' pass was a no-op.
    phrase = re.sub(r"\S*\d\S*", "", phrase).strip()                            #-- Remove words containing digits

    tokens = tokenizer.tokenize(phrase)                                         #-- Tokenization
    tokenized.append([token for token in tokens if token.lower() not in stop_words])

  count = get_counter(tokenized, lower = count_lower)                           #-- Token counter

  phrase_list = token_to_phrase(tokenized)                                      #-- From token lists back to phrases

  return phrase_list, retweet, count

In [10]:
# Shared feel-it classifier instance; sentiment_BERT and the BERT cell call its predict().
sentiment_classifier = SentimentClassifier()

Downloading:   0%|          | 0.00/847 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/794k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/299 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [11]:
def sentiment_BERT(text, rest = 5000, classifier = None):
  """Label every text with the feel-it sentiment classifier.

  Classifier background:
  https://towardsdatascience.com/sentiment-analysis-and-emotion-recognition-in-italian-using-bert-92f5c8fe8a2

  Parameters
  ----------
  text : sized iterable of str
      Texts to classify (pandas Series, list, ...). Items are visited by
      position, so a Series does not need a 0..n-1 index.
  rest : int, default 5000
      A progress line is printed every ``rest`` items (item 0 included).
  classifier : object with ``predict(list_of_str) -> list``, optional
      Defaults to the module-level ``sentiment_classifier``.

  Returns
  -------
  list
      One label per input text, in input order.
  """
  if classifier is None:
    classifier = sentiment_classifier

  total = len(text)
  step = int(rest)

  labels = []
  for i, phrase in enumerate(text):
    # predict() takes a batch and returns a list; one text at a time here.
    labels.extend(classifier.predict([phrase]))

    if i % step == 0:
      print('Riga',i,'su',total)
  return labels

In [12]:
def sentiment_VADER(text, analyzer = None):
  """Score every phrase with a VADER-style analyzer.

  The original note pointed at https://pypi.org/project/vader-multi/ .
  NOTE(review): that package is not installed by this notebook, whereas the
  NLTK ``vader_lexicon`` is downloaded, so the default analyzer is NLTK's
  ``SentimentIntensityAnalyzer`` -- confirm this is the intended one.

  Parameters
  ----------
  text : iterable of str
      Phrases to score.
  analyzer : object with ``polarity_scores(str) -> dict``, optional
      Must return the keys ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``.
      Defaults to a new ``SentimentIntensityAnalyzer()``.

  Returns
  -------
  neg, neu, pos, compound : list
      The four scores of each phrase, in input order.
  giudizio : list of str
      ``"Positive"`` when compound >= 0.05, ``"Negative"`` when
      compound <= -0.05, ``"Neutral"`` otherwise.
  """
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()

  neg = []
  neu = []
  pos = []
  compound = []
  giudizio = []

  for phrase in text:
    scores = analyzer.polarity_scores(phrase)

    if scores['compound'] >= 0.05:
      verdict = "Positive"
    elif scores['compound'] <= -0.05:
      verdict = "Negative"
    else:
      verdict = "Neutral"

    neg.append(scores['neg'])
    neu.append(scores['neu'])
    pos.append(scores['pos'])
    compound.append(scores['compound'])
    giudizio.append(verdict)

  return neg, neu, pos, compound, giudizio

# Download Tweets

In [None]:
# Twitter API credentials come from the environment so that secrets are never
# saved inside the notebook; an unset variable falls back to the empty string.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

# OAuth 1a user-context handler wrapped in the API client used by the download cell.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [None]:
# Search query: any of the hashtags about the presidential election.
# NOTE(review): `hash` shadows the built-in of the same name; kept so the
# notebook-level variable names stay unchanged.
hash = '("#Quirinale2022" OR "#Quirinale" OR "#PdR" OR "#PresidenzaDellaRepubblica" OR "#PresidenteDellaRepubblica")'

# Seed frame: its smallest id is the upper bound (max_id) of the first round.
df_tp = pd.DataFrame([1488117486016860161], columns = ['id'])

giri = 5                                                                        #-- number of download rounds
pause_seconds = 15 * 60                                                         #-- 15 min pause between rounds
tweet_columns = ['date','id','text','like','n_rt','author','mentions', 'hashtags']
download_path = '/content/gdrive/MyDrive/SMA_project/tweets_downloaded.csv'

list_tweet = []
for count in range(1,giri+1):
  print('Inizio', count, 'giro su', giri)

  # tweet_mode='extended' makes the search itself return `full_text`, so no
  # additional get_status request is needed for every single tweet.
  cursor = tweepy.Cursor(api.search_tweets,
                         q=hash,
                         count=50,
                         until='2022-02-01',                                    #-- CHANGE ON EVERY RUN
                         max_id = str(df_tp['id'].min()),                       #-- CHANGE ON EVERY RUN (via the seed id)
                         lang='it',
                         tweet_mode='extended')

  try:
    for tweet in cursor.items(850):
      full_text = tweet.full_text

      list_tweet.append([tweet.created_at,                                      #-- date
                         tweet.id,                                              #-- tweet id
                         full_text,                                             #-- text
                         tweet.favorite_count,                                  #-- number of favourites
                         tweet.retweet_count,                                   #-- number of retweets
                         tweet.user.screen_name,                                #-- author screen name
                         tweet.entities['user_mentions'],                       #-- mentions
                         tweet.entities['hashtags']])                           #-- hashtags in the tweet
  finally:
    # Persist once per round (also when the round is interrupted) instead of
    # rebuilding the frame and rewriting the CSV after every tweet. The frame
    # also provides the next round's max_id; the seed is kept while no tweet
    # has been collected yet.
    if list_tweet:
      df_tp = pd.DataFrame(list_tweet,columns=tweet_columns)
      df_tp.to_csv(download_path, index = False)

  print('Fine  ', count, 'giro su', giri)

  if count != giri:
    time.sleep(pause_seconds)                                                   #-- wait before the next round

Inizio 1 giro su 5
Fine   1 giro su 5
Inizio 2 giro su 5
Fine   2 giro su 5
Inizio 3 giro su 5
Fine   3 giro su 5
Inizio 4 giro su 5
Fine   4 giro su 5
Inizio 5 giro su 5
Fine   5 giro su 5


# Saving results & Import data

Downloaded

In [None]:
# tweets_downloaded = pd.read_csv('/content/gdrive/MyDrive/SMA_project/tweets_downloaded.csv', index_col = False, parse_dates = ['date'])
# tweets_downloaded.tail(5)

Temporary

In [None]:
# tweets_temp = pd.DataFrame(list_tweet,columns=['date','id','text','like','n_rt','author','mentions', 'hashtags'])
# tweets_temp.tail(5)

In [13]:
# Load the tweet archive collected so far
tweets = pd.read_csv('/content/gdrive/MyDrive/SMA_project/tweets.csv', index_col = False, parse_dates = ['date'])

# Optionally extend it with freshly downloaded tweets (disabled)
# tweets = tweets.append(tweets_downloaded.copy(), ignore_index=True).copy()
# tweets = tweets.append(tweets_temp.copy(), ignore_index=True).copy()

# Keep the first occurrence of every tweet id and renumber the rows
print("Pre eliminazione duplicati:  ", tweets.shape[0], "tweet")
tweets = tweets.drop_duplicates(subset ="id").reset_index(drop = True)
print("Post eliminazione duplicati: ", tweets.shape[0], "tweet")

# Write the de-duplicated archive back to the same file
tweets.to_csv('/content/gdrive/MyDrive/SMA_project/tweets.csv', index = False)

Pre eliminazione duplicati:   94955 tweet
Post eliminazione duplicati:  94955 tweet


In [14]:
# Preview of the de-duplicated archive
tweets

Unnamed: 0,date,id,text,like,n_rt,author,mentions,hashtags
0,2022-01-26 16:30:13+00:00,1486375935834210307,#Quirinale2022\n💥\nAndrà che ci sarà #Mattarel...,0,0,LuigiPa70,[],
1,2022-01-26 16:29:39+00:00,1486375792129064964,"se le premesse sono queste, mi stupisce molto ...",0,0,Fiodor1976,[],
2,2022-01-26 16:28:28+00:00,1486375496996823046,Non mi turba ci si metta tanto a trovare un no...,1,0,RobbieGalante,[],
3,2022-01-26 16:27:04+00:00,1486375144343969797,#Quirinale2022 #PresidenteDellaRepubblica mi s...,0,0,ValeriaSirigu,[],
4,2022-01-26 16:26:57+00:00,1486375114857926656,Ma se quel povero #Mattarella ha già detto che...,0,0,brislacciata84,[],
...,...,...,...,...,...,...,...,...
94950,2022-01-31 00:01:26+00:00,1487939041639251970,RT @EugenioCardi: Io dico una sola cosa: per f...,0,22,RivettaElena,"[{'screen_name': 'EugenioCardi', 'name': 'Euge...","[{'text': 'Berlusconi', 'indices': [66, 77]}, ..."
94951,2022-01-31 00:01:26+00:00,1487939039550492673,RT @andreapurgatori: Una settimana di alta pol...,0,76,MaxPaplar,"[{'screen_name': 'andreapurgatori', 'name': 'a...","[{'text': 'Quirinale2022', 'indices': [53, 67]..."
94952,2022-01-30 21:46:07+00:00,1487904988080787460,RT @AnkyBoh: ELETTORI DELLA #LEGA...SIETE INCA...,0,38,SamGibili1,"[{'screen_name': 'AnkyBoh', 'name': 'LisaDaCag...","[{'text': 'LEGA', 'indices': [28, 33]}, {'text..."
94953,2022-01-30 21:12:36+00:00,1487896551007698944,RT @fratotolo2: “La folla… ma neanche tanto”@f...,0,130,SamGibili1,"[{'screen_name': 'fratotolo2', 'name': 'France...","[{'text': 'Quirinale', 'indices': [62, 72]}]"


# Pre-processing

## Change date format

In [None]:
# Truncate every timestamp to its calendar day (timezone-naive, midnight)
day_only = tweets['date'].dt.tz_localize(None).dt.normalize()

# Replace the column and make 'date' the first column
other_columns = [column for column in tweets.columns if column != 'date']
tweets = tweets.assign(date = day_only)[['date'] + other_columns].copy()

print('Tweet per day:')
print()
print(tweets.groupby('date')['id'].count())
# print()
# print()
# print('Minimum Tweet ID per day:')
# print()
# print(tweets.groupby('date').min('id')['id'])
# print()
# print()
# print('Maximum Tweet ID per day:')
# print()
# print(tweets.groupby('date').max('id')['id'])

Tweet per day:

date
2022-01-21        2
2022-01-22        9
2022-01-23      706
2022-01-24     7883
2022-01-25    12052
2022-01-26     5177
2022-01-27    14860
2022-01-28    12718
2022-01-29    25293
2022-01-30    10314
2022-01-31     5941
Name: id, dtype: int64


## Text pre-processing

In [None]:
# Start from NLTK's Italian stop-word list
stop = stopwords.words('italian')

# Use a set while the custom entries below are added
stop = set(stop)

stop.add("...")
stop.add("..")
stop.add('’')
stop.add('“')
stop.add('”')
stop.add('')
stop.add('️')
stop.add('🏻')
stop.add('🇹')
# Extra entries: common Italian fillers plus terms tied to the search topic
stop.update([
  'così', 'però', 'già', '°', 'poi', 'dopo', 'può', 'comunque',
  'quirinale',
  'presidentedellarepubblica',
  'presidenzadellarepubblica',
  'pdr',
  'elezioniquirinale',
  'maratonamentana',
  'maratonaquirinale',
  'elezionipresidentedellarepubblica',
  'presidente',
  '…',
  'rt',
])

# The rest of the notebook receives the stop words as a list
stop = list(stop)

In [None]:
# Clean every tweet: adds the cleaned text and the retweet flag, and keeps the token Counter
tweets['text clean'], tweets['RT'], count = preprocessing(tweets['text'], count_lower = True)

In [None]:
# Non-null values per column, split by the retweet flag (1 = text starts with 'RT ')
tweets.groupby('RT').count()

Unnamed: 0_level_0,date,id,text,like,n_rt,author,mentions,hashtags,text clean
RT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,52410,52410,52410,52410,52410,52410,52410,51525,52410
1,42545,42545,42545,42545,42545,42545,42545,42545,42545


In [None]:
# NOTE(review): same statement as the previous cell -- candidate for removal
tweets.groupby('RT').count()

Unnamed: 0_level_0,date,id,text,like,n_rt,author,mentions,hashtags,text clean
RT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,52410,52410,52410,52410,52410,52410,52410,51525,52410
1,42545,42545,42545,42545,42545,42545,42545,42545,42545


In [None]:
# The 30 most frequent tokens left after cleaning
count.most_common(30)

[('mattarella', 26271),
 ('draghi', 7393),
 ('repubblica', 7200),
 ('salvini', 5930),
 ('belloni', 5686),
 ('politica', 4702),
 ('oggi', 4168),
 ('stato', 4148),
 ('sergio', 4098),
 ('renzi', 3665),
 ('mattarellabis', 3551),
 ('solo', 3379),
 ('letta', 3376),
 ('nome', 3358),
 ('casellati', 3307),
 ('voti', 3303),
 ('anni', 3225),
 ('essere', 3197),
 ('fatto', 2936),
 ('conte', 2907),
 ('nomi', 2869),
 ('centrodestra', 2771),
 ('fare', 2683),
 ('no', 2604),
 ('ancora', 2601),
 ('paese', 2553),
 ('fa', 2498),
 ('casini', 2439),
 ('cosa', 2431),
 ('donna', 2429)]

In [None]:
# Raw text next to its cleaned version
tweets[['text','text clean']]

Unnamed: 0,text,text clean
0,#Quirinale2022\n💥\nAndrà che ci sarà #Mattarel...,💥 Andrà Mattarellabis parziale cioè sino fine...
1,"se le premesse sono queste, mi stupisce molto ...",premesse stupisce molto fatto ancora resi con...
2,Non mi turba ci si metta tanto a trovare un no...,turba metta tanto trovare nome condiviso tant...
3,#Quirinale2022 #PresidenteDellaRepubblica mi s...,sa qualcuno contando voti qualcun altro dovrà...
4,Ma se quel povero #Mattarella ha già detto che...,quel povero Mattarella detto intende ricandid...
...,...,...
94950,RT @EugenioCardi: Io dico una sola cosa: per f...,dico sola cosa fortuna Berlusconi diventato P...
94951,RT @andreapurgatori: Una settimana di alta pol...,settimana alta politica PresidenzaDellaRepubblic
94952,RT @AnkyBoh: ELETTORI DELLA #LEGA...SIETE INCA...,ELETTORI LEGASIETE INCAZZATI LEGA Salvini
94953,RT @fratotolo2: “La folla… ma neanche tanto”@f...,folla neanche tanto


# Sentiment Analysis

## Preparazione dataset

In [None]:
# Leave out 2022-01-21 and 2022-01-22 (2 and 9 tweets in the per-day counts)
# and keep only the columns needed for the sentiment step.
# NOTE(review): the saved output of this cell reports a retweet removal, but
# no filter on 'RT' is applied here -- confirm which behaviour is intended.
excluded_days = (tweets['date'] == '2022-01-21') | (tweets['date'] == '2022-01-22')
df_sentiment = tweets.loc[~excluded_days, ['date','text clean']].reset_index(drop = True)

Prima della rimozione dei retweet:	 94955 righe
Dopo della rimozione dei retweet:	 52399 righe


## BERT

In [None]:
# Label every cleaned tweet through the sentiment_BERT helper instead of
# repeating its loop inline (same per-row predict, same progress line every
# 5000 rows).
text = df_sentiment['text clean']
df_sentiment['sentiment_BERT'] = sentiment_BERT(text)

In [None]:
# Indicator lists derived from the BERT label: every label other than
# 'negative' counts as positive. The label is read by column name rather than
# by position, so the result does not depend on the column order.
is_negative = df_sentiment['sentiment_BERT'] == 'negative'

positive = (~is_negative).astype(int).tolist()                                  # 1 for non-negative rows
negative = is_negative.astype(int).tolist()                                     # 1 for negative rows
ratio = np.where(is_negative, -1, 1).tolist()                                   # signed score: -1 / +1

In [None]:
# Attach the three indicator lists as new columns
df_sentiment = df_sentiment.assign(positive = positive,
                                   negative = negative,
                                   ratio = ratio)

In [None]:
# Persist the labelled tweets
df_sentiment.to_csv('/content/gdrive/MyDrive/SMA_project/tweets_sentiment_total.csv', index = False)

In [None]:
# Reload the labelled tweets from disk (presumably so the BERT step need not be re-run -- confirm)
df_sentiment = pd.read_csv('/content/gdrive/MyDrive/SMA_project/tweets_sentiment_total.csv')

## For community

In [None]:
# Node table with (at least) an 'Id' and a 'modularity_class' column, as used by the cells below
df_community = pd.read_csv('/content/gdrive/MyDrive/SMA_project/out_table.csv')

In [None]:
# Number of nodes per modularity class, largest class first
grouped = (
    df_community
    .groupby('modularity_class')['Id']
    .count()
    .reset_index()
    .sort_values('Id', ascending=False)
    .reset_index(drop = True)
)

Unnamed: 0,modularity_class,Id
0,32,7077
1,62,4523
2,9,4519
3,5,4448
4,17,2117
5,22,1194


In [None]:
# Modularity classes of the six largest communities, in the order shown by
# `grouped`; a class's position in this list (1-based) is its community number.
top_modularity_classes = [32, 62, 9, 5, 17, 22]

community_frames = []
for community, modularity_class in enumerate(top_modularity_classes, start = 1):
  # Tweets written by the members of this community
  members = df_community.loc[df_community['modularity_class'] == modularity_class, 'Id']
  community_tweets = tweets[tweets['author'].isin(members)].reset_index(drop = True)

  # BERT label for every tweet, plus the community number
  community_tweets['sentiment'] = sentiment_BERT(community_tweets['text clean'])
  community_tweets['community'] = community

  community_frames.append(community_tweets[['date','author','text clean','sentiment','community']])

# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0); as before, every community keeps its own 0..n-1 row labels.
subdf_community = pd.concat(community_frames)

In [None]:
# Persist the per-community sentiment table
subdf_community.to_csv('/content/gdrive/MyDrive/SMA_project/subdf_community.csv', index = False)

In [None]:
# Reload the per-community sentiment table from disk
subdf_community = pd.read_csv('/content/gdrive/MyDrive/SMA_project/subdf_community.csv')

# Community detection

In [None]:
# Screen names are pulled out of the textual form of the 'mentions' column
mention_pattern = re.compile("\'screen_name\': \'([a-zA-Z0-9_.+-]+)\'")

li_ment = []
for row, raw_mentions in enumerate(tweets['mentions']):
  names = mention_pattern.findall(str(raw_mentions))

  # For a retweet only the first mention is kept; other tweets keep them all
  if tweets['RT'][row] == 1:
    names = names[:1]
  li_ment.append(names)

df =  pd.DataFrame({'author' : tweets['author'],
                    'mentions' : li_ment,
                    'RT' : tweets['RT']})

# One (author, mentioned user) pair per mention
li_edges = [(author, ment)
            for author, mentions, _ in df.values
            if mentions
            for ment in mentions]

In [None]:
# Write the edge list (author, mentioned user) without index or header row
pd.DataFrame(li_edges).to_csv('/content/gdrive/MyDrive/SMA_project/community.csv',
                index = False,
                header = False)

In [None]:
# li_mentioned = []
# for line in li_ment:
#   for ment in line:
#     if ment:
#       li_mentioned.append(ment)

# li_auth = []
# for line in li_edges:
#   li_auth.append(line[0])