## Define the Objective of the Analysis

...

Analisi del sentiment relativo al dibattito sullo _Smart Working_ in Italia e individuazione di eventuali 'influencer' sul tema.

## Collect Data

In [1]:
# imports
import numpy as np
import pandas as pd
import tweepy
tweepy.__version__

import re
import string
import itertools
from collections import Counter
from datetime import datetime
from numpy.core.multiarray import result_type
import time

In [2]:
# insert the keys here
consumer_key = 'VPzjkqKl2y1uSTJQvnVqS9e1X' 
consumer_secret = 'STG2IzVMf65vPGeOvBQyzdeoKBExAr5sIkhOaBeDe2fnIN14vY'
access_token = '1508409949835214853-HIyZJ3oT32TijKsdNDhGFZEEQTWwau'
access_token_secret = 'uLcs9hUYmLdocxkaSfXo69Gii46TISu5qZj5F6f6fBfnW'

#### Tweets Download

The next step is creating an OAuthHandler instance. We pass our consumer key and access token which we defined above.

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

Next, we pass the OAuthHandler instance into the API method.

In [4]:
api = tweepy.API(auth, wait_on_rate_limit=True)

Tweets that contain a specific hashtag

In [None]:
from tqdm.notebook import tqdm

import time
hashtag = '("#smartworking" OR "#remotework" OR "#lavoroagile")'

list_tweets = []

for tweet in tweepy.Cursor(api.search_tweets, q=hashtag, count=100, lang='it').items(10000):
  print('entering')
  full_text = api.get_status(tweet.id, tweet_mode='extended')._json['full_text']
  print(tweet.id)
  list_tweets.append([tweet.created_at, tweet.id, full_text, tweet.favorite_count, tweet.retweet_count, tweet.user.screen_name,
                      tweet.user.location, tweet.retweeted, tweet.entities['user_mentions'], tweet.entities['hashtags']])

# items is the maximum number of tweets to download.
# count is the number of tweets to return per page, up to a maximum of 100.

Tweets that contain a specific keyword

In [None]:
from tqdm.notebook import tqdm

import time
keywords = '("smartworking" OR "remotework" OR "lavoroagile")'

list_tweets = []

for tweet in tweepy.Cursor(api.search_tweets, q=keywords, count=100, lang='it').items(10000):
  print('entering')
  full_text = api.get_status(tweet.id, tweet_mode='extended')._json['full_text']
  print(tweet.id)
  list_tweets.append([tweet.created_at, tweet.id, full_text, tweet.favorite_count, tweet.retweet_count, tweet.user.screen_name,
                      tweet.user.location, tweet.retweeted, tweet.entities['user_mentions'], tweet.entities['hashtags']])

# items is the maximum number of tweets to download.
# count is the number of tweets to return per page, up to a maximum of 100.

In [None]:
print(len(list_tweets))

In [None]:
# Turn list_tweet into a DataFrame changing column names
tweets = pd.DataFrame(list_tweets, columns=['date','id','text','like','n_rt','author','location','retweeted','user_mentions','hastags'])
tweets.to_csv('../data/SW.csv')

In [None]:
# Read csv
tweets = pd.read_csv('../data/SW.csv')
#tweets_loaded = tweets_loaded.drop('Unnamed: 0', axis=1)
tweets.head()

In [None]:
print(tweets.shape)
print(tweets.columns)

#### Data Pre-Processing

In [None]:
# Remove duplicates
tweets.drop_duplicates(subset ="id", inplace = True)
tweets.reset_index(drop = True, inplace = True)
tweets.shape

In [None]:
# Change date format
day = tweets['date'].dt.day
month = tweets['date'].dt.month
year = tweets['date'].dt.year

date = year.astype(str) + month.astype(str).str.zfill(2) + day.astype(str).str.zfill(2)
date = pd.to_datetime(date, format='%Y%m%d')
tweets.drop(columns = ['date'], inplace = True)
tweets['date'] = date

# Reorder columns
cols = tweets.columns.tolist()
cols = cols[-1:] + cols[:-1]
tweets = tweets[cols].copy()

print('Tweet per day:')
print()
print(tweets.groupby('date').count()['id'])
print()
print()
print('Maximum Tweet ID per day:')
print()
print(tweets.groupby('date').max('id')['id'])

In [None]:
# Create a DataFrame with the authors of the tweets and their respective frequency
freq_authors = tweets['author'].value_counts()
freq_authors.head()

In [None]:
# Extract all hashtags from the full text
tweets['hashtags_list'] = tweets['text'].apply(lambda x: re.findall(r"#(\w+)", x))

# Extract all mentions from the full text
tweets['mentions'] = tweets['text'].apply(lambda x: re.findall(r"@(\w+)", x))

# \w matches any single letter, number or underscore (same as [a zA Z0 9_])

#### Data Cleaning

NLTK - Natural Language ToolKit is a platform for building Python programs to work with human language data. It provides easy to use interfaces to over 50 corpora and lexical resources, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning.

In [None]:
import nltk
from nltk import FreqDist
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

Text Pre-Processing

In [None]:
def preprocess_data(data):
    # remove numbers and turning words into lower case
    data = data.astype(str).str.replace('\d+','')
    lower_text = data.str.lower()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    w_tokenizer = TweetTokenizer()
    
    # token lemmatization (ex. goes --> go)
    def lemmatize_text(text):
        return[(lemmatizer.lemmatize(w)) for w in w_tokenizer.tokenize((text))]
    
    # remove punctuation
    def remove_punctuation(words):
        new_words = []
        for word in words:
            new_word = re.sub(r'[\w\s]', '', (word))
            if new_word != '':
                new_words.append(new_word)
        return new_words
    
    words = lower_text.apply(lemmatize_text)
    words = words.apply(remove_punctuation)
    return pd.DataFrame(words)

In [None]:
# apply preprocess_data function
pre_tweets = preprocess_data(tweets['text'])
tweets['text_proc'] = pre_tweets

# delete italian stopwords
stop_words = set(stopwords.words('italian'))
tweets['text_proc'] = tweets['text_proc'].apply(lambda x: [item for item in x if item not in stop_words])

## Analysis

### Social Content Analysis

#### _Sentiment Analysis_

When dealing with social media text, we usually want to identify urls, hashtags, smileys as separate objects and do not tokenize it to individual characters.

VADER - Eng

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

In [None]:
sent_analyzer = SentimentIntensityAnalyzer()

In [None]:
tweets[['text', 'text_proc']]

In [None]:
tweets['scores'] = tweets['text'].apply(lambda Tweet:sent_analyzer.polarity_scores(Tweet))
tweets['compound'] = tweets['scores'].apply(lambda score_dict:score_dict['compound'])

tweets.head()

FEEL-IT: Emotion and Sentiment Classification for the Italian Language.

https://towardsdatascience.com/sentiment-analysis-and-emotion-recognition-in-italian-using-bert-92f5c8fe8a2

In [None]:
from feel_it import EmotionClassifier, SentimentClassifier

sentiment_classifier = SentimentClassifier()
emotion_classifier = EmotionClassifier()

The feel-it-italian-sentiment model performs sentiment analysis on Italian. We fine-tuned the UmBERTo model on our new dataset (i.e., FEEL-IT) obtaining state-of-the-art performances on different benchmark corpora.

In [None]:
print(sentiment_classifier.predict(tweets["text"].values.tolist()))

In [None]:
tweets_sentiment = tweets.copy()

In [None]:
text = tweets_sentiment['text']
li_sent = []
for i in range(0, text.shape[0]):
  sent = sentiment_classifier.predict([text[i]])
  li_sent.append(sent)
  if i % 5000 == 0:
    print('Riga',i,'su',text.shape[0])

tweets_sentiment['sentiment_BERT'] = [item for sublist in li_sent for item in sublist]
     
positive = []
negative = []

In [None]:
positive = []
negative = []
ratio = []

for line in tweets_sentiment.values:
  sent = line[15]

  if sent == 'negative':
    positive.append(0)
    negative.append(1)
    ratio.append(-1)
  else:
    positive.append(1)
    negative.append(0)
    ratio.append(1)

In [None]:
tweets_sentiment['positive'] = positive
tweets_sentiment['negative'] = negative
tweets_sentiment['ratio'] = ratio

In [None]:
tweets_sentiment.head()

In [None]:
tweets_sentiment.to_csv('../data/SW_sentiment.csv')

#### _Emotion Analysis_

Recognizing emotions in text is fundamental to get a better sense of how people are talking about something. People can talk about a new event, but positive/negative labels might not be enough.

The feel-it-italian-emotion model performs emotion classification (joy, fear, anger, sadness) on Italian. We fine-tuned the UmBERTo model on our new dataset (i.e., FEEL-IT) obtaining state-of-the-art performances on different benchmark corpora.

In [None]:
print(emotion_classifier.predict(tweets["text"].values.tolist()))

In [None]:
tweets_emotion = tweets.copy()

In [None]:
text = tweets_emotion['text']
li_emotion = []
for i in range(0, text.shape[0]):
  emotion = emotion_classifier.predict([text[i]])
  li_emotion.append(emotion)
  if i % 5000 == 0:
    print('Riga',i,'su',text.shape[0])

tweets_emotion['emotion_BERT'] = [item for sublist in li_emotion for item in sublist]
     
anger = []
joy = []
fear = []
sadness = []

In [None]:
anger = []
joy = []
fear = []
sadness = []

for line in tweets_emotion.values:
  emotion = line[15]

  if emotion == 'anger':
    anger.append(1)
    joy.append(0)
    fear.append(0)
    sadness.append(0)
  elif emotion == 'joy':
    anger.append(0)
    joy.append(1)
    fear.append(0)
    sadness.append(0)
  elif emotion == 'fear':
    anger.append(0)
    joy.append(0)
    fear.append(1)
    sadness.append(0)
  else:
    anger.append(0)
    joy.append(0)
    fear.append(0)
    sadness.append(1)

In [None]:
tweets_emotion['anger'] = anger
tweets_emotion['joy'] = joy
tweets_emotion['fear'] = fear
tweets_emotion['sadness'] = sadness

In [None]:
tweets_emotion.head()

In [None]:
tweets_emotion.to_csv('../data/SW_emotion.csv')

---

#### Social Network Analysis

In [2]:
import pandas as pd


In [5]:
first = pd.read_csv('../data/SW.csv')
second = pd.read_csv('../data/SW2.csv')

dataset = pd.concat([first,second], axis=0, ignore_index=True)

In [6]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,date,id,text,like,n_rt,author,location,retweeted,user_mentions,hastags
0,0,2022-12-31 14:43:44+00:00,1609198624671469570,RT @aleale2780: @carettamc11 @FratellidItalia ...,0,11,alessiobar4,,False,"[{'screen_name': 'aleale2780', 'name': 'Aleale...","[{'text': 'opzionedonna', 'indices': [113, 126]}]"
1,1,2022-12-31 14:24:56+00:00,1609193893228150787,RT @aleale2780: @carettamc11 @FratellidItalia ...,0,11,martin_marzia,,False,"[{'screen_name': 'aleale2780', 'name': 'Aleale...","[{'text': 'opzionedonna', 'indices': [113, 126]}]"
2,2,2022-12-31 14:20:40+00:00,1609192822271270914,RT @marcoz984: Ripetete con me: lo #smartworki...,0,7,Ernesto23710724,,False,"[{'screen_name': 'marcoz984', 'name': ""marco d...","[{'text': 'smartworking', 'indices': [35, 48]}]"
3,3,2022-12-31 14:04:22+00:00,1609188720170647559,RT @IlConteIT: Per contrastare una nuova possi...,0,4,antomariateres1,,False,"[{'screen_name': 'IlConteIT', 'name': 'ConCont...","[{'text': 'Covid', 'indices': [63, 69]}, {'tex..."
4,4,2022-12-31 13:54:03+00:00,1609186120457240581,RT @dukana2: Toglietevi subito quelle mascheri...,0,11,dukana2,,False,"[{'screen_name': 'dukana2', 'name': 'dukana', ...","[{'text': 'Smartworking', 'indices': [72, 85]}..."


In [10]:
author_mentions = dataset[['author', 'user_mentions']]
author_mentions.head()


Unnamed: 0,author,user_mentions
0,alessiobar4,"[{'screen_name': 'aleale2780', 'name': 'Aleale..."
1,martin_marzia,"[{'screen_name': 'aleale2780', 'name': 'Aleale..."
2,Ernesto23710724,"[{'screen_name': 'marcoz984', 'name': ""marco d..."
3,antomariateres1,"[{'screen_name': 'IlConteIT', 'name': 'ConCont..."
4,dukana2,"[{'screen_name': 'dukana2', 'name': 'dukana', ..."


In [42]:
import networkx as nx
import json
import re
import matplotlib.pyplot as plt
Graph = nx.Graph()

for index, row in author_mentions.iterrows():
  # print(row[1])
  
  full_text = re.sub("(?<=[a-zA-Z])'(?=[a-zA-Z])", "###3", row[1])
  full_text = full_text.replace("'", '"')
  full_text = full_text.replace('###3', "'")
  mentions_array = json.loads(full_text)
  for item in mentions_array:
    print(item)
    Graph.add_edge(row[0], item['screen_name'])
nx.draw(Graph)


{'screen_name': 'aleale2780', 'name': 'Aleale', 'id': 1515566582, 'id_str': '1515566582', 'indices': [3, 14]}
{'screen_name': 'carettamc11', 'name': 'Maria Cristina Caretta', 'id': 1040271648082067458, 'id_str': '1040271648082067458', 'indices': [16, 28]}
{'screen_name': 'FratellidItalia', 'name': "Fratelli d'Italia 🇮🇹", 'id': 1024976264, 'id_str': '1024976264', 'indices': [29, 45]}
{'screen_name': 'GiorgiaMeloni', 'name': 'Giorgia Meloni', 'id': 130537001, 'id_str': '130537001', 'indices': [46, 60]}
{'screen_name': 'aleale2780', 'name': 'Aleale', 'id': 1515566582, 'id_str': '1515566582', 'indices': [3, 14]}
{'screen_name': 'carettamc11', 'name': 'Maria Cristina Caretta', 'id': 1040271648082067458, 'id_str': '1040271648082067458', 'indices': [16, 28]}
{'screen_name': 'FratellidItalia', 'name': "Fratelli d'Italia 🇮🇹", 'id': 1024976264, 'id_str': '1024976264', 'indices': [29, 45]}
{'screen_name': 'GiorgiaMeloni', 'name': 'Giorgia Meloni', 'id': 130537001, 'id_str': '130537001', 'indices'

JSONDecodeError: Invalid \escape: line 1 column 43 (char 42)

Measures of Centrality

Community Detection

## Visualization