<h1>Notebook di prova per Altair</h1>

<h2>Caricamento delle librerie e il dataset</h2>

In [1]:
#!pip install vega

<p>Import delle librerie necessarie</p>

In [3]:
import pandas as pd
import altair as alt
import altair_viewer as av
from altair import datum
import re
import string
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize

In [4]:
alt.__version__

'5.0.1'

<p>carico il dataset</p>

In [5]:
df = pd.read_csv('datasetForAltair-eng.csv', index_col=0, low_memory=False)

In [6]:
df.head(3)

Unnamed: 0,user,date,likeCount,tweetText,hashtags,lang,replies,retweetCount,replyCount,Year,Month,Day,WeekDay,hour,minutes,dayofYear,date_only,tweetText_proc,Sentiment
0,jdemay,2023-02-05 23:59:16+00:00,0,Planning to use #ChatGPT to help you with high...,['ChatGPT'],en,0.0,0.0,,2023,2,5,Sunday,23,59,36,2023-02-05,planning use help high school math ? maybe thi...,pos
1,CarlosSilvaB81,2023-02-05 23:58:51+00:00,3,"@Carmtans @stkirsch Chat gpt, is nothing but a...",,en,0.0,0.0,,2023,2,5,Sunday,23,58,36,2023-02-05,"chat gpt , nothing super search engine , progr...",neg
2,saadnajeebsaad,2023-02-05 23:58:15+00:00,0,#WeatherUpdate #Karachi #Khi #TempUpdate #Sunr...,"['WeatherUpdate', 'Karachi', 'Khi', 'TempUpdat...",en,0.0,0.0,,2023,2,5,Sunday,23,58,36,2023-02-05,sunrise : 07:12 sunset : 06:19 pm current temp...,neu


<p>stampo info relative al dataset corrente</p>

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 287309 entries, 0 to 287308
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user            280601 non-null  object 
 1   date            287309 non-null  object 
 2   likeCount       287309 non-null  int64  
 3   tweetText       287309 non-null  object 
 4   hashtags        177869 non-null  object 
 5   lang            287309 non-null  object 
 6   replies         281713 non-null  float64
 7   retweetCount    287202 non-null  float64
 8   replyCount      5596 non-null    float64
 9   Year            287309 non-null  int64  
 10  Month           287309 non-null  int64  
 11  Day             287309 non-null  int64  
 12  WeekDay         287309 non-null  object 
 13  hour            287309 non-null  int64  
 14  minutes         287309 non-null  int64  
 15  dayofYear       287309 non-null  int64  
 16  date_only       287309 non-null  object 
 17  tweetText_

<h2>Processing dei dati testuali</h2>

In [8]:
def pre_process(text):
    if isinstance(text, str):
        # Remove links
        text = re.sub(r'http\S+', '', text)

        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)

        # Remove punctuation marks
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize the text
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word.lower() not in stop_words]

        # Remove empty and short words
        tokens = [word for word in tokens if len(word) > 2]

        # Join the tokens back into a single string
        cleaned_text = ' '.join(tokens)

        return cleaned_text
    else:
        return ''

In [9]:
#applico la pulizia del testo
df['tweetText_proc'] = df['tweetText_proc'].apply(pre_process)

In [12]:
df['Sentiment'].value_counts()

pos    142430
neu     95570
neg     49309
Name: Sentiment, dtype: int64

In [13]:
sentiment_mapping = {'pos': 'Positivo', 'neu': 'Neutrale', 'neg': 'Negativo'}

df['sentiment_complete'] = df['Sentiment'].map(sentiment_mapping)

In [14]:
df['sentiment_complete'].value_counts()

Positivo    142430
Neutrale     95570
Negativo     49309
Name: sentiment_complete, dtype: int64

<h2>Analisi del sentimento attraverso i mesi nei commenti di twitter</h2>

In [26]:

# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

# Group by month and sentiment
df_grouped = df.groupby([df['date'].dt.to_period('M'), 'sentiment_complete']).size().unstack()

# Convert index to string and extract month names
df_grouped.index = df_grouped.index.strftime('%B %Y')

# Reshape the dataframe for Altair
df_grouped = df_grouped.reset_index().melt('date', var_name='sentiment_complete', value_name='Value')

# Define color domain and range
domain = ['Positivo', 'Neutrale', 'Negativo']
range_ = ['#ADFC92', '#788BFF', '#F44E3F']

# Create the Altair chart
chart = alt.Chart(df_grouped, title=alt.Title(
       "Evoluzione del sentimento verso Chat GPT nell'arco dei mesi",
       subtitle="Visualizzazione dei valori relativi al dataset di Tweets"
   )).mark_line(opacity=0.1).encode(
    x=alt.X('date:T', axis=alt.Axis(title='Periodo', format='%B %Y', tickCount=12)),
    y=alt.Y('Value:Q', axis=alt.Axis(title='Numero di tweets')),
    opacity=alt.value(0.9),
    color=alt.Color('sentiment_complete',
                    title="Sentimento registrato",
                    scale=alt.Scale(domain=domain, range=range_),
                    legend=alt.Legend(orient="right"))
).configure_legend(
    orient='top'
).configure_axis(
    grid=False
).properties(
    width=800,
    height=600
)
chart
#chart.save('charttest.html')

  df_grouped = df.groupby([df['date'].dt.to_period('M'), 'sentiment_complete']).size().unstack()


In [22]:
def extract_year(value):
    return df[df['Year']==value]

y22 = extract_year(2022)
y23 = extract_year(2023)

In [23]:
print ("dimensioni dataset 2022 (righe x colonne): "+ str(y22.shape))
print ("dimensioni dataset 2023 (righe x colonne): "+ str(y23.shape))

dimensioni dataset 2022 (righe x colonne): (77338, 20)
dimensioni dataset 2023 (righe x colonne): (209971, 20)


In [24]:
#prendo il dataset, cerco le frequenze delle parole per sentimento (=risultato: tre dataframe, uno per sentimento e relativi a un anno.)
#creo un dataset con 2 colonne (uno per ogni sentimento), le colonne sono: word, frequency
#creo una visualizzazione scatter plot con colore abbinato al sentimento corrente
#mostro tutti i puntini usando un Alt che in descrizione riporta il valore label di word
#mostro una legenda con le top 20 più frequenti

In [25]:
#prendo il dataset, cerco le frequenze delle parole per sentimento.
def words_finder(df, sentiment):
    tokens = []
    df = df[(df['Sentiment'] == sentiment)]
    for row in df['tweetText_proc']:
        tokens.extend(word_tokenize(str(row).lower()))  # Tokenization and lower conversion of the strings

    word_freq = FreqDist(tokens)
    sorted_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    banned_w = ['chatgpt', 'ai', ' ai', 'ai ', 'gpt', 'openai', 'artificialintelligence', 'amp', 'https', '2022', '2023', 'also', 'still', 'could'] #words to filter out
    sorted_freq = [(word, freq) for word, freq in sorted_freq if word not in banned_w]
    return sorted_freq #frequencies sorted for the choosen sentiment

def createDffreqs(sorted_freq): #ritorna un dataframe con una colonna contenente la parola ed una con la sua frequenza assoluta
    freq_df = pd.DataFrame(sorted_freq, columns=['word', 'frequency'])
    freq_df.sort_values(by='frequency', ascending=False, inplace=True)
    return freq_df

In [27]:
y22pos = createDffreqs(words_finder(y22, 'pos')) #dataframe commenti positivi del 2022
top22pos = y22pos.head(30)

In [28]:
y23pos = createDffreqs(words_finder(y23, 'pos')) #dataframe commenti positivi del 2023
top23pos = y23pos.head(30)

In [29]:
y22neg = createDffreqs(words_finder(y22, 'neg')) #dataframe commenti negativi del 2022
top22neg = y22neg.head(30)

In [30]:
y23neg = createDffreqs(words_finder(y23, 'neg')) #dataframe commenti negativi del 2023
top23neg = y23neg.head(30)

In [31]:
y22neu = createDffreqs(words_finder(y22, 'neu')) #dataframe commenti neutri del 2022
top22neu = y22neu.head(30)

In [32]:
y23neu = createDffreqs(words_finder(y23, 'neu')) #dataframe di commenti neutri del 2023
top23neu = y23neu.head(30)

In [33]:
# Concatenate the three DataFrames into a single DataFrame
df22words = pd.concat([top22pos, top22neg, top22neu], keys=['Positive', 'Negative', 'Neutral'])
options = ['Positive', 'Neutral', 'Negative']
labels = [option + ' ' for option in options]

# Define color domain and range
domain = ['pos', 'neu', 'neg']
range_ = ['#ADFC92', '#788BFF', '#F44E3F']

input_dropdown = alt.binding_radio(
    # Add the empty selection which shows all when clicked
    options=options + [None],
    labels=labels + ['All'],
    name='Selettore sentimento 2022: '
)

selection = alt.selection_point(
    fields=['Sentiment'],
    bind=input_dropdown,
)

# Reset the index of the concatenated DataFrame
df22words = df22words.reset_index(level=0).rename(columns={'level_0': 'Sentiment'})
alt.data_transformers.disable_max_rows()
scatter_plot_2022 = alt.Chart(df22words).mark_point(filled=True).encode(
    alt.X('word',title="Parola"),
    alt.Y('frequency', title="Frequenza").axis(labels=False),
    alt.Size('frequency'),
    alt.OpacityValue(0.7),
    alt.Color('Sentiment', title="Sentimento registrato").scale(domain=options, range=range_),
    tooltip = [alt.Tooltip('word:N'),
               alt.Tooltip('frequency:Q'),
               alt.Tooltip('frequency:Q'),
               #alt.Tooltip('Sentiment:Q')
              ]
).add_params(
    selection
).transform_filter(
    selection
).properties(
    width=600,  # Set the width of the chart to your desired value
    height=400,  # Set the height of the chart to your desired value
    title='Sentiment sulle top 30 parole per categoria di sentimento: anno 2022'
).interactive()
#scatter_plot_2022
#scatter_plot_2022.save('chart-wordsfrequency-2022.html')

In [34]:
# Concatenate the three DataFrames into a single DataFrame
df23words = pd.concat([top23pos, top23neg, top23neu], keys=['Positive', 'Negative', 'Neutral'])
options = ['Positive', 'Neutral', 'Negative']
labels = [option + ' ' for option in options]

# Define color domain and range
domain = ['pos', 'neu', 'neg']
range_ = ['#ADFC92', '#788BFF', '#F44E3F']

input_dropdown = alt.binding_radio(
    # Add the empty selection which shows all when clicked
    options=options + [None],
    labels=labels + ['All'],
    name='Selettore sentimento 2023: '
)

selection = alt.selection_point(
    fields=['Sentiment'],
    bind=input_dropdown,
)

# Reset the index of the concatenated DataFrame
df23words = df23words.reset_index(level=0).rename(columns={'level_0': 'Sentiment'})
alt.data_transformers.disable_max_rows()
scatter_plot_2023 = alt.Chart(df23words).mark_point(filled=True).encode(
    alt.X('word', title="Parola"),
    alt.Y('frequency', title="Frequenza").axis(labels=False),
    alt.Size('frequency', title="Frequenza assoluta"),
    alt.OpacityValue(0.7),
    alt.Color('Sentiment', title="Sentimento registrato").scale(domain=options, range=range_),
    tooltip = [alt.Tooltip('word:N', title="parola"),
               alt.Tooltip('frequency:Q', title="frequenza assoluta")
               #alt.Tooltip('Sentiment:Q')
              ]
).add_params(
    selection
).transform_filter(
    selection
).properties(
    width=600,  # Set the width of the chart to your desired value
    height=400,  # Set the height of the chart to your desired value
    title='Sentiment sulle top 30 parole per categoria di sentimento: anno 2023'
).interactive()
#scatter_plot_2023
#scatter_plot_2023.save('chart-wordsfrequency-2023.html')

In [35]:
finalchart_wordsfrequency_2022_2023 = alt.hconcat(scatter_plot_2022, scatter_plot_2023).resolve_scale(y='independent'
).configure_axisX(
    labelAngle=45,  # Specify the degree of inclination here
    labelAlign='left',  # Optional: Align the labels to the right for better readability
    labelPadding=15  # Optional: Add padding to the labels for better spacing
)
finalchart_wordsfrequency_2022_2023.save('wordsfreq_2022_2023.html')
finalchart_wordsfrequency_2022_2023

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analysis model
sia = SentimentIntensityAnalyzer()

In [37]:
def get_top_n_ngrams(corpus, n=None, ngram=2, exclude_keywords=None):
    if exclude_keywords is None:
        exclude_keywords = []
        
    vec = CountVectorizer(ngram_range=(ngram, ngram), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

    # escludo le parole forbidden (poco rilevanti)
    words_freq = [item for item in words_freq if not any(keyword in item[0] for keyword in exclude_keywords)]

    top_ngrams = words_freq[:n]
    return top_ngrams

# raggruppo i trigrammi per mese 
top_trigrams_by_month = df.groupby('Month')['tweetText_proc'].apply(lambda x: get_top_n_ngrams(x, n=10, ngram=3, exclude_keywords=['chat', 'gpt', 'chatgpt','httpstcorlyimpqw', 'ai', 'eth']))

# creo un df per metterci i trigrammi e il mese
df_top_trigrams = pd.DataFrame(top_trigrams_by_month.reset_index())

# espando la lista dei trigrammi su rows distinte
df_top_trigrams = df_top_trigrams.explode('tweetText_proc')

# Split the 'tweet_text' in trigram e frequency
df_top_trigrams[['Trigram', 'Frequency']] = pd.DataFrame(df_top_trigrams['tweetText_proc'].tolist(), index=df_top_trigrams.index)


df_top_trigrams = df_top_trigrams.drop(columns='tweetText_proc')


df_top_trigrams['Sentiment'] = df_top_trigrams['Trigram'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [38]:
print(df_top_trigrams)

    Month                       Trigram  Frequency  Sentiment
0       1         large language models        304     0.0000
0       1   natural language processing        254     0.3612
0       1        medical licensing exam        216     0.0000
0       1  generated based instructions        195     0.0000
0       1         current temp humidity        189     0.0000
..    ...                           ...        ...        ...
8      12              launches poe way        148     0.0000
8      12            quora launches poe        145     0.0000
8      12             red google search        134     0.0000
8      12                 code red amid        132     0.0000
8      12              pros need master        130     0.0000

[90 rows x 4 columns]


In [39]:
months_mapping = {1: 'Gennaio', 2: 'Febbraio', 3: 'Marzo', 4: 'Aprile', 5: 'Maggio', 6: 'Giugno', 9: 'Settembre', 10:'Ottobre', 11:'Novembre', 12:'Dicembre'}

df_top_trigrams['MonthName'] = df_top_trigrams['Month'].map(months_mapping)
df_top_trigrams = df_top_trigrams.sort_values('MonthName')
df_top_trigrams

Unnamed: 0,Month,Trigram,Frequency,Sentiment,MonthName
3,4,banned italy privacy,146,-0.4588,Aprile
3,4,spot recommendation ticker,86,0.0000,Aprile
3,4,recommendation ticker time,86,0.0000,Aprile
3,4,ticker time interval,86,0.0000,Aprile
3,4,special edition guide,84,0.4019,Aprile
...,...,...,...,...,...
6,9,100 3000 week,2,0.0000,Settembre
6,9,home owner charged,2,-0.2023,Settembre
6,9,flying bussing illegals,2,0.0000,Settembre
6,9,trillions dollars members,1,0.0000,Settembre


In [42]:
#month_dropdown = alt.binding_select(options=df_top_trigrams['MonthName'].unique().tolist()+[None])
#month_sel = alt.selection_single(fields=['MonthName'], bind=month_dropdown, name="Seleziona", init={'MonthName': df_top_trigrams['MonthName'].iloc[0]})
optmont = ['Settembre','Ottobre','Dicembre','Gennaio', 'Febbraio', 'Marzo', 'Aprile', 'Maggio', 'Giugno']
labels = [option + ' ' for option in optmont]
month_dropdown = alt.binding_select(
    # Add the empty selection which shows all when clicked
    options=optmont + [None],
    labels=labels + ['All'],
    name='Selettore del periodo: '
)

month_sel = alt.selection_point(
    fields=['MonthName'],
    bind=month_dropdown,
)
#sentiment_scale = alt.Scale(domain=[-1, 1], range=['#FC2E20', '#FD7F20', '#FDB750'])
single = alt.selection_single(on='mouseover', nearest=True)

test = alt.Chart(df_top_trigrams, title=alt.Title('Top Trigrammi sul periodo 2022/2023',
    subtitle= 'Dati relativi al dataset di Tweets in lingua inglese')).mark_bar().encode(
    x=alt.X('Trigram:N', sort='-y', title='Trigramma'),
    y=alt.Y('Frequency:Q', title='Frequenza registrata'),
    color=alt.Color('Sentiment:Q'), #, scale=sentiment_scale),
    tooltip=[alt.Tooltip('Trigram:N', title="Trigramma"),
             alt.Tooltip('Frequency:Q', title="frequenza assoluta")]
).properties(
    width=800,
    height=600,
).configure_axisX(
    labelAngle=45,  # Specify the degree of inclination here
    labelAlign='left',  # Optional: Align the labels to the right for better readability
    labelPadding=15  # Optional: Add padding to the labels for better spacing
).add_selection(month_sel, single
).transform_filter(month_sel
).resolve_scale(y='independent'
).configure_legend(orient='top-right')


In [44]:
test.save('trigrammimensilitwitter.html')

In [None]:
df_exploded = df.explode('hashtags').reset_index(drop=True)

In [None]:
# Parse the month column as a datetime type
df_exploded['date'] = pd.to_datetime(df_exploded['date'])

In [None]:
df_counts = df_exploded.groupby(['hashtags', 'date']).size().reset_index(name='count')

In [None]:
df_counts