<h1>Notebook di prova per Altair</h1>

<h2>Caricamento delle librerie e il dataset</h2>

<p>Import delle librerie necessarie</p>

In [1]:
import pandas as pd
import altair as alt
import altair_viewer as av
from altair import datum
import re
import string
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize

In [2]:
alt.__version__

'5.0.0'

In [3]:
#alt.renderers.enable('notebook')

<p>carico il dataset</p>

In [4]:
df = pd.read_csv('reddit_altair.csv', index_col=0)

In [5]:
df.head(3)

Unnamed: 0,Post ID,Post author,Post title,Post URL,Post flair type,Post score,Post date,Comment ID,Comment author,Comment,Comment score,Comment date,Parent Comment ID,Parent comment author,Sentiment
1,13ucoev,TheHybred,If ChatGPT Cant Access The Internet Then How I...,https://www.reddit.com/r/ChatGPT/comments/13uc...,Jailbreak,4318,2023-05-29 00:10:45,jm090r5,sdmat,reason technical surprisingly nuanced Training...,2482,2023-05-29 01:42:37,13ucoev,TheHybred,pos
2,13ucoev,TheHybred,If ChatGPT Cant Access The Internet Then How I...,https://www.reddit.com/r/ChatGPT/comments/13uc...,Jailbreak,4318,2023-05-29 00:10:45,jm0k235,bojodrop,Slide jailbreak prompt,418,2023-05-29 03:13:53,13ucoev,TheHybred,neu
3,13ucoev,TheHybred,If ChatGPT Cant Access The Internet Then How I...,https://www.reddit.com/r/ChatGPT/comments/13uc...,Jailbreak,4318,2023-05-29 00:10:45,jm0h4ut,opi098514,Easy next line Shes old,631,2023-05-29 02:49:38,13ucoev,TheHybred,pos


<p>stampo info relative al dataset corrente</p>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67978 entries, 1 to 68064
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Post ID                67978 non-null  object
 1   Post author            67978 non-null  object
 2   Post title             67978 non-null  object
 3   Post URL               67978 non-null  object
 4   Post flair type        62993 non-null  object
 5   Post score             67978 non-null  int64 
 6   Post date              67978 non-null  object
 7   Comment ID             67978 non-null  object
 8   Comment author         67978 non-null  object
 9   Comment                67410 non-null  object
 10  Comment score          67978 non-null  int64 
 11  Comment date           67978 non-null  object
 12  Parent Comment ID      67978 non-null  object
 13  Parent comment author  67978 non-null  object
 14  Sentiment              67978 non-null  object
dtypes: int64(2), object

<h2>Processing dei dati testuali</h2>

In [7]:
def pre_process(text):
    if isinstance(text, str):
        # Remove links
        text = re.sub(r'http\S+', '', text)

        # Remove mentions and hashtags
        text = re.sub(r'@\w+|#\w+', '', text)

        # Remove punctuation marks
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize the text
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word.lower() not in stop_words]

        # Remove empty and short words
        tokens = [word for word in tokens if len(word) > 2]

        # Join the tokens back into a single string
        cleaned_text = ' '.join(tokens)

        return cleaned_text
    else:
        return ''

In [8]:
#applico la pulizia del testo
df['Comment'] = df['Comment'].apply(pre_process)

In [9]:
df['Sentiment'].value_counts()

pos    32857
neu    18594
neg    16527
Name: Sentiment, dtype: int64

In [10]:
sentiment_mapping = {'pos': 'Positivo', 'neu': 'Neutrale', 'neg': 'Negativo'}

df['sentiment_complete'] = df['Sentiment'].map(sentiment_mapping)

In [11]:
df['sentiment_complete'].value_counts()

Positivo    32857
Neutrale    18594
Negativo    16527
Name: sentiment_complete, dtype: int64

<h2>Analisi del sentimento attraverso i mesi nei commenti di Reddit</h2>

In [12]:
df['Comment date'] = pd.to_datetime(df['Comment date'], format='%Y-%m-%d %H:%M:%S')

# Esegui la conversione nel formato desiderato "2023-02-05 23:59:16+00:00"
df['Comment date'] = df['Comment date'].dt.strftime('%Y-%m-%d %H:%M:%S+00:00')


# Convert 'date' column to datetime
df['Comment date'] = pd.to_datetime(df['Comment date'])

# Group by month and sentiment
df_grouped = df.groupby([df['Comment date'].dt.to_period('M'), 'sentiment_complete']).size().unstack()

# Convert index to string and extract month names
df_grouped.index = df_grouped.index.strftime('%B %Y')

# Reshape the dataframe for Altair
df_grouped = df_grouped.reset_index().melt('Comment date', var_name='sentiment_complete', value_name='Value')

# Define color domain and range
domain = ['Positivo', 'Negativo', 'Neutrale']
range_ = ['#ADFC92', '#F44E3F', '#788BFF']

# Create the Altair chart
chart = alt.Chart(df_grouped, title=alt.Title(
       "Evoluzione del sentimento verso ChatGPT nell'arco dei mesi",
       subtitle="Visualizzazione dei valori relativi al dataset di commenti"
   )).mark_line(opacity=0.1).encode(
    x=alt.X('Comment date:T', axis=alt.Axis(title='Periodo', format='%B %Y', tickCount=12)),
    y=alt.Y('Value:Q', axis=alt.Axis(title='Numero di commenti')),
    opacity=alt.value(0.9),
    color=alt.Color('sentiment_complete',
                    title="Sentimento registrato",
                    scale=alt.Scale(domain=domain, range=range_),
                    legend=alt.Legend(orient="right"))
).configure_legend(
    orient='top'
).configure_axis(
    grid=False
).properties(
    width=800,
    height=600
).interactive()

chart.save('chartSentimentING_reddit.html')
chart

  df_grouped = df.groupby([df['Comment date'].dt.to_period('M'), 'sentiment_complete']).size().unstack()


In [13]:
df_grouped

Unnamed: 0,Comment date,sentiment_complete,Value
0,April 2023,Negativo,4717
1,May 2023,Negativo,9471
2,June 2023,Negativo,2339
3,April 2023,Neutrale,5138
4,May 2023,Neutrale,10454
5,June 2023,Neutrale,3002
6,April 2023,Positivo,9626
7,May 2023,Positivo,18334
8,June 2023,Positivo,4897


In [14]:
#prendo il dataset, cerco le frequenze delle parole per sentimento (=risultato: tre dataframe, uno per sentimento e relativi a un anno.)
#creo un dataset con 2 colonne (uno per ogni sentimento), le colonne sono: word, frequency
#creo una visualizzazione scatter plot con colore abbinato al sentimento corrente
#mostro tutti i puntini usando un Alt che in descrizione riporta il valore label di word
#mostro una legenda con le top 20 più frequenti

In [15]:
df['Month'] = df['Comment date'].dt.month

In [16]:
def extract_year(value):
    return df[df['Month']==value]

april = extract_year(4)
may = extract_year(5)
june = extract_year(6)

In [17]:
#prendo il dataset, cerco le frequenze delle parole per sentimento.
def words_finder(df, sentiment):
    tokens = []
    df = df[(df['sentiment_complete'] == sentiment)]
    for row in df['Comment']:
        tokens.extend(word_tokenize(str(row).lower()))  # Tokenization and lower conversion of the strings

    word_freq = FreqDist(tokens)
    sorted_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    banned_w = ['chatgpt', 'ai', ' ai', 'ai ', 'gpt', 'openai', 'artificialintelligence', 'amp', 'https', '2022', '2023', 'also', 'still', 'could'] #words to filter out
    sorted_freq = [(word, freq) for word, freq in sorted_freq if word not in banned_w]
    return sorted_freq #frequencies sorted for the choosen sentiment

def createDffreqs(sorted_freq, sentiment): #ritorna un dataframe con una colonna contenente la parola ed una con la sua frequenza assoluta
    freq_df = pd.DataFrame(sorted_freq, columns=['Parola', 'Frequenza'])
    freq_df.sort_values(by='Frequenza', ascending=False, inplace=True)
    freq_df['sentiment_complete'] = sentiment
    return freq_df

In [18]:
aprilpos = createDffreqs(words_finder(april, 'Positivo'), 'Positivo').head(30)
maypos = createDffreqs(words_finder(may, 'Positivo'), 'Positivo').head(30)
junepos = createDffreqs(words_finder(june, 'Positivo'), 'Positivo').head(30)
aprilneg = createDffreqs(words_finder(april, 'Negativo'), 'Negativo').head(30)
mayneg = createDffreqs(words_finder(may, 'Negativo'), 'Negativo').head(30)
juneneg = createDffreqs(words_finder(june, 'Negativo'), 'Negativo').head(30)
aprilneu = createDffreqs(words_finder(april, 'Neutrale'), 'Neutrale').head(30)
mayneu = createDffreqs(words_finder(may, 'Neutrale'), 'Neutrale').head(30)
juneneu = createDffreqs(words_finder(june, 'Neutrale'), 'Neutrale').head(30)

In [19]:
juneneu

Unnamed: 0,Parola,Frequenza,sentiment_complete
0,use,123,Neutrale
1,deleted,117,Neutrale
2,would,92,Neutrale
3,thats,88,Neutrale
4,one,84,Neutrale
5,dont,77,Neutrale
6,bro,72,Neutrale
7,chat,71,Neutrale
8,think,71,Neutrale
9,work,71,Neutrale


In [20]:
# Concatenate the three DataFrames into a single DataFrame
#df_copy = pd.concat([aprilpos, aprilneg, aprilneu], keys=['Positive', 'Negative', 'Neutral'])
#options = ['Positive', 'Neutral', 'Negative']
#labels = [option + ' ' for option in options]

# Define color domain and range
#domain = ['pos', 'neu', 'neg']
#range_ = ['#ADFC92', '#788BFF', '#F44E3F']

#input_dropdown = alt.binding_radio(
    # Add the empty selection which shows all when clicked
#    options=options + [None],
#    labels=labels + ['All'],
#    name='Selettore sentimento: '
#)

#selection = alt.selection_point(
#    fields=['Sentiment'],
#    bind=input_dropdown,
#)

# Reset the index of the concatenated DataFrame
#df_copy = df_copy.reset_index(level=0).rename(columns={'level_0': 'Sentiment'})
#alt.data_transformers.disable_max_rows()
#scatter_plot_april = alt.Chart(df_copy).mark_point(filled=True).encode(
#    alt.X('word',title="Parola"),
#    alt.Y('frequency', title="Frequenza").axis(labels=False),
#    alt.Size('frequency'),
#    alt.OpacityValue(0.7),
#    alt.Color('Sentiment', title="Sentimento registrato").scale(domain=options, range=range_),
#    tooltip = [alt.Tooltip('word:N'),
#               alt.Tooltip('frequency:Q'),
#               alt.Tooltip('frequency:Q'),
#               #alt.Tooltip('Sentiment:Q')
#              ]
#).add_params(
#    selection
#).transform_filter(
#    selection
#).properties(
#    width=600,  # Set the width of the chart to your desired value
#    height=400,  # Set the height of the chart to your desired value
#    title='Sentiment sulle top 30 parole per categoria di sentimento: Aprile'
#).interactive()
#scatter_plot_april
#scatter_plot_april.save('chart-wordsfrequency-april.html')

In [21]:
# Concatenate the three DataFrames into a single DataFrame
#df_copy = pd.concat([maypos, mayneg, mayneu], keys=['Positive', 'Negative', 'Neutral'])
#options = ['Positive', 'Neutral', 'Negative']
#labels = [option + ' ' for option in options]

# Define color domain and range
#domain = ['pos', 'neu', 'neg']
#range_ = ['#ADFC92', '#788BFF', '#F44E3F']

#input_dropdown = alt.binding_radio(
    # Add the empty selection which shows all when clicked
#    options=options + [None],
#    labels=labels + ['All'],
#    name='Selettore sentimento: '
#)

#selection = alt.selection_point(
#    fields=['Sentiment'],
#    bind=input_dropdown,
#)

# Reset the index of the concatenated DataFrame
#df_copy = df_copy.reset_index(level=0).rename(columns={'level_0': 'Sentiment'})
#alt.data_transformers.disable_max_rows()
#scatter_plot_may = alt.Chart(df_copy).mark_point(filled=True).encode(
#    alt.X('word',title="Parola"),
#    alt.Y('frequency', title="Frequenza").axis(labels=False),
#    alt.Size('frequency'),
#    alt.OpacityValue(0.7),
#    alt.Color('Sentiment', title="Sentimento registrato").scale(domain=options, range=range_),
#    tooltip = [alt.Tooltip('word:N'),
#               alt.Tooltip('frequency:Q'),
#               alt.Tooltip('frequency:Q'),
               #alt.Tooltip('Sentiment:Q')
#              ]
#).add_params(
#    selection
#).transform_filter(
#    selection
#).properties(
#    width=600,  # Set the width of the chart to your desired value
#    height=400,  # Set the height of the chart to your desired value
#    title='Sentiment sulle top 30 parole per categoria di sentimento: Maggio'
#).interactive()
#scatter_plot_may
#scatter_plot_may.save('chart-wordsfrequency-may.html')

In [22]:
# Concatenate the three DataFrames into a single DataFrame
#df_copy = pd.concat([junepos, juneneg, juneneu], keys=['Positive', 'Negative', 'Neutral'])
#options = ['Positive', 'Neutral', 'Negative']
#labels = [option + ' ' for option in options]

# Define color domain and range
#domain = ['pos', 'neu', 'neg']
#range_ = ['#ADFC92', '#788BFF', '#F44E3F']

#input_dropdown = alt.binding_radio(
    # Add the empty selection which shows all when clicked
#    options=options + [None],
#    labels=labels + ['All'],
#    name='Selettore sentimento: '
#)

#selection = alt.selection_point(
#    fields=['Sentiment'],
#    bind=input_dropdown,
#)

# Reset the index of the concatenated DataFrame
#df_copy = df_copy.reset_index(level=0).rename(columns={'level_0': 'Sentiment'})
#alt.data_transformers.disable_max_rows()
#scatter_plot_june = alt.Chart(df_copy).mark_point(filled=True).encode(
#    alt.X('word',title="Parola"),
#    alt.Y('frequency', title="Frequenza").axis(labels=False),
#    alt.Size('frequency'),
#    alt.OpacityValue(0.7),
#    alt.Color('Sentiment', title="Sentimento registrato").scale(domain=options, range=range_),
#    tooltip = [alt.Tooltip('word:N'),
#               alt.Tooltip('frequency:Q'),
#               alt.Tooltip('frequency:Q'),
               #alt.Tooltip('Sentiment:Q')
#              ]
#).add_params(
#    selection
#).transform_filter(
#    selection
#).properties(
#    width=600,  # Set the width of the chart to your desired value
#    height=400,  # Set the height of the chart to your desired value
#    title='Sentiment sulle top 30 parole per categoria di sentimento: Giugno'
#).interactive()
#scatter_plot_june
#scatter_plot_june.save('chart-wordsfrequency-june.html')

In [23]:
df_copy = pd.concat([aprilpos, maypos, junepos, aprilneg, mayneg, juneneg, aprilneu, mayneu, juneneu])

df_copy = df_copy.groupby(['sentiment_complete', 'Parola']).agg({'Frequenza': 'sum'}).reset_index().set_index('sentiment_complete')

options = ['Positivo', 'Neutrale', 'Negativo']
labels = [option + ' ' for option in options]

domain = ['Positivo', 'Neutrale', 'Negativo']
range_ = ['#ADFC92', '#788BFF', '#F44E3F']

input_dropdown = alt.binding_radio(
    options=options + [None],
    labels=labels + ['Tutto'],
    name='Selettore sentimento: '
)

selection = alt.selection_point(
    fields=['sentiment_complete'],
    bind=input_dropdown,
)

df_copy = df_copy.reset_index(level=0).rename(columns={'level_0': 'sentiment_complete'})
alt.data_transformers.disable_max_rows()
finalwordschart = alt.Chart(df_copy).mark_point(filled=True).encode(
    alt.X('Parola',title="Parola"),
    alt.Y('Frequenza', title="Frequenza").axis(labels=False),
    alt.Size('Frequenza'),
    alt.OpacityValue(0.7),
    alt.Color('sentiment_complete', title="Sentimento registrato").scale(domain=options, range=range_),
    tooltip = [alt.Tooltip('Parola:N'),
               alt.Tooltip('Frequenza:Q'),
               alt.Tooltip('Frequenza:Q'),
               #alt.Tooltip('Sentiment:Q')
              ]
).add_params(
    selection
).add_selection(
    selection
).configure_axisX(
    labelAngle=45
).transform_filter(
    selection
).properties(
    width=800,  # Set the width of the chart to your desired value (600)
    height=600,  # Set the height of the chart to your desired value (400)
    title='Sentimento sulle 30 parole più frequenti nel periodo aprile-giugno 2023'
).interactive()
finalwordschart.save('wordsfreq_allmonths.html')
finalwordschart



In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the sentiment analysis model
sia = SentimentIntensityAnalyzer()

In [26]:
def get_top_n_ngrams(corpus, n=None, ngram=2, exclude_keywords=None):
    if exclude_keywords is None:
        exclude_keywords = []
        
    vec = CountVectorizer(ngram_range=(ngram, ngram), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

    # escludo le parole forbidden (poco rilevanti)
    words_freq = [item for item in words_freq if not any(keyword in item[0] for keyword in exclude_keywords)]

    top_ngrams = words_freq[:n]
    return top_ngrams

# raggruppo i trigrammi per mese 
top_trigrams_by_month = df.groupby('Month')['Comment'].apply(lambda x: get_top_n_ngrams(x, n=10, ngram=3, exclude_keywords=['chat', 'gpt', 'chatgpt','httpstcorlyimpqw', 'ai', 'eth']))

# creo un df per metterci i trigrammi e il mese
df_top_trigrams = pd.DataFrame(top_trigrams_by_month.reset_index())

# espando la lista dei trigrammi su rows distinte
df_top_trigrams = df_top_trigrams.explode('Comment')

# Split the 'tweet_text' in trigram e frequency
df_top_trigrams[['Trigram', 'Frequency']] = pd.DataFrame(df_top_trigrams['Comment'].tolist(), index=df_top_trigrams.index)


df_top_trigrams = df_top_trigrams.drop(columns='Comment')


df_top_trigrams['Sentimento'] = df_top_trigrams['Trigram'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [27]:
print(df_top_trigrams)

   Month                         Trigram  Frequency  Sentimento
0      4            smart robot response        217      0.4019
0      4        robot response automatic        217      0.0000
0      4               heres think based        110      0.0000
0      4            think based comments        107      0.0000
0      4             based comments like         60      0.3612
0      4                good sense humor         47      0.6124
0      4             smart robot summary         43      0.4019
0      4         robot summary automatic         43      0.0000
0      4          summary automatic tldr         43      0.0000
0      4          automatic tldr shorter         43      0.0000
1      5                ich verstehe nur         74      0.0000
1      5            large language model         69      0.0000
1      5            verstehe nur bahnhof         63      0.0000
1      5                   das kommt mir         62      0.0000
1      5              kommt mir spanisch

In [28]:
months_mapping = {4: 'Aprile', 5: 'Maggio', 6: 'Giugno'}

df_top_trigrams['MonthName'] = df_top_trigrams['Month'].map(months_mapping)
df_top_trigrams = df_top_trigrams.sort_values('MonthName')
df_top_trigrams

Unnamed: 0,Month,Trigram,Frequency,Sentimento,MonthName
0,4,smart robot response,217,0.4019,Aprile
0,4,automatic tldr shorter,43,0.0,Aprile
0,4,robot summary automatic,43,0.0,Aprile
0,4,smart robot summary,43,0.4019,Aprile
0,4,good sense humor,47,0.6124,Aprile
0,4,summary automatic tldr,43,0.0,Aprile
0,4,think based comments,107,0.0,Aprile
0,4,heres think based,110,0.0,Aprile
0,4,robot response automatic,217,0.0,Aprile
0,4,based comments like,60,0.3612,Aprile


In [30]:
#month_dropdown = alt.binding_select(options=df_top_trigrams['MonthName'].unique().tolist()+[None])
#month_sel = alt.selection_single(fields=['MonthName'], bind=month_dropdown, name="Seleziona", init={'MonthName': df_top_trigrams['MonthName'].iloc[0]})
optmont = ['Aprile', 'Maggio', 'Giugno']
labels = [option + ' ' for option in optmont]
month_dropdown = alt.binding_select(
    # Add the empty selection which shows all when clicked
    options=optmont + [None],
    labels=labels + ['Tutto'],
    name='Selettore del periodo: '
)

month_sel = alt.selection_point(
    fields=['MonthName'],
    bind=month_dropdown,
)
#sentiment_scale = alt.Scale(domain=[-1, 1], range=['#FC2E20', '#FD7F20', '#FDB750'])
tooltip = alt.selection_single(fields=['Trigram', 'Frequency'])

test = alt.Chart(df_top_trigrams, title=alt.Title('Top Trigrammi sul periodo aprile-giugno 2023',
    subtitle= 'Dati relativi al dataset di Reddit')).mark_bar().encode(
    x=alt.X('Trigram:N', sort='-y', title='Trigramma'),
    y=alt.Y('Frequency:Q', title='Frequenza registrata'),
    color=alt.Color('Sentimento:Q'), #, scale=sentiment_scale),
    opacity=alt.condition(tooltip, alt.value(1), alt.value(0.3)),
    tooltip=[alt.Tooltip('Trigram:N', title="Trigramma"),
             alt.Tooltip('Frequency:Q', title="Frequenza assoluta")]
).properties(
    width=800,
    height=600,
).configure_axisX(
    labelAngle=45,  # Specify the degree of inclination here
    labelAlign='left',  # Optional: Align the labels to the right for better readability
    labelPadding=15  # Optional: Add padding to the labels for better spacing
).add_selection(month_sel, tooltip
).transform_filter(month_sel
).resolve_scale(y='independent'
).configure_legend(orient='top-right').interactive()

test.save('trigrammimensilireddit.html')
test

