In [1]:
#Import libraries
import pandas as pd
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy

In [2]:
#Import the dataset
df = pd.read_csv("transcriptions_2.csv") #transcriptions
df1=pd.read_csv("all_commercials_classified_filtered.csv") # Nice_categories

In [3]:
#Join the two datasets
df_total=pd.concat([df.set_index('commercial_id'),df1.set_index('commercial_id')], axis=1, join='inner') #Join on commercial_id
#Keep only the transcription with log_prob >=-0.5
df_total=df_total.loc[df_total['log_prob']>=-0.5]

#Remove all the Nan values of nice_class, they are marked by -1
df_total=df_total.loc[df_total['nice_class']>-1]

#Select only the columns of interest: transcriptions (of the descriptions) and nice_class
df_total.drop('title', axis=1, inplace=True)
df_total.drop('log_prob', axis=1, inplace=True)

In [4]:
#Remove the punctuation from the transcriptions
def remove_punctuation(text):
    if pd.isna(text):
        return text
    punctuationfree="".join([i for i in str(text) if i not in string.punctuation])
    return punctuationfree

df_total['transcription'] = df_total['transcription'].apply(remove_punctuation)

#Apply the lower method to strings

df_total['transcription'] = df_total['transcription'].apply(lambda s: s.lower() if type(s) == str else s)

#Remove the stopwords
stopwords = stopwords.words('italian')
stopwords_plus=['cè', 'così', 'oh', 'eh', 'sì', 'to', 'po', 'già', 'mai', 'no', 'ce', 'ah', 'allora', 'me', 'doro']
for element in stopwords_plus:
    stopwords.append(element)
df_total['transcription'] = df_total['transcription'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stopwords)]))

In [5]:
#Lemmatization
nlp = spacy.load("it_core_news_sm")
def lemmatize(testo):
    doc = nlp(testo)
    lemmi = [token.lemma_ for token in doc]
    return " ".join(lemmi)
df_total['transcription']=df_total['transcription'].apply(lemmatize)

In [6]:
df_filtered=(df_total[['transcription', 'nice_class', 'lustrum']]).copy()

In [7]:
df_filtered.head()

Unnamed: 0_level_0,transcription,nice_class,lustrum
commercial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
_3siiuQ3kp0.s5,nascere nuovo fiorino franco Hugo stabilire nu...,12,1990_1994
_LQYpDfplFE,caldo benvenuto fantastico stadio cominciare l...,25,2000_2004
_qPReXLa870,bottiglio coco cola piacento allo chiedila bar...,32,2015_2019
_RIBAprIKQc,sottotitolo creare comunità Amaraorg,14,2010_2014
_VxI8Bo64wQ.s5,ciao parlare parlare ascoltavo messaggio ragaz...,38,2000_2004


# LDA for Nice_class

In [8]:
df_filtered=(df_total[['transcription', 'nice_class']]).copy()

In [9]:
dictionary = {}

for nice_class, group_df in df_filtered.groupby('nice_class'):
    nice_transcriptions = []
    for index, row in group_df.iterrows():
        nice_transcriptions.append(row['transcription'])
    dictionary[nice_class] = nice_transcriptions


In [10]:
X_train=dictionary[3.0]
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [11]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [12]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f'Topic {topic_idx}: '
        message += ', '.join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [13]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0: the, best, like, lux, isre, senza, bluo, mentadent, pelle, bello
Topic 1: nivea, hair, care, vale, segno, lasciare, parola, essere, mus, profumo
Topic 2: sintony, nuovo, studio, capello, vanish, macchia, piatto, nelsen, ace, egoista



In [14]:
import altair as alt

# Funzione per preparare i dati
def get_topic_words_df(lda_model, feature_names, n_top_words):
    data = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]
        
        for feature, weight in zip(top_features, weights):
            data.append({"Topic": topic_idx, "Word": feature, "Weight": weight})
    
    return pd.DataFrame(data)

# Preparazione dei dati per Altair
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)

In [15]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for Nice class 3'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [16]:
X_train=dictionary[5.0]
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [17]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 0: potere, male, testa, effetto, provare, molly, nuovo, avere, sottotitolo, comunità
Topic 1: gola, zerinol, creare, infiammazione, vox, dolore, due, sazio, forza, polase
Topic 2: comes, the, feeling, here, menta, tasta, saco, there, can, foglio



In [18]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for Nice class 5'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [19]:
X_train=dictionary[30.0]
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [20]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [21]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0: रभ, gnamo, kinder, latte, natale, festare, neanche, aa, nonno, vedere
Topic 1: fruit, oro, and, algido, music, riso, givo, love, my, scotto
Topic 2: ci, essere, fonzies, quando, daygum, we, flora, riso, tanto, fresco



In [22]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for Nice class 30'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [23]:
X_train=dictionary[29.0]
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)


n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
Topic 0: creare, comunità, amaraorg, sottotitolo, adesso, rovagnare, wwwamaraorg, qualcosa, altro, uscire
Topic 1: mamma, te, mare, buono, pane, biscotto, gran, arrivo, pronto, solo
Topic 2: activia, sentire, giorno, dopo, tonno, qui, bello, oggi, andare, fare



In [24]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for Nice class 29'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


## LDA only by lustrum

In [25]:
df_filtered=(df_total[['transcription', 'lustrum']]).copy()

In [26]:
dictionary = {}

for nice_class, group_df in df_filtered.groupby('lustrum'):
    nice_transcriptions = []
    for index, row in group_df.iterrows():
        nice_transcriptions.append(row['transcription'])
    dictionary[nice_class] = nice_transcriptions

In [27]:
X_train=dictionary['1980_1984']
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [28]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [29]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0: lux, pelle, buono, fare, cura, bello, sazio, pasqua, fresco, molto
Topic 1: piatto, nelsen, chilo, milione, lira, pagare, mese, mila, sottotitolo, comunità
Topic 2: sintony, apprezzo, scotch, renault, larco, largo, fare, ballantine, conosce, apprezzi



In [30]:
import altair as alt

# Funzione per preparare i dati
def get_topic_words_df(lda_model, feature_names, n_top_words):
    data = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]
        
        for feature, weight in zip(top_features, weights):
            data.append({"Topic": topic_idx, "Word": feature, "Weight": weight})
    
    return pd.DataFrame(data)

# Preparazione dei dati per Altair
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)

In [31]:
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for lustrum 1980_1984'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [32]:
X_train=dictionary['1985_1989']
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [33]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [34]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0: red, lontano, amaraorg, creare, sottotitolo, comunità, momento, solo, nidra, de
Topic 1: रभ, computer, studio, center, dolce, line, potere, amara, valle, verde
Topic 2: the, its, henninger, fruit, tv9, sprinter, fare, givo, comes, gusto



In [35]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for lustrum 1985_1989'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [36]:
X_train=dictionary['2000_2004']
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [37]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [38]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0: ricarico, get, parolo, on, board, parola, solo, and, euro, casa
Topic 1: lancia, lybro, 2004, motore, scoprite, tecnologia, common, 1900, rail, gtd
Topic 2: sottotitolo, creare, comunità, amaraorg, mela, te, amore, vero, neanche, latte



In [39]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for lustrum 2000_2004'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(


In [40]:
X_train=dictionary['2020_2024']
tf_vectorizer = CountVectorizer()#max_df=0.5, min_df=2,max_features = 1000, ngram_range=(1,2))
tf = tf_vectorizer.fit_transform(X_train)

In [41]:
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                learning_method = 'batch',
                                n_jobs=-1,verbose=1)
lda.fit(tf)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [42]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic 0: you, have, praise, sottotitolo, sapere, can, ds, cosa, bello, the
Topic 1: yeah, vale, segno, lasciare, parola, futuro, creare, due, riso, flora
Topic 2: essere, ci, fantasma, vedere, te, ladro, grazie, ghostbusters, quando, sempre



In [43]:
n_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
df_topic_words = get_topic_words_df(lda, tf_feature_names, n_top_words)
chart = alt.Chart(df_topic_words).mark_bar().encode(
    x=alt.X('Weight:Q', title='Weight'),
    y=alt.Y('Word:N', sort='-x', title='Word'),
    color=alt.Color('Topic:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Topic:N', title='Topic', header=alt.Header(titleOrient='bottom'))
).properties(
    width=200,
    height=200,
    title='Top Words per Topic for lustrum 2020_2024'
).configure_axis(
    grid=False
).configure_view(
    strokeWidth=0
)

chart.display()


Hint: Instead of e.g. `is_pandas_dataframe(df)`, did you mean `is_pandas_dataframe(df.to_native())`?
  return _is_pandas_dataframe(obj) or isinstance(
