# Loading packages

In [2]:
import pandas as pd
import os
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import altair as alt
import numpy as np
from sklearn.cluster import DBSCAN, HDBSCAN
from umap import UMAP
import fasttext
import fasttext.util

# Looping through the dataframes, creating label dummies and text vectors.
### You can jump to the 'Clustering' part and use the corresponding merged dataframe

In [2]:
strings_to_check = ['Belföld', 'Külföld', 'Gazdaság', 'Életmód', 'Sport', 'Techtud', 'After', 'English']

# Function to construct binary variables
def construct_binary_variables(row):
    binary_vars = []
    for string in strings_to_check:
        if string in row.values:
            binary_vars.append(1)
        else:
            binary_vars.append(0)
    return binary_vars

# model = SentenceTransformer("all-mpnet-base-v2")

fasttext.util.download_model('hu', if_exists='ignore')
model = fasttext.load_model('cc.hu.300.bin')

#### Using Facebook's text vectorizer, trained on Hungarian text, I add the semantic vector of each text while also adding the label dummies.

In [7]:
# "C:/Users/imre2/Desktop/Telex scrape/EDA" contains the raw scraped files
folder_path = "C:/Users/imre2/Desktop/Telex scrape/EDA"

# Looping through the downloaded files
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        
        # Reading the CSV file
        df = pd.read_csv(file_path)
        
        #################
        # Creating the tag binary variables and attaching them to the dataframe
        binary_df = pd.DataFrame(df.apply(construct_binary_variables, axis=1).tolist(), columns=strings_to_check)
        df = pd.concat([df, binary_df], axis=1)
        
        # Creating the 300 new variables
        na_columns = pd.DataFrame(np.nan, index=df.index, columns=[f'new_var_{i+1}' for i in range(300)])
        df = pd.concat([df, na_columns], axis=1)

        # Dropping those observations for which text is empty (somehow it happened when text is structured in bullet points)
        df = df.dropna(subset=['texts'])

        # Creating the text vectors and attaching them to the datafarme
        for row_index in df.index:
            text_temp = df['texts'][row_index]
            word_vectors = [model.get_word_vector(word) for word in text_temp.split()]
            document_vector = sum(word_vectors) / len(word_vectors)
            df.loc[row_index, [f'new_var_{i+1}' for i in range(300)]] = document_vector
        
        # Dropping empty observations (in some article structures like bullet points only, I got no text)
        df = df.dropna(subset=['texts'])
        #################
        
        # Saving the updated dataframe
        df.to_csv(file_path, index=False)
        
        # Easy check
        print(file_path)

C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_101_110.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_111_120.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_11_20.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_121_130.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_131_140.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_141_150.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_151_160.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_161_170.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_1_10.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_21_30.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_31_40.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_41_50.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_51_60.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_61_70.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_71_80.csv
C:/Users/imre2/Desktop/Telex scrape/EDA\telex_after_81_90

# Merging data

#### I merge the previously transformed files into a single dataframe and save it.

In [8]:
# "C:/Users/imre2/Desktop/Telex scrape/EDA" contains the transformed dataframes.
folder_path = "C:/Users/imre2/Desktop/Telex scrape/EDA"

# List to hold DataFrames
df_list = []

# Iterate through the files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Append the DataFrame to the list
        df_list.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
merged_df = pd.concat(df_list, ignore_index=True)

merged_df = merged_df.drop_duplicates()

# merged_df.to_csv('C:/Users/imre2/Desktop/merged_df.csv', index=False)

In [None]:
merged_df[:5]

Unnamed: 0,link,titles,author_primary,author_secondary,author_tertary,author_quaternary,tag_primary,tag_secondary,tag_tertary,tag_quaternary,...,new_var_291,new_var_292,new_var_293,new_var_294,new_var_295,new_var_296,new_var_297,new_var_298,new_var_299,new_var_300
0,https://telex.hu/after/2024/04/17/metallica-ja...,A Metallica frontembere beletetováltatta a köz...,Flachner Balázs,,,,After,,,,...,0.005884,-0.016356,0.007988,-0.002342,-0.014909,-0.007685,0.012723,-0.006079,0.004763,-0.010547
1,https://telex.hu/after/2024/04/17/szaz-ev-maga...,"Sorozat készül a Száz év magányból, íme az els...",Aradi Hanga Zsófia,,,,After,,,,...,0.004624,-0.016638,0.000857,-0.026097,-0.007956,-0.008374,0.012485,-0.003489,0.009972,-0.007612
2,https://telex.hu/after/2024/04/17/the-jesus-an...,A The Jesus and Mary Chain lesz a Fekete Zaj i...,Antal Bálint,,,,After,,,,...,-0.003449,-0.014265,-0.00097,-0.020453,-0.007297,-0.008354,0.014147,-0.009913,0.008008,-0.015921
3,https://telex.hu/after/2024/04/17/civil-war-po...,"Ez itt Amerika, ahol mindenki lő mindenkire",Klág Dávid,,,,After,,,,...,0.00337,-0.013774,0.011666,-0.014706,-0.003868,-0.001861,0.010073,-0.006801,0.005774,-0.001692
4,https://telex.hu/after/2024/04/17/solo-levelin...,A Solo Leveling egy videójátékozó kamasz leghő...,Flachner Balázs,,,,After,,,,...,-0.002676,-0.016609,0.00702,-0.013395,-0.013368,-0.002592,0.015698,-0.005223,0.011538,-0.005716


# Clustering

#### Using the transformend, merged dataframe, I carry out clustering, using DBSCAN and HDBSCAN. These clustering algorithms utilize the semantic vectors (the 300 variables) to establish a 300-dimension space and carry out the clustering accordingly.

In [3]:
# You should use your own file path

merged_df = pd.read_csv('C:/Users/imre2/Desktop/merged_df.csv')

#merged_df = merged_df[~merged_df['texts'].str.contains('class=', na=False)]
#merged_df = merged_df[~merged_df['texts'].str.contains('https://', na=False)]

#### DBSCAN

In [4]:
dbscan = DBSCAN(eps=1, min_samples=100)
db_clusters = dbscan.fit_predict(merged_df.iloc[:, 11:])

# print(clusters.max())
print(db_clusters.max())

10


In [5]:
merged_df["db_cluster"] = db_clusters
merged_df["db_cluster"].value_counts()

 2     11033
 1      6023
 10     2178
 7      1853
 0      1636
 9      1475
 6       898
 3       524
-1       390
 5       245
 4       178
 8       168
Name: db_cluster, dtype: int64

### HDSBSCAN

In [6]:
hdbscan = HDBSCAN(min_cluster_size=100,min_samples=15)
hdb_clusters = hdbscan.fit_predict(merged_df.iloc[:, 11:])
hdb_clusters.max()

11

In [7]:
merged_df["hdb_cluster"] = hdb_clusters
merged_df["hdb_cluster"].value_counts()

 8     11033
 7      6023
 1      2178
 5      1853
 10     1642
 0      1475
 4       898
 9       524
 3       245
 11      202
-1       182
 2       178
 6       168
Name: hdb_cluster, dtype: int64

# Vizualization

#### Using the results of the clustering procedures, I show that the articles have well-separable semantic vectors. This is supposed to point out that based on the semantic content of the articles, distinct groups can be formed and labelled accordingly.

In [8]:
umap_fit = UMAP(random_state=0, transform_seed=0, n_jobs=1, n_neighbors=10, min_dist= 0.1)
umap_data = umap_fit.fit_transform(merged_df.iloc[:, 11:])
umap_data = pd.DataFrame(umap_data,columns= ["dim1","dim2"])

In [9]:
umap_data[-10:]

Unnamed: 0,dim1,dim2
26591,-0.394644,10.87682
26592,-0.306833,10.350965
26593,0.634955,11.080138
26594,0.464329,11.021667
26595,0.835587,10.611279
26596,-0.321341,10.732587
26597,-0.430516,10.047922
26598,0.597278,10.947031
26599,0.600982,10.856202
26600,0.640174,11.036839


In [10]:
umap_data["db_cluster"] = merged_df["db_cluster"]
umap_data["hdb_cluster"] = merged_df["hdb_cluster"]

##

In [11]:
alt.renderers.enable("mimetype")
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [12]:
db_cluster_plot = alt.Chart(umap_data).mark_circle(size=10).encode(
    x="dim1", y="dim2", color="db_cluster:N"
).properties(
    width=600, height=500, title="UMAP Transformed vectors coloured by dbscan cluster labels"
)

In [13]:
hdb_cluster_plot = alt.Chart(umap_data).mark_circle(size=10).encode(
    x="dim1", y="dim2", color="hdb_cluster:N"
).properties(
    width=600, height=500, title="UMAP Transformed vectors coloured by hdbscan cluster labels"
)

In [32]:
combined_plot = alt.hconcat(db_cluster_plot,hdb_cluster_plot).configure_title(
    fontSize=16,
    anchor="middle",
).configure_legend(
    strokeColor="white",
    fillColor="#AAAAAA",
    padding=10,
    cornerRadius=10,
    labelFontSize=14,
    titleFontSize=14,
).configure_axis(
    titleFontSize=14, labelFontSize=12
)

combined_plot

<VegaLite 5 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/display_frontends.html#troubleshooting


In [23]:
pip install -U altair_viewer

Note: you may need to restart the kernel to use updated packages.




In [33]:
import altair_saver as alt_saver

In [34]:
alt_saver.save(combined_plot, 'combined_plot.png')

NoMatchingVersions: No matches for version='5.17.0' among ['4.0.2', '4.8.1', '4.17.0'].
Often this can be fixed by updating altair_viewer:
    pip install -U altair_viewer

In [142]:
merged_df['texts'][712]
umap_data['dim1'][712], umap_data['dim2'][712]


(-5.2061944, -8.898701)

In [143]:
filtered_df = umap_data[(umap_data['dim1'] >= 4) & (umap_data['dim1'] <= 6) & (umap_data['dim2'] >= 7) & (umap_data['dim2'] <= 10)]

# Step 3: Report the index numbers of the filtered observations
indices = filtered_df.index.tolist()

In [144]:
merged_df['texts'][1701]

'Az elmúlt hetekben valósággal záporoztak a geotermikus bejelentések. A kormány központi geotermikus stratégiát hirdetett, amiről meg is tudtunk néhány részletet. A Mol Nyrt. is hangsúlyozta új stratégiájában, hogy geotermikus irányba nyit. Mint a cég fogalmazott, a hagyományos szénhidrogén-termelésen túl karbonsemleges projektekkel erősít, a meglevő kompetenciákra építve geotermikus kutatásba kezd. A Másfélfok think tank pedig egy szakmai fórumot szervezett, ahol azt is megismerhettük, hogy Mádlné Szőnyi Judit hidrogeológus, az MTA doktora, a téma neves hazai kutatója milyen lehetőségeket lát az iparág előtt.<h2>Vágyak és valóság</h2>A geotermikus energia hazai helyzetéről Vigh Péter, a Másfélfok alapítója használt egy találó képet a szervezet beszélgetésén:„bárki is jut a hazai energetika csúcsára, mindenki azzal kezdi, hogy geotermikus nagyhatalom leszünk, majd végül úgy távozik a posztjáról, hogy ebből semmi nem valósult meg.”Sajnos ez tényleg így szokott történni, pedig az adottsá

In [39]:
filtered_df = umap_data[(umap_data['dim1'] >= 14)]
indices = filtered_df.index.tolist()
indices
index = 120
print(merged_df['author_primary'][indices[index:index+10]])
print(merged_df['tag_primary'][indices[index:index+10]])
print(merged_df['tag_secondary'][indices[index:index+10]])
print(merged_df['tag_tertary'][indices[index:index+10]])
print(merged_df['tag_quaternary'][indices[index:index+10]])

15106          Fehér János
15379     Brückner Gergely
15693            Pál Tamás
15752          Fehér János
16049        Mizsur András
16204        Előd Fruzsina
16257            Pál Tamás
16491     Brückner Gergely
16496    Bakró-Nagy Ferenc
16528    Bakró-Nagy Ferenc
Name: author_primary, dtype: object
15106       Sport
15379       Sport
15693       Sport
15752       Sport
16049    Gazdaság
16204       Sport
16257       Sport
16491    Gazdaság
16496    Gazdaság
16528       Sport
Name: tag_primary, dtype: object
15106    Gazdaság
15379    Gazdaság
15693    Gazdaság
15752    Gazdaság
16049       Sport
16204    Gazdaság
16257    Gazdaság
16491       Sport
16496       Sport
16528    Gazdaság
Name: tag_secondary, dtype: object
15106   NaN
15379   NaN
15693   NaN
15752   NaN
16049   NaN
16204   NaN
16257   NaN
16491   NaN
16496   NaN
16528   NaN
Name: tag_tertary, dtype: float64
15106   NaN
15379   NaN
15693   NaN
15752   NaN
16049   NaN
16204   NaN
16257   NaN
16491   NaN
16496   NaN
1652