In [47]:
import ast
import pandas as pd
import numpy as np

def link_parameters_results(params_file,results_file) :

    ## Loading parameter combinations
    with open(params_file, 'r') as file:
        lines = file.readlines()

    param_id_list = []
    overlap_list = []
    window_length_list = []
    umap_dimension_size_list = []
    umap_n_neighbors_list = []
    partioned_clustering_size_list = []

    for line in lines:
        p = line.strip().split(",")
        param_id_list.append(int(p[0]))
        overlap_list.append(int(p[1]))
        window_length_list.append(int(p[2]))
        umap_dimension_size_list.append(int(p[3]))
        umap_n_neighbors_list.append(int(p[4]))
        partioned_clustering_size_list.append(int(p[5]))

    result_id_list = []
    jaccard_list = []
    puw_list = []
    topic_coherence_list = []

    ## Cluster info lists
    num_clusters_list = []
    number_of_outliers_list = []
    number_of_ones_list = []
    average_probabilities_list = []
    period_cluster_sizes_list = []

    ## Loading corresponding results
    # open file and read lines
    with open(results_file, "r") as file:
        text = file.read()

    # split lines based on double newline
    results = text.strip().split("\n\n")

    for line in results :
        p = line.split("$")
        result_id_list.append(int(p[0]))
        jaccard_list.append(ast.literal_eval(p[1]))
        puw_list.append(ast.literal_eval(p[2]))
        topic_coherence_list.append(ast.literal_eval(p[3]))
        cluster_info = ast.literal_eval(p[4])
        num_clusters_list.append(cluster_info[0])
        number_of_outliers_list.append(cluster_info[1])
        number_of_ones_list.append(cluster_info[2])
        average_probabilities_list.append(cluster_info[3])
        period_cluster_sizes_list.append(cluster_info[4])


    ## Parameters
    overlap = [overlap_list[i] for i in result_id_list]
    window_length = [window_length_list[i] for i in result_id_list]
    umap_dimension_size = [umap_dimension_size_list[i] for i in result_id_list]
    umap_n_neighbors = [umap_n_neighbors_list[i] for i in result_id_list]
    partioned_clustering_size = [partioned_clustering_size_list[i] for i in result_id_list]

    dico = dict()
    dico['ID'] = []
    ## Parameters
    dico['Overlap'] = []
    dico['Window Length'] = []
    dico['UMAP Dim Size'] = []
    dico['UMAP n Neighbors'] = []
    dico['Partitioned Clustering Size'] = []
    dico['Jaccard'] = []
    dico['Coherence'] = []
    dico['Number of timeframes'] = []
    dico['Clusters per timeframe'] = []

    df = pd.DataFrame()
    for i in range(len(result_id_list)) :
        dico['ID'].append(result_id_list[i])
        dico['Overlap'].append(overlap[i])
        dico['Window Length'].append(window_length[i])
        dico['UMAP Dim Size'].append(umap_dimension_size[i])
        dico['UMAP n Neighbors'].append(umap_n_neighbors[i])
        dico['Partitioned Clustering Size'].append(partioned_clustering_size[i])
        dico['Jaccard'].append([round(num, 3) for num in jaccard_list[i]])
        dico['Coherence'].append([round(num, 3) for num in topic_coherence_list[i]])
        dico['Number of timeframes'].append(len(num_clusters_list[i]))
        dico['Clusters per timeframe'].append(num_clusters_list[i])

    #print(data)
    # create list of lists containing only elements corresponding to True in bools
    #filtered_data = [data[i] for i, b in enumerate(bools) if b]

    # create dataframe from filtered data
    #df = pd.DataFrame(filtered_data).T
    return pd.DataFrame(dico)


In [48]:
# Load the files generated by GRIDSEARCHING.ipynb notebook

result = link_parameters_results("pNYT-o2-w3.txt","NYT-o2-w3.txt")
result

Unnamed: 0,ID,Overlap,Window Length,UMAP Dim Size,UMAP n Neighbors,Partitioned Clustering Size,Jaccard,Coherence,Number of timeframes,Clusters per timeframe
0,0,2,3,2,10,10,"[0.999, 0.999, 0.999, 0.999]","[0.383, 0.367, 0.38, 0.374]",5,"[151, 154, 175, 193, 139]"
1,1,2,3,2,10,15,"[0.999, 0.999, 0.999, 0.999]","[0.397, 0.384, 0.386, 0.395]",5,"[103, 104, 112, 117, 4]"
2,2,2,3,2,10,20,"[0.998, 0.999, 1.0, 0.998]","[0.419, 0.421, 0.365, 0.436]",5,"[79, 67, 2, 82, 4]"
3,3,2,3,2,10,50,"[0.996, 1.0, 0.947, 1.0]","[0.484, 0.448, 0.342, 0.414]",5,"[33, 4, 2, 3, 3]"
4,4,2,3,2,10,100,"[1.0, 0.947, 0.996, 1.0]","[0.501, 0.448, 0.505, 0.436]",5,"[2, 2, 14, 2, 2]"
...,...,...,...,...,...,...,...,...,...,...
95,95,2,3,5,100,10,"[0.998, 1.0, 1.0, 1.0]","[0.403, 0.39, 0.391, 0.501]",5,"[128, 2, 3, 2, 121]"
96,96,2,3,5,100,15,"[0.996, 1.0, 1.0, 1.0]","[0.457, 0.483, 0.357, 0.442]",5,"[66, 3, 2, 2, 59]"
97,97,2,3,5,100,20,"[0.997, 0.998, 1.0, 1.0]","[0.443, 0.42, 0.451, 0.47]",5,"[60, 35, 4, 2, 2]"
98,98,2,3,5,100,50,"[0.995, 1.0, 0.998, 1.0]","[0.524, 0.516, 0.472, 0.391]",5,"[25, 5, 24, 2, 2]"


In [49]:
## Average number of topics per period, used in WTQ
arr = np.array(result['Clusters per timeframe'].tolist())
mean_list = np.mean(arr, axis=0)
mean_list

array([63.77, 50.86, 36.8 , 50.47, 39.44])

In [51]:
result[['C1','C2','C3','C4']] = result['Coherence'].apply(lambda x : pd.Series(x))
result[['D1','D2','D3','D4']] = result['Jaccard'].apply(lambda x : pd.Series(x))

result['TQ1'] = result.apply(lambda row: row['D1'] * row['C1'], axis=1)
result['TQ2'] = result.apply(lambda row: row['D2'] * row['C2'], axis=1)
result['TQ3'] = result.apply(lambda row: row['D3'] * row['C3'], axis=1)
result['TQ4'] = result.apply(lambda row: row['D4'] * row['C4'], axis=1)

result[['N1','N2','N3','N4','N5']] = result['Clusters per timeframe'].apply(lambda x : pd.Series(x))

result['WTQ1'] = result.apply(lambda row: row['TQ1'] * (row['N1'] / mean_list[0]) , axis=1)
result['WTQ2'] = result.apply(lambda row: row['TQ2'] * (row['N2'] / mean_list[1]), axis=1)
result['WTQ3'] = result.apply(lambda row: row['TQ3'] * (row['N3'] / mean_list[2]), axis=1)
result['WTQ4'] = result.apply(lambda row: row['TQ4'] * (row['N4'] / mean_list[3]), axis=1)

result = result.drop(columns=['Jaccard','Coherence','C1','C2','C3','C4','D1','D2','D3','D4','Clusters per timeframe','Number of timeframes','Overlap','Window Length', 'N5'])

result

Unnamed: 0,ID,UMAP Dim Size,UMAP n Neighbors,Partitioned Clustering Size,TQ1,TQ2,TQ3,TQ4,N1,N2,N3,N4,WTQ1,WTQ2,WTQ3,WTQ4
0,0,2,10,10,0.382617,0.366633,0.379620,0.373626,151,154,175,193,0.905993,1.110135,1.805258,1.428766
1,1,2,10,15,0.396603,0.383616,0.385614,0.394605,103,104,112,117,0.640585,0.784429,1.173608,0.914777
2,2,2,10,20,0.418162,0.420579,0.365000,0.435128,79,67,2,82,0.518030,0.554046,0.019837,0.706964
3,3,2,10,50,0.482064,0.448000,0.323874,0.414000,33,4,2,3,0.249461,0.035234,0.017602,0.024609
4,4,2,10,100,0.501000,0.424256,0.502980,0.436000,2,2,14,2,0.015713,0.016683,0.191351,0.017278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,5,100,10,0.402194,0.390000,0.391000,0.501000,128,2,3,2,0.807289,0.015336,0.031875,0.019853
96,96,5,100,15,0.455172,0.483000,0.357000,0.442000,66,3,2,2,0.471089,0.028490,0.019402,0.017515
97,97,5,100,20,0.441671,0.419160,0.451000,0.470000,60,35,4,2,0.415560,0.288451,0.049022,0.018625
98,98,5,100,50,0.521380,0.516000,0.471056,0.391000,25,5,24,2,0.204399,0.050727,0.307210,0.015494


In [53]:
result['mean TQ'] = result[['TQ1', 'TQ2', 'TQ3', 'TQ4']].mean(axis=1)
result['mean WTQ'] = result[['WTQ1','WTQ2','WTQ3','WTQ4']].mean(axis=1)

result

Unnamed: 0,ID,UMAP Dim Size,UMAP n Neighbors,Partitioned Clustering Size,TQ1,TQ2,TQ3,TQ4,N1,N2,N3,N4,WTQ1,WTQ2,WTQ3,WTQ4,mean TQ,mean WTQ
0,0,2,10,10,0.382617,0.366633,0.379620,0.373626,151,154,175,193,0.905993,1.110135,1.805258,1.428766,0.375624,1.312538
1,1,2,10,15,0.396603,0.383616,0.385614,0.394605,103,104,112,117,0.640585,0.784429,1.173608,0.914777,0.390110,0.878350
2,2,2,10,20,0.418162,0.420579,0.365000,0.435128,79,67,2,82,0.518030,0.554046,0.019837,0.706964,0.409717,0.449720
3,3,2,10,50,0.482064,0.448000,0.323874,0.414000,33,4,2,3,0.249461,0.035234,0.017602,0.024609,0.416984,0.081726
4,4,2,10,100,0.501000,0.424256,0.502980,0.436000,2,2,14,2,0.015713,0.016683,0.191351,0.017278,0.466059,0.060256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,5,100,10,0.402194,0.390000,0.391000,0.501000,128,2,3,2,0.807289,0.015336,0.031875,0.019853,0.421049,0.218588
96,96,5,100,15,0.455172,0.483000,0.357000,0.442000,66,3,2,2,0.471089,0.028490,0.019402,0.017515,0.434293,0.134124
97,97,5,100,20,0.441671,0.419160,0.451000,0.470000,60,35,4,2,0.415560,0.288451,0.049022,0.018625,0.445458,0.192914
98,98,5,100,50,0.521380,0.516000,0.471056,0.391000,25,5,24,2,0.204399,0.050727,0.307210,0.015494,0.474859,0.144458


# TABLE 1 : HIGHEST MEAN TQ

In [55]:
result = result.sort_values(by='mean TQ', ascending=False)
table1 = result.head(5)

columns_to_process = ['TQ1', 'TQ2', 'TQ3', 'TQ4', 'WTQ1', 'WTQ2', 'WTQ3', 'WTQ4', 'mean TQ', 'mean WTQ']  # List the columns you want to process

for column in columns_to_process:
    table1[column] = table1[column].round(3)  # Keep only three decimal places
    table1[column] = table1[column].apply(lambda x: f"{x:.3f} ± 0.001")  # Add ±0.001 error margin

# Output the updated dataframe
table1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table1[column] = table1[column].round(3)  # Keep only three decimal places
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table1[column] = table1[column].apply(lambda x: f"{x:.3f} ± 0.001")  # Add ±0.001 error margin


Unnamed: 0,ID,UMAP Dim Size,UMAP n Neighbors,Partitioned Clustering Size,TQ1,TQ2,TQ3,TQ4,N1,N2,N3,N4,WTQ1,WTQ2,WTQ3,WTQ4,mean TQ,mean WTQ
23,23,2,100,50,0.538 ± 0.001,0.551 ± 0.001,0.485 ± 0.001,0.440 ± 0.001,24,19,17,3,0.202 ± 0.001,0.206 ± 0.001,0.224 ± 0.001,0.026 ± 0.001,0.503 ± 0.001,0.165 ± 0.001
19,19,2,50,100,0.538 ± 0.001,0.517 ± 0.001,0.481 ± 0.001,0.463 ± 0.001,18,16,12,5,0.152 ± 0.001,0.163 ± 0.001,0.157 ± 0.001,0.046 ± 0.001,0.500 ± 0.001,0.129 ± 0.001
53,53,4,10,50,0.503 ± 0.001,0.504 ± 0.001,0.468 ± 0.001,0.493 ± 0.001,28,25,33,3,0.221 ± 0.001,0.248 ± 0.001,0.419 ± 0.001,0.029 ± 0.001,0.492 ± 0.001,0.229 ± 0.001
78,78,5,10,50,0.503 ± 0.001,0.504 ± 0.001,0.468 ± 0.001,0.493 ± 0.001,28,25,33,3,0.221 ± 0.001,0.248 ± 0.001,0.419 ± 0.001,0.029 ± 0.001,0.492 ± 0.001,0.229 ± 0.001
28,28,3,10,50,0.503 ± 0.001,0.504 ± 0.001,0.468 ± 0.001,0.493 ± 0.001,28,25,33,3,0.221 ± 0.001,0.248 ± 0.001,0.419 ± 0.001,0.029 ± 0.001,0.492 ± 0.001,0.229 ± 0.001


In [40]:
table1.to_csv('table1.csv')

In [56]:
columns_to_check = ['WTQ1', 'WTQ2', 'WTQ3', 'WTQ4']  # List the columns to check for values

# Create a boolean mask to identify rows that meet the condition
mask = result[columns_to_check] < 0.3

# we Use the 'any' function along the rows axis (axis=1) to check if any WTQ column has a value lower than 0.3 (meaning a very low number of topics found in a certain period)
rows_to_remove = mask.any(axis=1)

# Remove rows that meet the condition
table2 = result.loc[~rows_to_remove]

# Output the filtered dataframe
table2

Unnamed: 0,ID,UMAP Dim Size,UMAP n Neighbors,Partitioned Clustering Size,TQ1,TQ2,TQ3,TQ4,N1,N2,N3,N4,WTQ1,WTQ2,WTQ3,WTQ4,mean TQ,mean WTQ
77,77,5,10,20,0.422577,0.413586,0.388611,0.401598,80,66,82,91,0.530126,0.536702,0.865927,0.724102,0.406593,0.664214
27,27,3,10,20,0.422577,0.413586,0.388611,0.401598,80,66,82,91,0.530126,0.536702,0.865927,0.724102,0.406593,0.664214
52,52,4,10,20,0.422577,0.413586,0.388611,0.401598,80,66,82,91,0.530126,0.536702,0.865927,0.724102,0.406593,0.664214
41,41,3,50,15,0.410178,0.393606,0.394605,0.416166,76,89,74,88,0.488843,0.688772,0.793499,0.725631,0.403639,0.674186
66,66,4,50,15,0.410178,0.393606,0.394605,0.416166,76,89,74,88,0.488843,0.688772,0.793499,0.725631,0.403639,0.674186
91,91,5,50,15,0.410178,0.393606,0.394605,0.416166,76,89,74,88,0.488843,0.688772,0.793499,0.725631,0.403639,0.674186
51,51,4,10,15,0.406593,0.3996,0.390609,0.397602,102,95,117,119,0.650345,0.746402,1.241882,0.93748,0.398601,0.894027
26,26,3,10,15,0.406593,0.3996,0.390609,0.397602,102,95,117,119,0.650345,0.746402,1.241882,0.93748,0.398601,0.894027
76,76,5,10,15,0.406593,0.3996,0.390609,0.397602,102,95,117,119,0.650345,0.746402,1.241882,0.93748,0.398601,0.894027
1,1,2,10,15,0.396603,0.383616,0.385614,0.394605,103,104,112,117,0.640585,0.784429,1.173608,0.914777,0.39011,0.87835


In [57]:
import pandas as pd

# Now we sort the filtered array by mean TQ
table2 = table2.sort_values(by='mean TQ', ascending=False)
table2 = table2.head(5)

# Assuming your dataframe is named 'df'
columns_to_process = ['TQ1', 'TQ2', 'TQ3', 'TQ4', 'WTQ1', 'WTQ2', 'WTQ3', 'WTQ4', 'mean TQ', 'mean WTQ']  # List the columns you want to process

for column in columns_to_process:
    table2[column] = table2[column].round(3)  # Keep only three decimal places
    table2[column] = table2[column].apply(lambda x: f"{x:.3f} ± 0.001")  # Add ±0.001 error margin

# Output the updated dataframe
table2

Unnamed: 0,ID,UMAP Dim Size,UMAP n Neighbors,Partitioned Clustering Size,TQ1,TQ2,TQ3,TQ4,N1,N2,N3,N4,WTQ1,WTQ2,WTQ3,WTQ4,mean TQ,mean WTQ
77,77,5,10,20,0.423 ± 0.001,0.414 ± 0.001,0.389 ± 0.001,0.402 ± 0.001,80,66,82,91,0.530 ± 0.001,0.537 ± 0.001,0.866 ± 0.001,0.724 ± 0.001,0.407 ± 0.001,0.664 ± 0.001
52,52,4,10,20,0.423 ± 0.001,0.414 ± 0.001,0.389 ± 0.001,0.402 ± 0.001,80,66,82,91,0.530 ± 0.001,0.537 ± 0.001,0.866 ± 0.001,0.724 ± 0.001,0.407 ± 0.001,0.664 ± 0.001
27,27,3,10,20,0.423 ± 0.001,0.414 ± 0.001,0.389 ± 0.001,0.402 ± 0.001,80,66,82,91,0.530 ± 0.001,0.537 ± 0.001,0.866 ± 0.001,0.724 ± 0.001,0.407 ± 0.001,0.664 ± 0.001
41,41,3,50,15,0.410 ± 0.001,0.394 ± 0.001,0.395 ± 0.001,0.416 ± 0.001,76,89,74,88,0.489 ± 0.001,0.689 ± 0.001,0.793 ± 0.001,0.726 ± 0.001,0.404 ± 0.001,0.674 ± 0.001
66,66,4,50,15,0.410 ± 0.001,0.394 ± 0.001,0.395 ± 0.001,0.416 ± 0.001,76,89,74,88,0.489 ± 0.001,0.689 ± 0.001,0.793 ± 0.001,0.726 ± 0.001,0.404 ± 0.001,0.674 ± 0.001


In [46]:
table2.to_csv('table2.csv')