In [1]:
import json
import pandas as pd
import numpy as np
import statistics
import os
import matplotlib.pyplot as plt
import seaborn as sns

# BACK

In [29]:
def compare_dict_with_df(dictionary, df, key_column, value_column):
    """
    Сравнивает значения из словаря со значениями в датафрейме.
    
    Parameters:
    dictionary (dict): Словарь для сравнения
    df (pandas.DataFrame): Датафрейм для сравнения
    key_column (str): Имя столбца с ключами в датафрейме
    value_column (str): Имя столбца со значениями для сравнения
    
    Returns:
    dict: Словарь с результатами сравнения
    """
    results = {
        'matching': [],
        'different': [],
        'not_found_in_df': [],
        'not_found_in_dict': []
    }
    
    # Создаем множества ключей для быстрого поиска
    dict_keys = set(dictionary.keys())
    df_keys = set(df[key_column])
    
    # Проверяем каждый ключ из словаря
    for key in dict_keys:
        if key in df_keys:
            df_value = df[df[key_column] == key][value_column].iloc[0]
            dict_value = dictionary[key]
            
            if df_value == dict_value:
                results['matching'].append(key)
            else:
                results['different'].append({
                    'key': key,
                    'dict_value': dict_value,
                    'df_value': df_value
                })
        else:
            results['not_found_in_df'].append(key)
    
    # Находим ключи, которые есть в датафрейме, но нет в словаре
    results['not_found_in_dict'] = list(df_keys - dict_keys)
    
    return results

In [23]:
with open ("Alexey_results/coldstart_dump_Q2_K.json", "r", encoding="utf-8") as f:
    f1 = json.load(f)

In [26]:
print(f1.keys(), "\n")
print(len(f1['top_20_keywords']['Probability']))

# f1['Qwen/Qwen2.5-Coder-1.5B']['top_20_keywords']['Probability']

dict_keys(['total_prob_keywords', 'total_prob_special_tokens', 'avg_prob_keywords', 'avg_prob_special_tokens', 'top_20_keywords', 'top_20_special_tokens', 'total_prob_natural_language', 'indentation_probabilities']) 

276


In [27]:
# print(f1["Qwen/Qwen2.5-Coder-1.5B"].keys(), "\n")
# print(len(f1['Qwen/Qwen2.5-Coder-1.5B']['top_20_keywords']['Probability']))

# # f1['Qwen/Qwen2.5-Coder-1.5B']['top_20_keywords']['Probability']

In [22]:
table1 = pd.read_csv("Alexey_results/cold_start_probabilities_keywords_Q2_K.csv")

table1

Unnamed: 0.1,Unnamed: 0,Probability
0,restrict,3.397835e-07
1,and_eq,0.000000e+00
2,event,2.143958e-06
3,loop,3.120154e-06
4,not_eq,0.000000e+00
...,...,...
271,match,1.973994e-06
272,as,9.921478e-06
273,_Alignof,0.000000e+00
274,unsized,0.000000e+00


In [30]:
t2 = compare_dict_with_df(f1['top_20_keywords']['Probability'], table1, "Unnamed: 0", "Probability")

# Aggregate csv files

In [69]:
def merge_probability_csvs(directory_path: str, keywords: bool) -> pd.DataFrame:
    """
    Объединяет CSV файлы с вероятностями слов в единый датафрейм.
    
    Parameters:
    directory_path (str): Путь к директории с CSV файлами
    
    Returns:
    pandas.DataFrame: Объединенный датафрейм со словами и вероятностями из всех файлов
    """
    dfs = []
    
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory_path, filename)
            df = pd.read_csv(file_path)
            filename_splitted = filename[:-4].strip().split("_")
            if keywords:
                filename_to_add = '_'.join(filename_splitted[4:])
            else:
                filename_to_add = '_'.join(filename_splitted[5:])
            
            df = df.rename(columns={
                'Unnamed: 0': 'word',
                'Probability': f'Probability_{filename_to_add}'
            })
            
            dfs.append(df)
    
    result = dfs[0][['word']].copy()
    
    for df in dfs:
        prob_column = [col for col in df.columns if col.startswith('Probability_')][0]
        result = result.merge(
            df[['word', prob_column]], 
            on='word', 
            how='outer'
        )
    
    # result = result.fillna('Missed')
    
    return result

In [70]:
keywords_path = "Alexey_results/keywords"
spec_tokens_path = "Alexey_results/special_tokens"

In [71]:
agg_table_kw = merge_probability_csvs(keywords_path, keywords=True)

agg_table_kw

Unnamed: 0,word,Probability_Q4_K_M,Probability_Q3_K_M,Probability_Q8_0,Probability_Q5_K_S,Probability_Q3_K_L,Probability_Q4_1,Probability_Q5_0,Probability_Q6_K,Probability_Q5_K_M,Probability_Q4_K_S,Probability_Q5_1,Probability_Q4_0,Probability_Q3_K_S,Probability_Q2_K
0,restrict,3.662968e-07,3.882846e-07,3.178558e-07,4.199332e-07,3.987753e-07,2.771521e-07,3.646345e-07,3.215081e-07,3.399913e-07,4.517719e-07,3.039383e-07,3.075358e-07,2.591902e-07,3.397835e-07
1,and_eq,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,event,1.650353e-06,1.799586e-06,1.443143e-06,2.065618e-06,2.049010e-06,1.391842e-06,1.655910e-06,1.685397e-06,1.651487e-06,1.772026e-06,1.661246e-06,1.551659e-06,9.705177e-07,2.143958e-06
3,loop,2.032908e-06,2.366073e-06,1.963170e-06,2.198948e-06,2.017869e-06,2.452053e-06,2.033295e-06,2.166746e-06,2.237513e-06,2.292872e-06,2.192680e-06,1.484075e-06,1.235796e-06,3.120154e-06
4,not_eq,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,match,1.653692e-06,1.878893e-06,1.442134e-06,1.526098e-06,1.441359e-06,1.399984e-06,1.508741e-06,1.346698e-06,1.392937e-06,2.291796e-06,1.310143e-06,1.652693e-06,1.036851e-06,1.973994e-06
272,as,7.377701e-06,4.258249e-06,6.326363e-06,7.030476e-06,6.538212e-06,6.520302e-06,9.555511e-06,7.175287e-06,6.603195e-06,5.219177e-06,7.469726e-06,4.742152e-06,7.636422e-06,9.921478e-06
273,_Alignof,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
274,unsized,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [72]:
agg_table_st = merge_probability_csvs(spec_tokens_path, keywords=False)

agg_table_st

Unnamed: 0,word,Probability_Q4_K_S,Probability_Q3_K_M,Probability_Q4_1,Probability_Q3_K_L,Probability_Q4_0,Probability_Q8_0,Probability_Q5_K_M,Probability_Q2_K,Probability_Q5_0,Probability_Q3_K_S,Probability_Q6_K,Probability_Q5_1,Probability_Q5_K_S,Probability_Q4_K_M
0,-webpack,2.599572e-08,3.278047e-08,1.981244e-08,2.666512e-08,1.833117e-08,2.013024e-08,2.075888e-08,1.764061e-08,2.495615e-08,8.006577e-09,2.162730e-08,2.187573e-08,2.045439e-08,2.092959e-08
1,(grammar,7.408705e-10,4.317356e-10,5.467580e-10,4.991672e-10,9.182849e-10,5.314209e-10,5.392609e-10,1.815152e-09,5.741210e-10,2.056540e-10,5.931368e-10,5.521066e-10,6.821107e-10,6.697374e-10
2,._Ċ,1.456061e-07,1.565988e-07,1.498381e-07,1.358484e-07,9.289172e-08,9.942131e-08,9.647282e-08,1.415225e-07,1.362116e-07,9.890429e-08,1.122818e-07,1.288574e-07,1.154350e-07,1.288312e-07
3,.slides,2.263071e-08,2.959786e-08,1.614887e-08,1.899562e-08,1.182144e-08,1.214499e-08,1.185342e-08,1.668852e-08,1.328897e-08,1.209590e-08,1.296099e-08,1.488036e-08,1.274108e-08,1.480743e-08
4,>N,1.139762e-07,9.207321e-08,1.002480e-07,1.008208e-07,9.854616e-08,1.074605e-07,1.055509e-07,7.710585e-08,1.014930e-07,5.298809e-08,1.028685e-07,1.261580e-07,1.193871e-07,1.202939e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20177,.Invalid,1.025477e-08,1.254630e-08,1.253611e-08,1.082999e-08,7.210578e-09,7.920841e-09,8.665265e-09,7.470971e-09,8.883094e-09,7.585913e-09,8.605520e-09,8.502976e-09,9.026393e-09,1.068877e-08
20178,${,1.387774e-06,1.280509e-06,1.217895e-06,1.083910e-06,1.316193e-06,1.361338e-06,1.397680e-06,2.152901e-06,1.421070e-06,1.239700e-06,1.511090e-06,1.297331e-06,1.412287e-06,1.317665e-06
20179,-worker,2.627938e-08,2.124446e-08,1.676017e-08,2.001423e-08,2.851736e-08,1.885026e-08,1.870603e-08,7.241764e-09,2.071465e-08,6.934033e-09,2.070666e-08,2.299437e-08,2.001298e-08,2.339997e-08
20180,.Class,4.722031e-08,5.853724e-08,3.306519e-08,5.104942e-08,3.256079e-08,3.384266e-08,3.314436e-08,3.291224e-08,3.957415e-08,3.866077e-08,3.716590e-08,3.974105e-08,3.409798e-08,4.937180e-08


In [23]:
agg_table_kw.to_csv("Alexey_results/agg_keywords.csv", index=False)
agg_table_st.to_csv("Alexey_results/agg_spec_tokens.csv", index=False)

# Check results

In [51]:
agg_table_kw = pd.read_csv("Alexey_results/agg_keywords.csv")
agg_table_st= pd.read_csv("Alexey_results/agg_spec_tokens.csv")

## Check NaNs

In [52]:
agg_table_kw.isna().sum()

word                  1
Probability_Q2_K      0
Probability_Q3_K_L    0
Probability_Q3_K_M    0
Probability_Q3_K_S    0
Probability_Q4_0      0
Probability_Q4_1      0
Probability_Q4_K_M    0
Probability_Q4_K_S    0
Probability_Q5_0      0
Probability_Q5_1      0
Probability_Q5_K_M    0
Probability_Q5_K_S    0
Probability_Q6_K      0
Probability_Q8_0      0
dtype: int64

In [53]:
agg_table_kw[agg_table_kw["word"].isna()]

Unnamed: 0,word,Probability_Q2_K,Probability_Q3_K_L,Probability_Q3_K_M,Probability_Q3_K_S,Probability_Q4_0,Probability_Q4_1,Probability_Q4_K_M,Probability_Q4_K_S,Probability_Q5_0,Probability_Q5_1,Probability_Q5_K_M,Probability_Q5_K_S,Probability_Q6_K,Probability_Q8_0
263,,4e-06,4e-06,4e-06,2e-06,4e-06,4e-06,5e-06,5e-06,4e-06,5e-06,5e-06,5e-06,5e-06,5e-06


In [54]:
agg_table_st.isna().sum()

word                  0
Probability_Q2_K      0
Probability_Q3_K_L    0
Probability_Q3_K_M    0
Probability_Q3_K_S    0
Probability_Q4_0      0
Probability_Q4_1      0
Probability_Q4_K_M    0
Probability_Q4_K_S    0
Probability_Q5_0      0
Probability_Q5_1      0
Probability_Q5_K_M    0
Probability_Q5_K_S    0
Probability_Q6_K      0
Probability_Q8_0      0
dtype: int64

We don't have missed values

## Try to visualize

In [55]:
def create_probability_visualizations(df, output_dir='plots', n_top_words=30, figsize=(15, 10), dpi=300):
    """
    Создает и сохраняет визуализации для вероятностей слов
    
    Parameters:
    df (pandas.DataFrame): Датафрейм со словами и вероятностями
    output_dir (str): Директория для сохранения графиков
    n_top_words (int): Количество топ слов для отображения
    figsize (tuple): Размер графика
    dpi (int): Разрешение графиков
    """
    import os
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    prob_columns = [col for col in df.columns if col.startswith('Probability_')]
    # Создаем словарь для преобразования имен колонок
    column_names = {col: col.replace('Probability_', '') for col in prob_columns}
    
    # 1. Тепловая карта для топ слов
    def plot_heatmap(df, prob_columns, n_top_words):
        word_means = df[prob_columns].replace('Missed', np.nan).astype(float).mean(axis=1)
        top_words = df.iloc[word_means.nlargest(n_top_words).index]
        
        plt.figure(figsize=figsize)
        data = top_words[prob_columns].replace('Missed', np.nan).astype(float)
        
        # Используем сокращенные имена колонок
        data.columns = [column_names[col] for col in data.columns]
        
        sns.heatmap(data, 
                   yticklabels=top_words['word'],
                   cmap='YlOrRd',
                   center=0.5,
                   vmin=0,
                   vmax=1,
                   cbar_kws={'label': 'Probability'})
        plt.title(f'Probability Heatmap for Top {n_top_words} Words')
        plt.xlabel('Models')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/heatmap.png', dpi=dpi, bbox_inches='tight')
        plt.close()
    
    def plot_boxplot(df, prob_columns):
        plt.figure(figsize=figsize)
        data_melted = df.melt(id_vars=['word'], 
                             value_vars=prob_columns,
                             var_name='Model',
                             value_name='Probability')
        data_melted['Probability'] = pd.to_numeric(data_melted['Probability']
                                                 .replace('Missed', np.nan))
        
        data_melted['Model'] = data_melted['Model'].map(column_names)
        
        sns.boxplot(x='Model', y='Probability', data=data_melted)
        
        # Для каждой модели находим два слова с наибольшей вероятностью и добавляем их на график
        for i, model in enumerate(data_melted['Model'].unique()):
            model_data = data_melted[data_melted['Model'] == model].sort_values('Probability', ascending=False)
            top_2_words = model_data.head(2)
            
            for _, row in top_2_words.iterrows():
                plt.plot(i, row['Probability'], 'ro')
                plt.annotate(row['word'], 
                            xy=(i, row['Probability']),
                            xytext=(5, 5),
                            textcoords='offset points',
                            fontsize=8,
                            color='red')
        
        plt.title('Distribution of Probabilities Across Models')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/boxplot.png', dpi=dpi, bbox_inches='tight')
        plt.close()
    
    def plot_model_comparison(df, prob_columns):
        if len(prob_columns) >= 2:
            plt.figure(figsize=figsize)
            data = df[prob_columns].replace('Missed', np.nan).astype(float)
            plt.scatter(data[prob_columns[0]], 
                       data[prob_columns[1]], 
                       alpha=0.5)
            
            plt.xlabel(column_names[prob_columns[0]])
            plt.ylabel(column_names[prob_columns[1]])
            plt.title('Model Comparison')
            
            max_val = max(data[prob_columns[0]].max(), data[prob_columns[1]].max())
            min_val = min(data[prob_columns[0]].min(), data[prob_columns[1]].min())
            plt.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.5)
            
            plt.tight_layout()
            plt.savefig(f'{output_dir}/scatter.png', dpi=dpi, bbox_inches='tight')
            plt.close()
    
    # 4. Distribution plot
    def plot_distributions(df, prob_columns):
        plt.figure(figsize=figsize)
        data = df[prob_columns].replace('Missed', np.nan).astype(float)
        
        for col in prob_columns:
            sns.kdeplot(data=data[col], label=column_names[col])
            
        plt.title('Probability Distributions Across Models')
        plt.xlabel('Probability')
        plt.ylabel('Density')
        plt.xlim(-0.01, 0.01)
        # plt.ylim(0.01, 600)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/distributions.png', dpi=dpi, bbox_inches='tight')
        plt.close()
    
    plot_heatmap(df, prob_columns, n_top_words)
    plot_boxplot(df, prob_columns)
    plot_model_comparison(df, prob_columns)
    plot_distributions(df, prob_columns)
    
    print("\nBasic Statistics:")
    stats = df[prob_columns].replace('Missed', np.nan).astype(float).describe()
    stats.columns = [column_names[col] for col in stats.columns]
    print(stats)
    
    stats.to_csv(f'{output_dir}/statistics.csv')

In [67]:
create_probability_visualizations(agg_table_kw, output_dir="plots/plots_keywords", n_top_words=30)


Basic Statistics:
             Q2_K      Q3_K_L      Q3_K_M        Q3_K_S        Q4_0  \
count  276.000000  276.000000  276.000000  2.760000e+02  276.000000   
mean     0.000327    0.000404    0.000434  3.431290e-04    0.000316   
std      0.002197    0.002938    0.003240  2.436581e-03    0.002152   
min      0.000000    0.000000    0.000000  0.000000e+00    0.000000   
25%      0.000000    0.000000    0.000000  0.000000e+00    0.000000   
50%      0.000001    0.000001    0.000001  6.801837e-07    0.000001   
75%      0.000013    0.000014    0.000014  9.018675e-06    0.000012   
max      0.030746    0.040127    0.046755  3.180285e-02    0.025148   

             Q4_1      Q4_K_M      Q4_K_S        Q5_0        Q5_1      Q5_K_M  \
count  276.000000  276.000000  276.000000  276.000000  276.000000  276.000000   
mean     0.000357    0.000387    0.000446    0.000392    0.000402    0.000396   
std      0.002525    0.002902    0.003575    0.002874    0.002938    0.002878   
min      0.000000

In [68]:
create_probability_visualizations(agg_table_st, output_dir="plots/plots_stop_tokens", n_top_words=30)


Basic Statistics:
               Q2_K        Q3_K_L        Q3_K_M        Q3_K_S          Q4_0  \
count  2.018200e+04  2.018200e+04  2.018200e+04  2.018200e+04  2.018200e+04   
mean   2.104736e-05  1.629611e-05  1.644915e-05  2.054649e-05  1.661217e-05   
std    1.782696e-03  1.260406e-03  1.281238e-03  1.712086e-03  1.245230e-03   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    2.982300e-09  3.654316e-09  3.659387e-09  1.672228e-09  3.280728e-09   
50%    1.433972e-08  1.844745e-08  1.853910e-08  9.227183e-09  1.578297e-08   
75%    6.167204e-08  7.702205e-08  7.899803e-08  4.131010e-08  6.518106e-08   
max    2.349492e-01  1.705114e-01  1.736258e-01  2.286116e-01  1.664810e-01   

               Q4_1        Q4_K_M        Q4_K_S          Q5_0          Q5_1  \
count  2.018200e+04  2.018200e+04  2.018200e+04  2.018200e+04  2.018200e+04   
mean   1.398006e-05  1.436849e-05  1.389497e-05  1.402424e-05  1.420572e-05   
std    1.056182e-03  1.009722e-0

In [None]:
def visualize_threshold_counts(agg_table_kw, agg_table_st, output_dir='plots', figsize=(12, 6), dpi=300):
    """
    Визуализирует количество ключевых слов и стоп-токенов, необходимых для достижения заданных порогов.

    Args:
        agg_table_kw: DataFrame с данными по ключевым словам.
        agg_table_st: DataFrame с данными по стоп-токенам.
        output_dir: Директория для сохранения графиков.
        figsize: Размер графика.
        dpi: Разрешение графиков.
    """

    import os
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    thresholds = [0.80, 0.85, 0.90, 0.95]
    results = {}

    for column in agg_table_kw.columns:
        if column == "word":
            continue

        kw_sorted = agg_table_kw[column].sort_values(ascending=False)
        kw_total = kw_sorted.sum()
        kw_cumsum = kw_sorted.cumsum()

        st_sorted = agg_table_st[column].sort_values(ascending=False)
        st_total = st_sorted.sum()
        st_cumsum = st_sorted.cumsum()

        results[column] = {}
        for threshold in thresholds:
            kw_count = len(kw_cumsum[kw_cumsum <= kw_total * threshold])
            st_count = len(st_cumsum[st_cumsum <= st_total * threshold])
            results[column][threshold] = (kw_count, st_count)

    # Преобразование данных для визуализации
    plot_data = []
    for model, threshold_data in results.items():
        model_splitted = model.strip().split("_")
        model_short_name = "_".join(model_splitted[1:])
        for threshold, counts in threshold_data.items():
            plot_data.append({'Model': model_short_name, 'Threshold': threshold, 'Type': 'Keywords', 'Count': counts[0]})
            plot_data.append({'Model': model_short_name, 'Threshold': threshold, 'Type': 'Stop Tokens', 'Count': counts[1]})

    plot_df = pd.DataFrame(plot_data)

    plt.figure(figsize=figsize)
    sns.lineplot(x='Model', y='Count', hue='Threshold', style='Type', data=plot_df, marker='o', palette="viridis")
    plt.title('Number of Words Needed for Coverage Thresholds')
    plt.ylabel('Number of Words')
    plt.xlabel('Model')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Threshold & Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(axis='y')
    plt.tight_layout()
    plt.savefig(f'{output_dir}/threshold_counts.png', dpi=dpi, bbox_inches='tight')
    plt.close()

In [69]:
visualize_threshold_counts(agg_table_kw, agg_table_st, output_dir="plots/plots_statistics")

# Count statistics

In [58]:
print("Keywords:")

for column in agg_table_kw.columns:
    if column == "word":
        pass
    else:
        sum = agg_table_kw[column].sum()
        print(f"Sum of {column}: {sum}")

Keywords:
Sum of Probability_Q2_K: 0.09021144585886505
Sum of Probability_Q3_K_L: 0.11147905651749923
Sum of Probability_Q3_K_M: 0.1198266802672091
Sum of Probability_Q3_K_S: 0.09470361686405984
Sum of Probability_Q4_0: 0.0873039841653881
Sum of Probability_Q4_1: 0.09847236640733203
Sum of Probability_Q4_K_M: 0.10679272272376181
Sum of Probability_Q4_K_S: 0.12311314908567689
Sum of Probability_Q5_0: 0.10809954765687871
Sum of Probability_Q5_1: 0.11083425194076622
Sum of Probability_Q5_K_M: 0.10934886521056461
Sum of Probability_Q5_K_S: 0.1119898426579443
Sum of Probability_Q6_K: 0.10828486133242153
Sum of Probability_Q8_0: 0.10699541246200711


In [59]:
print("Stop tokens:")
for column in agg_table_st.columns:
    if column == "word":
        pass
    else:
        sum = agg_table_st[column].sum()
        print(f"Sum of {column}: {sum}")

Stop tokens:
Sum of Probability_Q2_K: 0.4247777722735203
Sum of Probability_Q3_K_L: 0.32888808839478767
Sum of Probability_Q3_K_M: 0.3319767789731372
Sum of Probability_Q3_K_S: 0.4146691692374943
Sum of Probability_Q4_0: 0.33526685640710074
Sum of Probability_Q4_1: 0.28214559009182205
Sum of Probability_Q4_K_M: 0.28998482613477405
Sum of Probability_Q4_K_S: 0.280428265159965
Sum of Probability_Q5_0: 0.28303720449892195
Sum of Probability_Q5_1: 0.286699830710974
Sum of Probability_Q5_K_M: 0.2928718374962185
Sum of Probability_Q5_K_S: 0.2928412809979311
Sum of Probability_Q6_K: 0.3011303321616742
Sum of Probability_Q8_0: 0.3025210360431168


In [60]:
print("\nDistribution analysis for keywords:")
for column in agg_table_kw.columns:
    if column == "word":
        continue

    sorted_values = agg_table_kw[column].sort_values(ascending=False)
    total_sum = sorted_values.sum()
    
    cumsum = sorted_values.cumsum()
    
    thresholds = [0.80, 0.85, 0.90, 0.95]
    word_counts = {}
    
    for threshold in thresholds:
        word_count = len(cumsum[cumsum <= total_sum * threshold])
        word_counts[threshold] = word_count
    
    print(f"\n{column}:")
    for threshold, count in word_counts.items():
        print(f"{threshold*100}% of probability is contained in top {count} words")
    
    print(f"\nTop 10 words for {column}:")
    top_10 = agg_table_kw.nlargest(10, column)[['word', column]]
    for _, row in top_10.iterrows():
        print(f"{row['word']}: {row[column]:.6f}")


Distribution analysis for keywords:

Probability_Q2_K:
80.0% of probability is contained in top 6 words
85.0% of probability is contained in top 8 words
90.0% of probability is contained in top 11 words
95.0% of probability is contained in top 16 words

Top 10 words for Probability_Q2_K:
import: 0.030746
package: 0.013987
def: 0.008795
class: 0.007184
const: 0.006778
from: 0.003797
public: 0.003417
void: 0.001954
let: 0.001713
private: 0.001347

Probability_Q3_K_L:
80.0% of probability is contained in top 6 words
85.0% of probability is contained in top 8 words
90.0% of probability is contained in top 10 words
95.0% of probability is contained in top 15 words

Top 10 words for Probability_Q3_K_L:
import: 0.040127
package: 0.025250
from: 0.006150
class: 0.005842
def: 0.005638
public: 0.004870
const: 0.003912
void: 0.002675
var: 0.002258
using: 0.002015

Probability_Q3_K_M:
80.0% of probability is contained in top 6 words
85.0% of probability is contained in top 7 words
90.0% of probabi

In [61]:
print("\nDistribution analysis for stop tokens:")
for column in agg_table_st.columns:
    if column == "word":
        continue

    sorted_values = agg_table_st[column].sort_values(ascending=False)
    total_sum = sorted_values.sum()
    
    cumsum = sorted_values.cumsum()
    
    thresholds = [0.80, 0.85, 0.90, 0.95]
    word_counts = {}
    
    for threshold in thresholds:
        word_count = len(cumsum[cumsum <= total_sum * threshold])
        word_counts[threshold] = word_count
    
    print(f"\n{column}:")
    for threshold, count in word_counts.items():
        print(f"{threshold*100}% of probability is contained in top {count} words")
    
    print(f"\nTop 10 words for {column}:")
    top_10 = agg_table_st.nlargest(10, column)[['word', column]]
    for _, row in top_10.iterrows():
        print(f"{row['word']}: {row[column]:.6f}")


Distribution analysis for stop tokens:

Probability_Q2_K:
80.0% of probability is contained in top 3 words
85.0% of probability is contained in top 5 words
90.0% of probability is contained in top 9 words
95.0% of probability is contained in top 19 words

Top 10 words for Probability_Q2_K:
**: 0.234949
#: 0.091549
//: 0.013195
<: 0.010633
#include: 0.008356
/*Ċ: 0.007338
/*: 0.006158
""": 0.004892
##: 0.004844
#!/: 0.004211

Probability_Q3_K_L:
80.0% of probability is contained in top 6 words
85.0% of probability is contained in top 8 words
90.0% of probability is contained in top 12 words
95.0% of probability is contained in top 27 words

Top 10 words for Probability_Q3_K_L:
**: 0.170511
#: 0.047560
//: 0.015621
/*Ċ: 0.011158
#include: 0.009042
#!/: 0.008062
/**Ċ: 0.006672
/*: 0.006418
<: 0.006261
<?: 0.005300

Probability_Q3_K_M:
80.0% of probability is contained in top 5 words
85.0% of probability is contained in top 8 words
90.0% of probability is contained in top 12 words
95.0% o

In [62]:
print("\nComparative analysis of word counts needed for thresholds.")
print("\t\t\tFormat:\nthreshold: keywords count / stop tokens count")

for column in agg_table_kw.columns:
    if column == "word":
        continue
        
    kw_sorted = agg_table_kw[column].sort_values(ascending=False)
    kw_total = kw_sorted.sum()
    kw_cumsum = kw_sorted.cumsum()
    
    st_sorted = agg_table_st[column].sort_values(ascending=False)
    st_total = st_sorted.sum()
    st_cumsum = st_sorted.cumsum()
    
    thresholds = [0.80, 0.85, 0.90, 0.95]
    print(f"\n{column}:")
    
    for threshold in thresholds:
        kw_count = len(kw_cumsum[kw_cumsum <= kw_total * threshold])
        st_count = len(st_cumsum[st_cumsum <= st_total * threshold])
        print(f"{threshold*100}%: {kw_count} / {st_count}")


Comparative analysis of word counts needed for thresholds.
			Format:
threshold: keywords count / stop tokens count

Probability_Q2_K:
80.0%: 6 / 3
85.0%: 8 / 5
90.0%: 11 / 9
95.0%: 16 / 19

Probability_Q3_K_L:
80.0%: 6 / 6
85.0%: 8 / 8
90.0%: 10 / 12
95.0%: 15 / 27

Probability_Q3_K_M:
80.0%: 6 / 5
85.0%: 7 / 8
90.0%: 10 / 12
95.0%: 15 / 27

Probability_Q3_K_S:
80.0%: 6 / 3
85.0%: 7 / 5
90.0%: 10 / 8
95.0%: 14 / 15

Probability_Q4_0:
80.0%: 6 / 6
85.0%: 8 / 8
90.0%: 11 / 12
95.0%: 16 / 26

Probability_Q4_1:
80.0%: 6 / 6
85.0%: 7 / 8
90.0%: 10 / 13
95.0%: 15 / 28

Probability_Q4_K_M:
80.0%: 6 / 6
85.0%: 7 / 9
90.0%: 10 / 13
95.0%: 15 / 29

Probability_Q4_K_S:
80.0%: 5 / 6
85.0%: 7 / 9
90.0%: 9 / 13
95.0%: 15 / 31

Probability_Q5_0:
80.0%: 6 / 7
85.0%: 7 / 9
90.0%: 10 / 13
95.0%: 15 / 31

Probability_Q5_1:
80.0%: 6 / 7
85.0%: 7 / 9
90.0%: 10 / 14
95.0%: 15 / 31

Probability_Q5_K_M:
80.0%: 6 / 6
85.0%: 7 / 9
90.0%: 10 / 13
95.0%: 15 / 28

Probability_Q5_K_S:
80.0%: 6 / 6
85.0%: 7 / 9
90