In [None]:
import pandas as pd
import os 
# Resolve the analysis folder under the current user's home directory and make
# it the working directory, so the relative paths used below resolve against it.
desktop_path = os.path.join(
    os.path.expanduser("~"),
    "llm-justification-evaluation",
    "Data_cleaning_cosine_calculation_semantic_and_analysis",
)
os.chdir(desktop_path)

In [2]:
# Load the proofs table; the path is relative to the working directory set above.
proofs_data = pd.read_csv(filepath_or_buffer='NLP_analysis/proofs_analysis.csv')


In [3]:
# Stack the reference solutions followed by each model's reasoning column into
# one flat list of documents. The column order matters: the rows of the fitted
# matrix are later recovered by position (n rows per column, in this order).
corpus = [
    document
    for column in (
        "solution",
        "deepseek-r1:1.5b_reasoning",
        "deepseek-r1:14b_reasoning",
        "qwen2.5:1.5b_reasoning",
        "qwen2.5:14b_reasoning",
    )
    for document in proofs_data[column].tolist()
]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
## Word-level TF-IDF over unigrams and bigrams with English stop words removed.
##   min_df=0.01       -> keep terms that appear in at least 1% of the documents
##   max_df=0.75       -> drop terms that appear in more than 75% of the documents
##   sublinear_tf=True -> scale term frequency as 1 + log(tf)
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
    max_df=0.75,
    sublinear_tf=True,
)
X = vectorizer.fit_transform(corpus)

In [5]:
# The corpus was built as five consecutive groups of n documents each
# (solutions first, then one group per model), so the k-th group occupies
# rows [k*n, (k+1)*n) of the TF-IDF matrix.
n = len(proofs_data)
(
    Solution,
    deepseek_r1_1_5b,
    deepseek_r1_14b,
    qwen2_5_1_5b,
    qwen2_5_14b,
) = (X[group * n:(group + 1) * n] for group in range(5))

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every solution row and every reasoning
# row of each model (one matrix per model).
(
    deepseek_r1_1_5b_similarity,
    deepseek_r1_14b_similarity,
    qwen2_5_1_5b_similarity,
    qwen2_5_14b_similarity,
) = (
    cosine_similarity(Solution, model_rows)
    for model_rows in (deepseek_r1_1_5b, deepseek_r1_14b, qwen2_5_1_5b, qwen2_5_14b)
)

In [7]:
## Cell (i, j) of each matrix holds the cosine similarity between the i-th
## solution and the j-th reasoning. Only matched pairs (i-th solution vs. i-th
## reasoning) are of interest, so keep just the main diagonal of each matrix.
(
    deepseek_r1_1_5b_similarity,
    deepseek_r1_14b_similarity,
    qwen2_5_1_5b_similarity,
    qwen2_5_14b_similarity,
) = (
    pairwise.diagonal()
    for pairwise in (
        deepseek_r1_1_5b_similarity,
        deepseek_r1_14b_similarity,
        qwen2_5_1_5b_similarity,
        qwen2_5_14b_similarity,
    )
)

In [8]:
# Average per-pair cosine similarity for each model.
(
    mean_deepseek_r1_1_5b_similarity,
    mean_deepseek_r1_14b_similarity,
    mean_qwen2_5_1_5b_similarity,
    mean_qwen2_5_14b_similarity,
) = (
    per_pair.mean()
    for per_pair in (
        deepseek_r1_1_5b_similarity,
        deepseek_r1_14b_similarity,
        qwen2_5_1_5b_similarity,
        qwen2_5_14b_similarity,
    )
)

In [9]:
# Report each model's mean similarity: label on one line, value on the next.
print("DeepSeek R1 1.5B Similarity:", deepseek_r1_1_5b_similarity.mean(axis=0), sep="\n")
print("DeepSeek R1 14B Similarity:", deepseek_r1_14b_similarity.mean(axis=0), sep="\n")
print("Qwen2.5 1.5B Similarity:", qwen2_5_1_5b_similarity.mean(axis=0), sep="\n")
print("Qwen2.5 14B Similarity:", qwen2_5_14b_similarity.mean(axis=0), sep="\n")

DeepSeek R1 1.5B Similarity:
0.14954747901631155
DeepSeek R1 14B Similarity:
0.26308490496110004
Qwen2.5 1.5B Similarity:
0.3137835634779472
Qwen2.5 14B Similarity:
0.3437737172755133


In [10]:
import numpy as np


# Summary statistics of the per-pair cosine similarities, one row per model.
# Each statistic column is produced by applying the same NumPy reducer to the
# four similarity arrays, in the same order as the "Model" labels.
stats = {
    "Model": [
        "DeepSeek R1 1.5B",
        "DeepSeek R1 14B",
        "Qwen2.5 1.5B",
        "Qwen2.5 14B",
    ],
    **{
        column: [
            reducer(scores)
            for scores in (
                deepseek_r1_1_5b_similarity,
                deepseek_r1_14b_similarity,
                qwen2_5_1_5b_similarity,
                qwen2_5_14b_similarity,
            )
        ]
        for column, reducer in (
            ("Mean", np.mean),
            ("Median", np.median),
            ("Std", np.std),
        )
    },
}

stats_df = pd.DataFrame(stats)
print(stats_df)

              Model      Mean    Median       Std
0  DeepSeek R1 1.5B  0.149547  0.102391  0.150340
1   DeepSeek R1 14B  0.263085  0.244341  0.185292
2      Qwen2.5 1.5B  0.313784  0.301753  0.160970
3       Qwen2.5 14B  0.343774  0.324102  0.169459


N-GRAM OVERLAP

In [11]:
### BUILD FUNCTION FOR N-GRAM OVERLAP ###

def ngrams(text, n):
    """Return the set of distinct word n-grams found in ``text``.

    The text is lower-cased and split on whitespace; every run of ``n``
    consecutive tokens is joined with a single space to form one n-gram.

    Parameters
    ----------
    text : str
        Text to tokenize.
    n : int
        Number of tokens per n-gram; must be at least 1.

    Returns
    -------
    set of str
        The unique n-grams. Empty when ``text`` has fewer than ``n`` tokens.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Without this check ``n == 0`` would yield
        ``{''}`` for every text (making any two texts look identical) and a
        negative ``n`` would produce meaningless slices.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    tokens = text.lower().split()
    return {' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def ngram_overlap(text1, text2, n):
    """Jaccard overlap between the word n-gram sets of two texts.

    Computes ``|A & B| / |A | B|`` where ``A`` and ``B`` are the n-gram sets
    returned by ``ngrams`` for ``text1`` and ``text2``. Returns ``0.0`` when
    neither text yields any n-gram.
    """
    grams_a, grams_b = ngrams(text1, n), ngrams(text2, n)
    combined = grams_a | grams_b
    if not combined:
        return 0.0
    return len(grams_a & grams_b) / len(combined)

In [12]:
# N-gram size used for the overlap (3 = trigrams; set to 2 or 1 for bigrams/unigrams).
n = 3

# One overlap score per row: reference solution vs. each model's reasoning.
proofs_data['deepseek_r1_1_5b_ngram_overlap'] = proofs_data.apply(
    lambda record: ngram_overlap(record['solution'], record['deepseek-r1:1.5b_reasoning'], n),
    axis=1,
)
proofs_data['deepseek_r1_14b_ngram_overlap'] = proofs_data.apply(
    lambda record: ngram_overlap(record['solution'], record['deepseek-r1:14b_reasoning'], n),
    axis=1,
)
proofs_data['qwen2_5_1_5b_ngram_overlap'] = proofs_data.apply(
    lambda record: ngram_overlap(record['solution'], record['qwen2.5:1.5b_reasoning'], n),
    axis=1,
)
proofs_data['qwen2_5_14b_ngram_overlap'] = proofs_data.apply(
    lambda record: ngram_overlap(record['solution'], record['qwen2.5:14b_reasoning'], n),
    axis=1,
)

In [13]:
# Mean n-gram overlap per model, taken over all rows.
(
    mean_deepseek_r1_1_5b_ngram,
    mean_deepseek_r1_14b_ngram,
    mean_qwen2_5_1_5b_ngram,
    mean_qwen2_5_14b_ngram,
) = (
    proofs_data[column].mean()
    for column in (
        'deepseek_r1_1_5b_ngram_overlap',
        'deepseek_r1_14b_ngram_overlap',
        'qwen2_5_1_5b_ngram_overlap',
        'qwen2_5_14b_ngram_overlap',
    )
)

In [14]:
# `!s` formats each value with str(), exactly as print() does for a positional argument.
print(f"Mean N-gram overlap (DeepSeek 1.5B): {mean_deepseek_r1_1_5b_ngram!s}")
print(f"Mean N-gram overlap (DeepSeek 14B): {mean_deepseek_r1_14b_ngram!s}")
print(f"Mean N-gram overlap (Qwen 1.5B): {mean_qwen2_5_1_5b_ngram!s}")
print(f"Mean N-gram overlap (Qwen 14B): {mean_qwen2_5_14b_ngram!s}")

Mean N-gram overlap (DeepSeek 1.5B): 0.00418743923482185
Mean N-gram overlap (DeepSeek 14B): 0.008996559533700156
Mean N-gram overlap (Qwen 1.5B): 0.009809463983717296
Mean N-gram overlap (Qwen 14B): 0.011914660031354668


METRICS/SCORES

In [15]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer
import sacrebleu

In [16]:
### BLEU SCORE ###

# Smoothing function (NLTK "method4") passed to sentence_bleu, so that a
# missing higher-order n-gram match does not drive the whole score to zero.
smoothie = SmoothingFunction().method4

def compute_bleu(reference_text, candidate_text):
    """Sentence-level BLEU of ``candidate_text`` against a single reference.

    Both texts are lower-cased and split on whitespace before scoring; the
    module-level ``smoothie`` is used as the smoothing function.
    """
    reference_tokens = reference_text.lower().split()
    candidate_tokens = candidate_text.lower().split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothie,
    )

In [17]:
### ROUGE-L ###

# Single shared scorer configured for ROUGE-L with stemming enabled.
rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer = True)

def compute_rouge(reference_text, candidate_text):
    """Return the ROUGE-L F-measure of ``candidate_text`` against ``reference_text``."""
    rouge_l = rouge_scorer_obj.score(reference_text, candidate_text)['rougeL']
    return rouge_l.fmeasure

In [18]:
### METEOR SCORE ###

import nltk
# Fetch the WordNet resources (no-op when they are already up to date).
nltk.download('wordnet')
nltk.download('omw-1.4')

def compute_meteor(reference_text, candidate_text):
    """METEOR score of ``candidate_text`` against ``reference_text``.

    Both texts are lower-cased and split on whitespace, because
    ``single_meteor_score`` is given token lists here rather than raw strings.
    """
    reference_tokens = reference_text.lower().split()
    candidate_tokens = candidate_text.lower().split()
    return single_meteor_score(reference_tokens, candidate_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
# BLEU: one score per row, reference solution vs. each model's reasoning.
proofs_data['deepseek_r1_1_5b_bleu'] = proofs_data.apply(
    lambda record: compute_bleu(record['solution'], record['deepseek-r1:1.5b_reasoning']),
    axis=1,
)
proofs_data['deepseek_r1_14b_bleu'] = proofs_data.apply(
    lambda record: compute_bleu(record['solution'], record['deepseek-r1:14b_reasoning']),
    axis=1,
)
proofs_data['qwen2_5_1_5b_bleu'] = proofs_data.apply(
    lambda record: compute_bleu(record['solution'], record['qwen2.5:1.5b_reasoning']),
    axis=1,
)
proofs_data['qwen2_5_14b_bleu'] = proofs_data.apply(
    lambda record: compute_bleu(record['solution'], record['qwen2.5:14b_reasoning']),
    axis=1,
)

In [20]:
# ROUGE-L: one F-measure per row, reference solution vs. each model's reasoning.
proofs_data['deepseek_r1_1_5b_rouge_l'] = proofs_data.apply(
    lambda record: compute_rouge(record['solution'], record['deepseek-r1:1.5b_reasoning']),
    axis=1,
)
proofs_data['deepseek_r1_14b_rouge_l'] = proofs_data.apply(
    lambda record: compute_rouge(record['solution'], record['deepseek-r1:14b_reasoning']),
    axis=1,
)
proofs_data['qwen2_5_1_5b_rouge_l'] = proofs_data.apply(
    lambda record: compute_rouge(record['solution'], record['qwen2.5:1.5b_reasoning']),
    axis=1,
)
proofs_data['qwen2_5_14b_rouge_l'] = proofs_data.apply(
    lambda record: compute_rouge(record['solution'], record['qwen2.5:14b_reasoning']),
    axis=1,
)

In [21]:
# METEOR: one score per row, reference solution vs. each model's reasoning.
proofs_data['deepseek_r1_1_5b_meteor'] = proofs_data.apply(
    lambda record: compute_meteor(record['solution'], record['deepseek-r1:1.5b_reasoning']),
    axis=1,
)
proofs_data['deepseek_r1_14b_meteor'] = proofs_data.apply(
    lambda record: compute_meteor(record['solution'], record['deepseek-r1:14b_reasoning']),
    axis=1,
)
proofs_data['qwen2_5_1_5b_meteor'] = proofs_data.apply(
    lambda record: compute_meteor(record['solution'], record['qwen2.5:1.5b_reasoning']),
    axis=1,
)
proofs_data['qwen2_5_14b_meteor'] = proofs_data.apply(
    lambda record: compute_meteor(record['solution'], record['qwen2.5:14b_reasoning']),
    axis=1,
)

In [22]:
# Mean BLEU per model; `!s` formats with str(), matching print()'s own formatting.
print(f"Mean BLEU DeepSeek_1.5: {proofs_data['deepseek_r1_1_5b_bleu'].mean()!s}")
print(f"Mean BLEU DeepSeek_14B: {proofs_data['deepseek_r1_14b_bleu'].mean()!s}")
print(f"Mean BLEU Qwen2.5_1.5: {proofs_data['qwen2_5_1_5b_bleu'].mean()!s}")
print(f"Mean BLEU Qwen2.5_14B: {proofs_data['qwen2_5_14b_bleu'].mean()!s}")

Mean BLEU DeepSeek_1.5: 0.013160347028055007
Mean BLEU DeepSeek_14B: 0.02192681210436644
Mean BLEU Qwen2.5_1.5: 0.025534973759111713
Mean BLEU Qwen2.5_14B: 0.030236821426794635


In [23]:
# Mean ROUGE-L per model; `!s` formats with str(), matching print()'s own formatting.
print(f"Mean ROUGE-L DeepSeek_1.5: {proofs_data['deepseek_r1_1_5b_rouge_l'].mean()!s}")
print(f"Mean ROUGE-L DeepSeek_14B: {proofs_data['deepseek_r1_14b_rouge_l'].mean()!s}")
print(f"Mean ROUGE-L Qwen2.5_1.5: {proofs_data['qwen2_5_1_5b_rouge_l'].mean()!s}")
print(f"Mean ROUGE-L Qwen2.5_14B: {proofs_data['qwen2_5_14b_rouge_l'].mean()!s}")

Mean ROUGE-L DeepSeek_1.5: 0.16572394760875064
Mean ROUGE-L DeepSeek_14B: 0.2265877833865113
Mean ROUGE-L Qwen2.5_1.5: 0.24604177001140334
Mean ROUGE-L Qwen2.5_14B: 0.26857285387175184


In [24]:
# Mean METEOR per model; `!s` formats with str(), matching print()'s own formatting.
print(f"Mean METEOR DeepSeek_1.5: {proofs_data['deepseek_r1_1_5b_meteor'].mean()!s}")
print(f"Mean METEOR DeepSeek_14B: {proofs_data['deepseek_r1_14b_meteor'].mean()!s}")
print(f"Mean METEOR Qwen2.5_1.5: {proofs_data['qwen2_5_1_5b_meteor'].mean()!s}")
print(f"Mean METEOR Qwen2.5_14B: {proofs_data['qwen2_5_14b_meteor'].mean()!s}")

Mean METEOR DeepSeek_1.5: 0.10160730757803033
Mean METEOR DeepSeek_14B: 0.1303263314967935
Mean METEOR Qwen2.5_1.5: 0.17410971440386738
Mean METEOR Qwen2.5_14B: 0.18064696154743887


In [25]:
# Row labels for the summary table, in the same model order used for every
# metric list below.
models = ["DeepSeek R1 1.5B", "DeepSeek R1 14B", "Qwen2.5 1.5B", "Qwen2.5 14B"]

# Per-model mean of every similarity metric computed above: one column per
# metric, one entry per model.
metrics = {
    "Cosine Similarity": [
        scores.mean()
        for scores in (
            deepseek_r1_1_5b_similarity,
            deepseek_r1_14b_similarity,
            qwen2_5_1_5b_similarity,
            qwen2_5_14b_similarity,
        )
    ],
    "N-gram Overlap": [
        proofs_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_ngram_overlap',
            'deepseek_r1_14b_ngram_overlap',
            'qwen2_5_1_5b_ngram_overlap',
            'qwen2_5_14b_ngram_overlap',
        )
    ],
    "BLEU": [
        proofs_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_bleu',
            'deepseek_r1_14b_bleu',
            'qwen2_5_1_5b_bleu',
            'qwen2_5_14b_bleu',
        )
    ],
    "ROUGE-L": [
        proofs_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_rouge_l',
            'deepseek_r1_14b_rouge_l',
            'qwen2_5_1_5b_rouge_l',
            'qwen2_5_14b_rouge_l',
        )
    ],
    "METEOR": [
        proofs_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_meteor',
            'deepseek_r1_14b_meteor',
            'qwen2_5_1_5b_meteor',
            'qwen2_5_14b_meteor',
        )
    ],
}

metrics_df = pd.DataFrame(metrics, index=models)
print(metrics_df)

                  Cosine Similarity  N-gram Overlap      BLEU   ROUGE-L  \
DeepSeek R1 1.5B           0.149547        0.004187  0.013160  0.165724   
DeepSeek R1 14B            0.263085        0.008997  0.021927  0.226588   
Qwen2.5 1.5B               0.313784        0.009809  0.025535  0.246042   
Qwen2.5 14B                0.343774        0.011915  0.030237  0.268573   

                    METEOR  
DeepSeek R1 1.5B  0.101607  
DeepSeek R1 14B   0.130326  
Qwen2.5 1.5B      0.174110  
Qwen2.5 14B       0.180647  
