In [None]:
import pandas as pd
import os 
# Switch the working directory to the analysis folder under the user's home
# directory, so the relative CSV path used when loading the data resolves.
home_dir = os.path.expanduser("~")
desktop_path = os.path.join(
    home_dir,
    "llm-justification-evaluation",
    "Data_cleaning_cosine_calculation_semantic_and_analysis",
)
os.chdir(desktop_path)

In [3]:
# Load the analysis table, rename the "essay" column to "evaluation", and keep
# only that column plus the four model reasoning columns.
kept_columns = [
    "evaluation",
    "deepseek-r1:1.5b_reasoning",
    "deepseek-r1:14b_reasoning",
    "qwen2.5:1.5b_reasoning",
    "qwen2.5:14b_reasoning",
]
essay_evaluation_data = (
    pd.read_csv("NLP_analysis/essay_evaluation_analysis.csv")
    .rename(columns={"essay": "evaluation"})
)[kept_columns]
essay_evaluation_data

Unnamed: 0,evaluation,deepseek-r1:1.5b_reasoning,deepseek-r1:14b_reasoning,qwen2.5:1.5b_reasoning,qwen2.5:14b_reasoning
0,"In the modern society, people’s work clothes h...",```markdown\n## Task Achievement \nThe essay ...,## Task Achievement\n- The candidate presents ...,## Task Achievement\n\nThe candidate successfu...,## Task Achievement\n- The candidate addresses...
1,Robotic technology is relatively new in our li...,**Feedback and Additional Comments**\n\n**Stre...,## Task Achievement\n- The candidate addresses...,## Task Achievement\n\nThe candidate addresses...,## Task Achievement\n- The candidate partially...
2,Advertisements take up more and more space in ...,**Evaluation Report**\n\n- **Task Achievement:...,## Task Achievement\n- The candidate presents ...,## Task Achievement\n\nThe candidate addresses...,## Task Achievement\n- The candidate addresses...
3,"Currently, there are raging debates about whet...",**Task Achievement**: \nThe essay effectively...,## Task Achievement\n- **Strengths:** The essa...,## Task Achievement\n\n- **Evaluation:** The c...,## Task Achievement\n- The candidate addresses...
4,"After completing school life, all teenagers st...",**Evaluation of the Essay:**\n\n- **Task Achie...,## Task Achievement\n- The candidate addresses...,## Task Achievement\n\nThe candidate covers bo...,## Task Achievement\n- The candidate discusses...
...,...,...,...,...,...
495,"Currently, more and more people are moving tow...",```markdown\n## Task Achievement \n- The essa...,## Task Achievement\n- The candidate addresses...,### Task Achievement\n\nThe candidate addresse...,## Task Achievement\n- The candidate addresses...
496,As we can see that people are fond of the stor...,```markdown\n## Task Achievement\n\n- The essa...,## Task Achievement\n- The candidate addresses...,## Task Achievement\n\nThe candidate addresses...,## Task Achievement\n- The candidate addresses...
497,While it is considered by some people that get...,**Task Achievement** \nThe essay successfully...,## Task Achievement\nThe candidate addresses b...,## Task Achievement\n\n- The candidate address...,## Task Achievement\n- The candidate adequatel...
498,Single-sex education is an aproach that is gai...,The essay effectively addresses all aspects of...,## Task Achievement\n- The candidate addresses...,## Task Achievement\n\nThe candidate successfu...,## Task Achievement\n- The essay broadly addre...


TF-IDF

In [4]:
# Build one flat corpus, column after column: all "evaluation" texts first,
# then every reasoning text of each model in turn. The TF-IDF rows are later
# split back into per-column groups relying on exactly this order.
corpus_columns = [
    "evaluation",
    "deepseek-r1:1.5b_reasoning",
    "deepseek-r1:14b_reasoning",
    "qwen2.5:1.5b_reasoning",
    "qwen2.5:14b_reasoning",
]
corpus = [
    document
    for column in corpus_columns
    for document in essay_evaluation_data[column].tolist()
]

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
## TF-IDF over word unigrams and bigrams, with English stop words removed.
##   min_df=0.01       -> ignore terms that appear in fewer than 1% of the documents
##   max_df=0.75       -> ignore terms that appear in more than 75% of the documents
##   sublinear_tf=True -> sublinear term-frequency scaling, i.e. tf becomes 1 + log(tf)
vectorizer = TfidfVectorizer(
    analyzer='word',
    min_df=0.01,
    max_df=0.75,
    ngram_range=(1, 2),
    stop_words='english',
    sublinear_tf=True,
)
## A single vectorizer is fitted on the whole corpus, so all texts share one vocabulary.
X = vectorizer.fit_transform(corpus)

In [6]:
# The corpus was built as five consecutive groups of n documents each
# (evaluation texts, then one group per model), so X is cut back into
# five row blocks of n rows in that same order.
n = len(essay_evaluation_data)
Solution, deepseek_r1_1_5b, deepseek_r1_14b, qwen2_5_1_5b, qwen2_5_14b = (
    X[block * n:(block + 1) * n] for block in range(5)
)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# For each model: similarity of every evaluation vector (rows) against every
# reasoning vector of that model (columns).
(
    deepseek_r1_1_5b_similarity,
    deepseek_r1_14b_similarity,
    qwen2_5_1_5b_similarity,
    qwen2_5_14b_similarity,
) = (
    cosine_similarity(Solution, model_vectors)
    for model_vectors in (deepseek_r1_1_5b, deepseek_r1_14b, qwen2_5_1_5b, qwen2_5_14b)
)

In [8]:
## cosine_similarity returns a matrix whose cell (i, j) holds the similarity between the
## i-th solution and the j-th reasoning. Only the paired values (i, i) are wanted, i.e.
## the diagonal of each matrix.
## Each name is overwritten with its own diagonal, so without the ndim check a second run
## of this cell would call .diagonal() on a 1-D array and raise ValueError. Arrays that
## are already 1-D are therefore left untouched, which makes the cell safe to re-run.
(
    deepseek_r1_1_5b_similarity,
    deepseek_r1_14b_similarity,
    qwen2_5_1_5b_similarity,
    qwen2_5_14b_similarity,
) = (
    scores.diagonal() if scores.ndim == 2 else scores
    for scores in (
        deepseek_r1_1_5b_similarity,
        deepseek_r1_14b_similarity,
        qwen2_5_1_5b_similarity,
        qwen2_5_14b_similarity,
    )
)

In [9]:
# Average paired cosine similarity per model.
(
    mean_deepseek_r1_1_5b_similarity,
    mean_deepseek_r1_14b_similarity,
    mean_qwen2_5_1_5b_similarity,
    mean_qwen2_5_14b_similarity,
) = [
    scores.mean()
    for scores in (
        deepseek_r1_1_5b_similarity,
        deepseek_r1_14b_similarity,
        qwen2_5_1_5b_similarity,
        qwen2_5_14b_similarity,
    )
]

In [10]:
# Print a heading line followed by the mean similarity for each model.
for heading, scores in (
    ("DeepSeek R1 1.5B Similarity:", deepseek_r1_1_5b_similarity),
    ("DeepSeek R1 14B Similarity:", deepseek_r1_14b_similarity),
    ("Qwen2.5 1.5B Similarity:", qwen2_5_1_5b_similarity),
    ("Qwen2.5 14B Similarity:", qwen2_5_14b_similarity),
):
    print(heading)
    print(scores.mean(axis=0))

DeepSeek R1 1.5B Similarity:
0.1816210752921698
DeepSeek R1 14B Similarity:
0.17547410338860242
Qwen2.5 1.5B Similarity:
0.2500555101332525
Qwen2.5 14B Similarity:
0.25314506202571957


In [11]:
import numpy as np


# Summary statistics of the paired cosine similarities, one row per model.
# np.std is called with its defaults (ddof=0, i.e. population standard deviation).
similarity_by_model = {
    "DeepSeek R1 1.5B": deepseek_r1_1_5b_similarity,
    "DeepSeek R1 14B": deepseek_r1_14b_similarity,
    "Qwen2.5 1.5B": qwen2_5_1_5b_similarity,
    "Qwen2.5 14B": qwen2_5_14b_similarity,
}

stats = {
    "Model": list(similarity_by_model),
    "Mean": [np.mean(scores) for scores in similarity_by_model.values()],
    "Median": [np.median(scores) for scores in similarity_by_model.values()],
    "Std": [np.std(scores) for scores in similarity_by_model.values()],
}

stats_df = pd.DataFrame(stats)
print(stats_df)

              Model      Mean    Median       Std
0  DeepSeek R1 1.5B  0.181621  0.170951  0.092287
1   DeepSeek R1 14B  0.175474  0.167578  0.076222
2      Qwen2.5 1.5B  0.250056  0.248039  0.103278
3       Qwen2.5 14B  0.253145  0.251845  0.076693


N-GRAM OVERLAP

In [12]:
### BUILD FUNCTION FOR N-GRAM OVERLAP ###

def ngrams(text, n):
    """Return the set of distinct word n-grams found in ``text``.

    The text is lower-cased and split on whitespace; every run of ``n``
    consecutive tokens is joined with single spaces. A text with fewer
    than ``n`` tokens yields an empty set.
    """
    words = text.lower().split()
    window_count = len(words) - n + 1
    return {' '.join(words[start:start + n]) for start in range(window_count)}

def ngram_overlap(text1, text2, n):
    """Return the Jaccard overlap between the word n-gram sets of two texts.

    The score is ``|A & B| / |A | B|`` where A and B are the n-gram sets
    produced by ``ngrams``. When neither text yields any n-gram the
    result is 0.0.
    """
    first = ngrams(text1, n)
    second = ngrams(text2, n)
    combined = first | second

    # Guard against dividing by zero when both sets are empty.
    if not combined:
        return 0.0
    return len(first & second) / len(combined)

In [13]:
# Word n-gram order used for the overlap (3 = trigrams; 2 or 1 can be used instead).
# NOTE: this rebinds `n`, which the TF-IDF slicing cell used for the number of rows.
n = 3

# One overlap column per model: evaluation text vs. that model's reasoning text.
for overlap_column, reasoning_column in [
    ('deepseek_r1_1_5b_ngram_overlap', 'deepseek-r1:1.5b_reasoning'),
    ('deepseek_r1_14b_ngram_overlap', 'deepseek-r1:14b_reasoning'),
    ('qwen2_5_1_5b_ngram_overlap', 'qwen2.5:1.5b_reasoning'),
    ('qwen2_5_14b_ngram_overlap', 'qwen2.5:14b_reasoning'),
]:
    essay_evaluation_data[overlap_column] = essay_evaluation_data.apply(
        lambda row: ngram_overlap(row['evaluation'], row[reasoning_column], n),
        axis=1,
    )

In [14]:
# Average n-gram overlap per model.
(
    mean_deepseek_r1_1_5b_ngram,
    mean_deepseek_r1_14b_ngram,
    mean_qwen2_5_1_5b_ngram,
    mean_qwen2_5_14b_ngram,
) = (
    essay_evaluation_data[column].mean()
    for column in (
        'deepseek_r1_1_5b_ngram_overlap',
        'deepseek_r1_14b_ngram_overlap',
        'qwen2_5_1_5b_ngram_overlap',
        'qwen2_5_14b_ngram_overlap',
    )
)

In [15]:
# Report the mean n-gram overlap of each model, one line per model.
for label, mean_overlap in (
    ("Mean N-gram overlap (DeepSeek 1.5B):", mean_deepseek_r1_1_5b_ngram),
    ("Mean N-gram overlap (DeepSeek 14B):", mean_deepseek_r1_14b_ngram),
    ("Mean N-gram overlap (Qwen 1.5B):", mean_qwen2_5_1_5b_ngram),
    ("Mean N-gram overlap (Qwen 14B):", mean_qwen2_5_14b_ngram),
):
    print(label, mean_overlap)

Mean N-gram overlap (DeepSeek 1.5B): 0.004106528452132686
Mean N-gram overlap (DeepSeek 14B): 0.011819587859802052
Mean N-gram overlap (Qwen 1.5B): 0.023385773738184047
Mean N-gram overlap (Qwen 14B): 0.02652261858918125


METRICS / SCORES

In [16]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer
import sacrebleu

In [17]:
### BLEU SCORE ###

# Smoothing (NLTK's method4) so that n-gram orders with no match between the
# two texts do not collapse the sentence-level BLEU score.
smoothie = SmoothingFunction().method4


def compute_bleu(reference_text, candidate_text):
    """Return the smoothed sentence-level BLEU of a candidate against one reference.

    Both texts are lower-cased and split on whitespace. The reference tokens
    are wrapped in a list because ``sentence_bleu`` expects a list of references.
    """
    reference_tokens = reference_text.lower().split()
    candidate_tokens = candidate_text.lower().split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothie,
    )

In [18]:
### ROUGE-L ###

# A single scorer, built once and reused for every row: ROUGE-L only, with stemming enabled.
rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)


def compute_rouge(reference_text, candidate_text):
    """Return the ROUGE-L F-measure of ``candidate_text`` scored against ``reference_text``."""
    rouge_l = rouge_scorer_obj.score(reference_text, candidate_text)['rougeL']
    return rouge_l.fmeasure

In [19]:
### METEOR SCORE ###

import nltk
# Fetch the NLTK resources (WordNet and Open Multilingual WordNet 1.4)
# before METEOR is computed.
for resource_name in ('wordnet', 'omw-1.4'):
    nltk.download(resource_name)

def compute_meteor(reference_text, candidate_text):
    """Return the METEOR score of a candidate text against a single reference.

    Both texts are lower-cased and split on whitespace; the reference tokens
    are passed first and the candidate tokens second.
    """
    reference_tokens = reference_text.lower().split()
    candidate_tokens = candidate_text.lower().split()
    return single_meteor_score(reference_tokens, candidate_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\filip\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [20]:
# BLEU: one score column per model, evaluation text as reference and the
# model's reasoning as candidate.
for bleu_column, reasoning_column in [
    ('deepseek_r1_1_5b_bleu', 'deepseek-r1:1.5b_reasoning'),
    ('deepseek_r1_14b_bleu', 'deepseek-r1:14b_reasoning'),
    ('qwen2_5_1_5b_bleu', 'qwen2.5:1.5b_reasoning'),
    ('qwen2_5_14b_bleu', 'qwen2.5:14b_reasoning'),
]:
    essay_evaluation_data[bleu_column] = essay_evaluation_data.apply(
        lambda row: compute_bleu(row['evaluation'], row[reasoning_column]),
        axis=1,
    )

In [21]:
# ROUGE-L: one score column per model, evaluation text as reference and the
# model's reasoning as candidate.
for rouge_column, reasoning_column in [
    ('deepseek_r1_1_5b_rouge', 'deepseek-r1:1.5b_reasoning'),
    ('deepseek_r1_14b_rouge', 'deepseek-r1:14b_reasoning'),
    ('qwen2_5_1_5b_rouge', 'qwen2.5:1.5b_reasoning'),
    ('qwen2_5_14b_rouge', 'qwen2.5:14b_reasoning'),
]:
    essay_evaluation_data[rouge_column] = essay_evaluation_data.apply(
        lambda row: compute_rouge(row['evaluation'], row[reasoning_column]),
        axis=1,
    )

In [22]:
# METEOR: one score column per model, evaluation text as reference and the
# model's reasoning as candidate.
for meteor_column, reasoning_column in [
    ('deepseek_r1_1_5b_meteor', 'deepseek-r1:1.5b_reasoning'),
    ('deepseek_r1_14b_meteor', 'deepseek-r1:14b_reasoning'),
    ('qwen2_5_1_5b_meteor', 'qwen2.5:1.5b_reasoning'),
    ('qwen2_5_14b_meteor', 'qwen2.5:14b_reasoning'),
]:
    essay_evaluation_data[meteor_column] = essay_evaluation_data.apply(
        lambda row: compute_meteor(row['evaluation'], row[reasoning_column]),
        axis=1,
    )

In [23]:
# Report the mean BLEU score of each model.
for label, bleu_column in (
    ("Mean BLEU (DeepSeek 1.5B):", 'deepseek_r1_1_5b_bleu'),
    ("Mean BLEU (DeepSeek 14B):", 'deepseek_r1_14b_bleu'),
    ("Mean BLEU (Qwen 1.5B):", 'qwen2_5_1_5b_bleu'),
    ("Mean BLEU (Qwen 14B):", 'qwen2_5_14b_bleu'),
):
    print(label, essay_evaluation_data[bleu_column].mean())

Mean BLEU (DeepSeek 1.5B): 0.015933583409443414
Mean BLEU (DeepSeek 14B): 0.034129205373130064
Mean BLEU (Qwen 1.5B): 0.05320882775950697
Mean BLEU (Qwen 14B): 0.06209584941862815


In [24]:
# Report the mean ROUGE-L F-measure of each model.
for label, rouge_column in (
    ("Mean ROUGE-L DeepSeek_1.5:", 'deepseek_r1_1_5b_rouge'),
    ("Mean ROUGE-L DeepSeek_14B:", 'deepseek_r1_14b_rouge'),
    ("Mean ROUGE-L Qwen2.5_1.5B:", 'qwen2_5_1_5b_rouge'),
    ("Mean ROUGE-L Qwen2.5_14B:", 'qwen2_5_14b_rouge'),
):
    print(label, essay_evaluation_data[rouge_column].mean())

Mean ROUGE-L DeepSeek_1.5: 0.13674999925644266
Mean ROUGE-L DeepSeek_14B: 0.14175184120565326
Mean ROUGE-L Qwen2.5_1.5B: 0.16177874740158588
Mean ROUGE-L Qwen2.5_14B: 0.16552745435071858


In [25]:
# Report the mean METEOR score of each model.
for label, meteor_column in (
    ("Mean METEOR (DeepSeek 1.5B):", 'deepseek_r1_1_5b_meteor'),
    ("Mean METEOR (DeepSeek 14B):", 'deepseek_r1_14b_meteor'),
    ("Mean METEOR (Qwen 1.5B):", 'qwen2_5_1_5b_meteor'),
    ("Mean METEOR (Qwen 14B):", 'qwen2_5_14b_meteor'),
):
    print(label, essay_evaluation_data[meteor_column].mean())

Mean METEOR (DeepSeek 1.5B): 0.1535712812729544
Mean METEOR (DeepSeek 14B): 0.1774721050959949
Mean METEOR (Qwen 1.5B): 0.24090656224433413
Mean METEOR (Qwen 14B): 0.2297538118764793


In [26]:
# Summary table: one row per model, one column per metric, each cell being the
# mean of that metric over all rows. Every list follows the order of `models`.
metrics = {
    "Cosine Similarity": [
        scores.mean()
        for scores in (
            deepseek_r1_1_5b_similarity,
            deepseek_r1_14b_similarity,
            qwen2_5_1_5b_similarity,
            qwen2_5_14b_similarity,
        )
    ],
    "N-gram Overlap": [
        essay_evaluation_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_ngram_overlap',
            'deepseek_r1_14b_ngram_overlap',
            'qwen2_5_1_5b_ngram_overlap',
            'qwen2_5_14b_ngram_overlap',
        )
    ],
    "BLEU": [
        essay_evaluation_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_bleu',
            'deepseek_r1_14b_bleu',
            'qwen2_5_1_5b_bleu',
            'qwen2_5_14b_bleu',
        )
    ],
    "ROUGE-L": [
        essay_evaluation_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_rouge',
            'deepseek_r1_14b_rouge',
            'qwen2_5_1_5b_rouge',
            'qwen2_5_14b_rouge',
        )
    ],
    "METEOR": [
        essay_evaluation_data[column].mean()
        for column in (
            'deepseek_r1_1_5b_meteor',
            'deepseek_r1_14b_meteor',
            'qwen2_5_1_5b_meteor',
            'qwen2_5_14b_meteor',
        )
    ],
}

models = ["DeepSeek R1 1.5B", "DeepSeek R1 14B", "Qwen2.5 1.5B", "Qwen2.5 14B"]
metrics_df = pd.DataFrame(metrics, index=models)
print(metrics_df)

                  Cosine Similarity  N-gram Overlap      BLEU   ROUGE-L  \
DeepSeek R1 1.5B           0.181621        0.004107  0.015934  0.136750   
DeepSeek R1 14B            0.175474        0.011820  0.034129  0.141752   
Qwen2.5 1.5B               0.250056        0.023386  0.053209  0.161779   
Qwen2.5 14B                0.253145        0.026523  0.062096  0.165527   

                    METEOR  
DeepSeek R1 1.5B  0.153571  
DeepSeek R1 14B   0.177472  
Qwen2.5 1.5B      0.240907  
Qwen2.5 14B       0.229754  
