In [22]:
import numpy as np
from sklearn.metrics import accuracy_score, log_loss

def classification_task(df):
    """
    Calculate cross-entropy loss and accuracy for a classification task.
    
    Parameters:d
    df (pandas.DataFrame): DataFrame with columns for ground truth class and predicted class.
    
    Returns:
    tuple: Cross-entropy loss, accuracy
    """
    y_true = df.iloc[:, 1].values
    y_pred = df.iloc[:, 2].values
    
    # Calculate cross-entropy loss
    # ce_loss = log_loss(y_true, y_pred)
    
    # Calculate accuracy
    acc = accuracy_score(y_true, np.round(y_pred))
    
    # return ce_loss, acc
    return acc

In [8]:
import numpy as np
from transformers import BertTokenizer, BertModel, pipeline
from nltk.translate.bleu_score import corpus_bleu
from datasets import load_metric
from transformers import LlamaForCausalLM, LlamaTokenizer

def generation_task(df):
    """
    Calculate evaluation metrics for a text generation task.
    
    Parameters:
    df (pandas.DataFrame): DataFrame with columns for original and generated text.
    
    Returns:
    dict: Dictionary containing BERT score, BLEU, GLUE, and perplexity.
    """
    original_texts = df.iloc[:, 0].tolist()
    generated_texts = df.iloc[:, 1].tolist()
    
    # Calculate BERT score
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    bert_scorer = pipeline('text-similarity', model=model, tokenizer=tokenizer)
    bert_score = np.mean([bert_scorer(orig, gen)['similarity'] for orig, gen in zip(original_texts, generated_texts)])
    
    # Calculate BLEU
    bleu = corpus_bleu([[ref] for ref in original_texts], [hyp for hyp in generated_texts])
    
    # Calculate GLUE
    glue_metric = load_metric('glue', 'stsb')
    glue_score = glue_metric.compute(predictions=generated_texts, references=original_texts)['pearson']
    
    # Calculate perplexity using LLaMA
    # llama_tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
    # llama_model = LlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")

    # def calculate_perplexity(text):
    #     input_ids = llama_tokenizer.encode(text, return_tensors='pt')
    #     with torch.no_grad():
    #         output = llama_model(input_ids, labels=input_ids)[0]
    #     return torch.exp(output).item()
    
    # perplexity = np.mean([calculate_perplexity(gen) for gen in generated_texts])
    
    return {
        'bert_score': bert_score,
        'bleu': bleu,
        'glue': glue_score,
        # 'perplexity': perplexity
    }

In [9]:
import pandas as pd
from ibm_API import get_response

In [10]:
generation_data_df = pd.read_json("test_gen.json", encoding="utf-8")
generation_data_df["MGT"] = "MGT"

In [11]:
generation_data_df

Unnamed: 0,input,output,MGT
0,\n أنت شاعر فصيح عليم بقواعد العروض وال...,دع الأمانيّ أو رُمهنّ من ظُبةٍ فإنما هنّ من غي...,MGT
1,\n أنت شاعر فصيح عليم بقواعد العروض وال...,لا بل هوَ النورُ أضحى يَـــدِبُّ فـــي الظــلم...,MGT
2,\n أنت شاعر فصيح عليم بقواعد العروض وال...,لما سلكوا بالهجرِ يوماً طريقتي سـيَـنْـدم بُـع...,MGT
3,\n أنت شاعر فصيح عليم بقواعد العروض وال...,ما أَنتَ بِالحَكَمِ الَّذي سُمِّيتَهُ غالَتكَ ...,MGT
4,\n أنت شاعر فصيح عليم بقواعد العروض وال...,إِذا المَدحُ زانَ فَتى مَعشَرٍ فَإِنَّ يَزيدَ ...,MGT
...,...,...,...
195,\n أنت شاعر فصيح عليم بقواعد العروض وال...,أَجْنَى إِلَيْهَا الرِّضَى جَنَانِي مِنَ الْمَ...,MGT
196,\n أنت شاعر فصيح عليم بقواعد العروض وال...,وَلَوى بِــقَــلبــي مُــذ لَوى أَصــداغــه وَ...,MGT
197,\n أنت شاعر فصيح عليم بقواعد العروض وال...,وأَرسلَتِ اللَّحظَ الضَّعيفَ مع الهَوى لِيَقوى...,MGT
198,\n أنت شاعر فصيح عليم بقواعد العروض وال...,ألا فـي سـبـيـل الله فـقـد أخى تقى دعـاه إلى ا...,MGT


In [12]:
for i in range(10):
    generation_data_df.iloc[i, 2] = get_response(generation_data_df.iloc[i, 0])

In [13]:
generation_data_df.iloc[10,0]

'\n        أنت شاعر فصيح عليم بقواعد العروض والقافية، تكتب الشعر ملتزما بها.\n        اكتب أبياتًا بعد هذا البيت، ملتزما بالبحر والقافية باحتراف:\n        \n                                        اكتب أبياتًا بعد هذا البيت: \n                                        إِن صَوَّروكَ فَإِنَّما قَد صَوَّروا تاجَ الفَخارِ وَمَطلَعَ الأَنوارِ\n                                        \n        '

# CLASSIFICATION

In [14]:
meters = ['الخفيف',
 'مجزوء الرمل',
 'البسيط',
 'الكامل',
 'الوافر',
 'الطويل',
 'السريع',
 'المنسرح',
 'مجزوء الكامل',
 'المجتث',
 'الرمل',
 'مجزوء الوافر',
 'المتقارب',
 'مخلع البسيط',
 'مجزوء الرجز',
 'مجزوء الخفيف',
 'الرجز',
 'المديد',
 'الهزج',
 'مجزوء البسيط',
 'منهوك المنسرح',
 'أحذ الكامل',
 'مشطور الرجز',
 'المضارع',
 'المقتضب',
 'مجزوء المتقارب',
 'مجزوء السريع',
 'منهوك الرجز']

## Base Model

## Base Model

In [15]:
classification_data_df = pd.read_json("test_cls_as_trained.json", encoding="utf-8")

In [16]:
# Save inputs to try later on other models:

# classification_data_df['input'].to_list()
with open('test_cls_for_other_LLMs.json', 'w', encoding='utf-8') as txt_file:
    for item in classification_data_df['input'].to_list():
        txt_file.write(f"{item}\n")

In [17]:
classification_data_df['Base Prediction'] = 'Pred'

In [18]:
classification_data_df.head()

Unnamed: 0,input,output,Base Prediction
0,ما هو البحر الشعري لهذا البيت؟ وطـال ليـلى ودم...,البسيط,Pred
1,ما هو البحر الشعري لهذا البيت؟ فَـلا تَـقـطَـع...,الطويل,Pred
2,ما هو البحر الشعري لهذا البيت؟ كـم عـزمـةٍ يُـ...,الكامل,Pred
3,ما هو البحر الشعري لهذا البيت؟ سَلِ الفَريقَ ا...,البسيط,Pred
4,ما هو البحر الشعري لهذا البيت؟ فـــولّى يُـــب...,الطويل,Pred


In [19]:
for i in range(len(classification_data_df)):
    classification_data_df.iloc[i,2] = get_response(classification_data_df.iloc[i,0])

Classifying 200 samples took 1m 11.0s --> 0.355s for each

In [24]:
classification_task(classification_data_df)

TypeError: loop of ufunc does not support argument 0 of type str which has no callable rint method

In [None]:
cls_df_base = classification_data_df.copy()

Sort them to avoid taking the least

In [None]:
meters_sorted = sorted(meters, key=len)

In [None]:
# for res in classification_data_df['Base Prediction'].values:
#     for meter in meters_sorted: 
#         if meter in res:
            

# cls_df_base.loc[cls_df_base['Base Prediction'] in , 'Base Prediction'] = cls_df_base['Base Res'] * 2
changed = []
for idx, res in enumerate(classification_data_df['Base Prediction'].values):
    for meter in meters_sorted:
        if meter in res:
            ch_dict = {'Index': idx, "Before": res, 'After': meter}
            changed.append(ch_dict)
            classification_data_df.at[idx, 'Base Prediction'] = meter
print(changed)
print(len(changed))
print(len(classification_data_df))

In [None]:
classification_data_df['Base Prediction'].values

Accuracy with Empty

In [None]:
matches = classification_data_df['output'] == classification_data_df['Base Prediction']
accuracy = matches.mean() * 100  
print(f"Accuracy: {accuracy:.2f}%")

Accuracy without Empty

In [None]:
cls_df_without_empty = classification_data_df[classification_data_df['Base Prediction'] != ' ']

In [None]:
import re

In [None]:
arabic_pattern = '[\u0621-\u064A]'

# Filter rows where 'Base Prediction' does not contain Arabic letters
cls_df_without_arabic = classification_data_df[classification_data_df['Base Prediction'].apply(lambda x: not bool(re.search(arabic_pattern, x)))]


In [None]:
matches = cls_df_without_empty['output'] == cls_df_without_empty['Base Prediction']
accuracy_without_empty = matches.mean() * 100  
print(f"Accuracy: {accuracy_without_empty:.2f}%")

## Base Model (Input refined for API)

In [6]:
classification_data_df = pd.read_json("test_cls_final.json", encoding="utf-8")

In [None]:
classification_data_df['Base Prediction'] = 'Pred'

In [None]:
classification_data_df.head()

In [None]:
for i in range(len(classification_data_df)):
    classification_data_df.iloc[i,2] = get_response(classification_data_df.iloc[i,0])

Classifying 200 samples took 1m 37.3s --> 0.355s for each

In [None]:
cls_df_base = classification_data_df.copy()

Sort them to avoid taking the least

In [None]:
meters_sorted = sorted(meters, key=len)

In [None]:
# for res in classification_data_df['Base Prediction'].values:
#     for meter in meters_sorted: 
#         if meter in res:
            

# cls_df_base.loc[cls_df_base['Base Prediction'] in , 'Base Prediction'] = cls_df_base['Base Res'] * 2
changed = []
for idx, res in enumerate(classification_data_df['Base Prediction'].values):
    for meter in meters_sorted:
        if meter in res:
            ch_dict = {'Index': idx, "Before": res, 'After': meter}
            changed.append(ch_dict)
            classification_data_df.at[idx, 'Base Prediction'] = meter
print(changed)
print(len(changed))
print(len(classification_data_df))

In [None]:
classification_data_df['Base Prediction'].values

Accuracy with Empty

In [None]:
matches = classification_data_df['output'] == classification_data_df['Base Prediction']
accuracy = matches.mean() * 100  
print(f"Accuracy: {accuracy:.2f}%")

Accuracy without Empty

In [None]:
cls_df_without_empty = classification_data_df[classification_data_df['Base Prediction'] != ' ']

In [None]:
import re

In [None]:
arabic_pattern = '[\u0621-\u064A]'

# Filter rows where 'Base Prediction' does not contain Arabic letters
cls_df_without_arabic = classification_data_df[classification_data_df['Base Prediction'].apply(lambda x: not bool(re.search(arabic_pattern, x)))]


In [None]:
matches = cls_df_without_empty['output'] == cls_df_without_empty['Base Prediction']
accuracy_without_empty = matches.mean() * 100  
print(f"Accuracy: {accuracy_without_empty:.2f}%")

## Fine-tuned Results

## Fine-tuned Results

In [None]:
with open('results_cls_formatted.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()  
    
fine_tuned_preds =  [line.strip() for line in lines]

print(lines)

In [None]:
fine_tuned_preds = [
    ' '.join(line.replace('\n', '').split()) for line in fine_tuned_preds
]

print(fine_tuned_preds)

In [None]:
import re

In [None]:
fine_tuned_preds = [re.sub(r'[^\u0600-\u06FF]+', '', pred).strip() for pred in fine_tuned_preds]

In [None]:
for p in fine_tuned_preds:
    print(p)

In [None]:
classification_data_df["Fine-Tuning Prediction"] = fine_tuned_preds

In [None]:
classification_data_df[classification_data_df['Fine-Tuning Prediction'] == ''].count()

Calc

In [None]:
matches = classification_data_df['output'] == classification_data_df['Fine-Tuning Prediction']
accuracy = matches.mean() * 100  
print(f"Accuracy: {accuracy:.2f}%")

wrong_rows_df = classification_data_df[~matches]
print("Rows where col1 and col2 don't match:")
print(wrong_rows_df)

In [None]:
cls_df_without_empty = classification_data_df[classification_data_df['Fine-Tuning Prediction'] != '']

In [None]:
matches_without_empty = cls_df_without_empty['output'] == cls_df_without_empty['Fine-Tuning Prediction']
accuracy_without_empty  = matches_without_empty .mean() * 100  
print(f"Accuracy: {accuracy_without_empty:.2f}%")

# wrong_rows_df = cls_df_without_empty[~matches]
# print("Rows where col1 and col2 don't match:")
# print(wrong_rows_df)

In [None]:
cls_df_base['output'].unique()

In [None]:
fine_tuned_preds = [
    ' '.join(line.replace('\n', '').split()) for line in fine_tuned_preds
]

print(fine_tuned_preds)

In [None]:
import re

In [None]:
fine_tuned_preds = [re.sub(r'[^\u0600-\u06FF]+', '', pred).strip() for pred in fine_tuned_preds]

In [None]:
for p in fine_tuned_preds:
    print(p)

In [None]:
classification_data_df["Fine-Tuning Prediction"] = fine_tuned_preds

In [None]:
classification_data_df[classification_data_df['Fine-Tuning Prediction'] == ''].count()

Calc

In [None]:
matches = classification_data_df['output'] == classification_data_df['Fine-Tuning Prediction']
accuracy = matches.mean() * 100  
print(f"Accuracy: {accuracy:.2f}%")

wrong_rows_df = classification_data_df[~matches]
print("Rows where col1 and col2 don't match:")
print(wrong_rows_df)

In [None]:
cls_df_without_empty = classification_data_df[classification_data_df['Fine-Tuning Prediction'] != '']

In [None]:
matches_without_empty = cls_df_without_empty['output'] == cls_df_without_empty['Fine-Tuning Prediction']
accuracy_without_empty  = matches_without_empty .mean() * 100  
print(f"Accuracy: {accuracy_without_empty:.2f}%")

# wrong_rows_df = cls_df_without_empty[~matches]
# print("Rows where col1 and col2 don't match:")
# print(wrong_rows_df)

In [None]:
cls_df_base['output'].unique()

## 4o Results: 

In [1]:
with open('GPT_4o_Cls.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()  
    
gpt_preds =  [line.strip() for line in lines]


In [11]:
classification_data_df_100 = classification_data_df.head(100)

classification_data_df_100["GPT_4o Prediction"] = gpt_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classification_data_df_100["GPT_4o Prediction"] = gpt_preds


Calc

In [12]:
matches = classification_data_df_100['output'] == classification_data_df_100['GPT_4o Prediction']
accuracy = matches.mean() * 100  
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 19.00%


In [None]:
cls_df_without_empty = classification_data_df[classification_data_df['Fine-Tuning Prediction'] != '']

In [None]:
matches_without_empty = cls_df_without_empty['output'] == cls_df_without_empty['Fine-Tuning Prediction']
accuracy_without_empty  = matches_without_empty .mean() * 100  
print(f"Accuracy: {accuracy_without_empty:.2f}%")

# wrong_rows_df = cls_df_without_empty[~matches]
# print("Rows where col1 and col2 don't match:")
# print(wrong_rows_df)

In [None]:
cls_df_base['output'].unique()

In [None]:
fine_tuned_preds = [
    ' '.join(line.replace('\n', '').split()) for line in fine_tuned_preds
]

print(fine_tuned_preds)

In [None]:
import re

In [None]:
fine_tuned_preds = [re.sub(r'[^\u0600-\u06FF]+', '', pred).strip() for pred in fine_tuned_preds]

In [None]:
for p in fine_tuned_preds:
    print(p)

In [None]:
classification_data_df["Fine-Tuning Prediction"] = fine_tuned_preds

In [None]:
classification_data_df[classification_data_df['Fine-Tuning Prediction'] == ''].count()

Calc

In [None]:
matches = classification_data_df['output'] == classification_data_df['Fine-Tuning Prediction']
accuracy = matches.mean() * 100  
print(f"Accuracy: {accuracy:.2f}%")

wrong_rows_df = classification_data_df[~matches]
print("Rows where col1 and col2 don't match:")
print(wrong_rows_df)

In [None]:
cls_df_without_empty = classification_data_df[classification_data_df['Fine-Tuning Prediction'] != '']

In [None]:
matches_without_empty = cls_df_without_empty['output'] == cls_df_without_empty['Fine-Tuning Prediction']
accuracy_without_empty  = matches_without_empty .mean() * 100  
print(f"Accuracy: {accuracy_without_empty:.2f}%")

# wrong_rows_df = cls_df_without_empty[~matches]
# print("Rows where col1 and col2 don't match:")
# print(wrong_rows_df)

In [None]:
cls_df_base['output'].unique()