In [11]:
from IPython.display import clear_output

In [1]:
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from ibm_watsonx_ai.foundation_models import Model

In [5]:
# Labelled corpus; later cells read its 'AuthorName' and 'ChapterText' columns.
# NOTE(review): the variable is named `test_df` but the file is 'train_data.csv' —
# confirm this is the intended split before reporting evaluation numbers.
test_df = pd.read_csv('train_data.csv')

In [6]:
# Class indices are assigned by the sorted order of the distinct author names,
# so the mapping is stable for a given set of authors.
_author_names_sorted = test_df['AuthorName'].sort_values().unique().tolist()

author_to_idx_mapping = {name: idx for idx, name in enumerate(_author_names_sorted)}
# Inverse lookup: class index -> author name.
idx_to_author_mapping = dict(enumerate(_author_names_sorted))

## Initializing the model

In [7]:
class TextClassifier(nn.Module):
    """Three-layer MLP head that turns a text embedding into class probabilities.

    Layout: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm -> ReLU -> Linear
    -> softmax. The second hidden layer is half as wide as the first.

    Args:
        input_size: Dimensionality of the incoming embedding.
        hidden_size: Width of the first hidden layer.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size=1024, hidden_size=512, num_classes=10):
        super().__init__()
        half_hidden = hidden_size // 2

        # Attribute names are part of the state_dict keys, so they must stay
        # as they are for previously saved checkpoints to load.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.ln2 = nn.LayerNorm(half_hidden)
        self.fc3 = nn.Linear(half_hidden, num_classes)

    def forward(self, x):
        """Return softmax probabilities over the classes (last dim sums to 1)."""
        hidden = self.relu(self.ln1(self.fc1(x)))
        hidden = self.relu(self.ln2(self.fc2(hidden)))
        return torch.softmax(self.fc3(hidden), dim=-1)

In [8]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [12]:
# Load the Jina v3 embedding model and move it to the selected device.
# NOTE(review): trust_remote_code=True executes Python code shipped with the
# model repository — consider pinning a reviewed `revision`.
embedding_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to(device)

# Wipe the cell output produced while the model was loading.
clear_output()

In [13]:
# Dimensions must match the checkpoint that is loaded below.
input_dim = 1024
hidden_dim = 1024
n_classes = len(author_to_idx_mapping)

classification_head = TextClassifier(input_dim, hidden_dim, n_classes)
# map_location remaps the stored tensors onto the device selected earlier, so a
# checkpoint that was saved on a GPU still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
classification_head.load_state_dict(
    torch.load('author_classifier_head_jina_embeddings.pt', map_location=device)
)
# The head is only used for inference here, so put it in eval mode explicitly.
# `.eval()` returns the module, so the cell still displays its structure.
classification_head.to(device).eval()

TextClassifier(
  (fc1): Linear(in_features=1024, out_features=1024, bias=True)
  (ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (relu): ReLU()
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (fc3): Linear(in_features=512, out_features=10, bias=True)
)

In [69]:
@torch.no_grad()
def get_classification_logits(inputs):
    """Embed ``inputs`` and run the embeddings through the classification head.

    Despite the name, the returned values are post-softmax probabilities,
    because ``TextClassifier.forward`` ends with a softmax.

    Args:
        inputs: Text(s) passed straight to ``embedding_model.encode``.

    Returns:
        The tensor produced by ``classification_head``, on ``device``.
        Gradients are not tracked.
    """
    # `encode` hands back a NumPy array; inputs are capped at 2048 tokens.
    encoded = embedding_model.encode(inputs, max_length=2048)
    features = torch.from_numpy(encoded).to(device)
    return classification_head(features)


def classify(inputs):
    """Return the predicted class index for each text in ``inputs``.

    Args:
        inputs: Texts forwarded to ``get_classification_logits``.

    Returns:
        A plain Python list of class indices (arg-max over the last dimension).
    """
    scores = get_classification_logits(inputs)
    return torch.argmax(scores, -1).tolist()


def classify_and_get_prob(inputs, author_names):
    """Return, for each text, the probability the classifier gives one author.

    The values are probabilities because ``TextClassifier.forward`` applies a
    softmax to its output.

    Args:
        inputs: Sequence of texts to classify.
        author_names: Either one author name (a ``str``) applied to every
            text, or an iterable with exactly one author name per text.

    Returns:
        A list of floats, one per text, in the order of ``inputs``.

    Raises:
        ValueError: If ``author_names`` and ``inputs`` differ in length.
            Previously ``zip`` silently dropped the surplus entries.
        KeyError: If a name is not a key of ``author_to_idx_mapping``.
    """
    if isinstance(author_names, str):
        author_names = [author_names] * len(inputs)
    else:
        author_names = list(author_names)
        if len(author_names) != len(inputs):
            raise ValueError(
                f"got {len(author_names)} author names for {len(inputs)} inputs; "
                "pass one name per input or a single string"
            )

    # Resolve the names first so an unknown author fails before the costly
    # embedding pass rather than after it.
    target_indices = [author_to_idx_mapping[name] for name in author_names]

    probabilities = get_classification_logits(inputs).tolist()
    return [row[idx] for row, idx in zip(probabilities, target_indices)]


In [15]:
# Smoke test: classify two randomly drawn chapters.
# The sample is unseeded, so different rows are picked on every run.
tdf = test_df.sample(n=2).copy()
tauthors = tdf['AuthorName'].tolist()
classes = classify(tdf['ChapterText'].tolist())

In [16]:
# Translate the predicted class indices back into author names and print them
# next to the true authors for a visual comparison.
touts = [idx_to_author_mapping[i] for i in classes]

print(tauthors, touts)

['أبو حيان التوحيدي', 'ثورنتون دبليو برجس'] ['أبو حيان التوحيدي', 'ثورنتون دبليو برجس']


## Preparing ALLAM API model

In [17]:
# SECURITY: never commit the API key. It is read from the environment; any key
# that was previously stored in this notebook should be treated as leaked and
# revoked.
_api_key = os.environ.get("WATSONX_APIKEY")
if not _api_key:
    raise RuntimeError(
        "Set the WATSONX_APIKEY environment variable to your watsonx.ai API key "
        "before running this cell."
    )

credentials = {"url": "https://eu-de.ml.cloud.ibm.com", "apikey": _api_key}

project_id = "43ada3b1-5d11-42e8-9ef1-c718b99c05e1"
model_id = "sdaia/allam-1-13b-instruct"

# Generation parameters sent with every request; only the output length is
# capped, the commented-out options are left at the service defaults.
parameters = {
    # "decoding_method": "greedy",
    "max_new_tokens": 500,
    # "repetition_penalty": 1.05
}

In [18]:
# Initialize the model
# Initialize the model
# `Model` is the foundation-model client imported from ibm_watsonx_ai; it is
# configured with the model id, generation parameters, credentials and project
# defined in the previous cell.
model = Model(
    model_id=model_id,
    params=parameters,
    credentials=credentials,
    project_id=project_id
)

## Testing the performance

In [19]:
def preprocess_text(text, max_words=400):
    """Truncate ``text`` to its first ``max_words`` whitespace-separated words.

    Runs of whitespace (including newlines) collapse to single spaces as a
    side effect of splitting and re-joining.

    Args:
        text: The raw text.
        max_words: Maximum number of words to keep. Defaults to 400, the
            limit this notebook originally hard-coded.

    Returns:
        The kept words joined by single spaces.

    Raises:
        ValueError: If ``max_words`` is negative (a negative slice bound
            would silently drop words from the end instead).
    """
    if max_words < 0:
        raise ValueError(f"max_words must be non-negative, got {max_words}")

    return ' '.join(text.split()[:max_words])

In [20]:
# Style-transfer prompt. Filled with str.format using three placeholders:
#   {author_name}     - name of the target author
#   {known_articles}  - sample texts by the target author
#   {unknown_article} - the text to rewrite in the target author's style
# The template ends with "REWRITTEN TEXT:" so the model continues with the rewrite.
PROMPT_TEMPLATE = """
You are an expert text analyst specializing in authorship identification.
You will first be given several text pieces, all written by the same known author, to analyze their unique writing style, including linguistic patterns, tone, sentence structure, word choice, and overall style.
Afterward, you will be presented with an unknown article.
Your task is to critically analyze the writing style of the known text pieces and rewrite the unknown article in the same style.
Make sure that the content of the actual article does not change. Only it's writing style.

AUTHOR NAME:
{author_name}

KNOWN TEXT PIECES:
{known_articles}

UNKNOWN TEXT PIECES:
{unknown_article}

REWRITTEN TEXT:
"""

In [None]:
# 10 authors x 3 experiments/author = 30 total experiments

# For every experiment, the known texts come from the target author's own texts;
# the unknown text comes from a random author other than the target author.

In [35]:
n_exp_per_author = 3
# One row per experiment:
# [target author, target examples, input author, input text, rewrite]
rewrites_data = []

for author in tqdm(author_to_idx_mapping.keys()):

    # Split the corpus into the target author's chapters and everyone else's.
    author_mask = test_df['AuthorName'] == author
    author_df, non_author_df = test_df[author_mask], test_df[~author_mask]

    prompts, input_samples = [], []

    for _ in range(n_exp_per_author):

        # Two truncated style references drawn from the target author.
        author_examples = [
            preprocess_text(chapter)
            for chapter in author_df.sample(n=2)['ChapterText']
        ]

        # One truncated chapter by any other author, to be rewritten.
        input_row = non_author_df.sample(n=1).iloc[0]
        input_author_name = input_row['AuthorName']
        input_text = preprocess_text(input_row['ChapterText'])

        numbered_examples = '\n'.join(
            f'{i}.   {exp}' for i, exp in enumerate(author_examples, 1)
        )
        prompt = PROMPT_TEMPLATE.format(
            author_name=author,
            known_articles=numbered_examples,
            unknown_article=input_text,
        )

        prompts.append(prompt)
        input_samples.append([author, author_examples, input_author_name, input_text])

    # Send this author's prompts together, one concurrent request per prompt.
    allam_rewritten_texts = model.generate_text(prompts, concurrency_limit=len(prompts))

    # Attach each rewrite to its experiment row, then collect the rows.
    for sample, allam_rewritten_text in zip(input_samples, allam_rewritten_texts):
        sample.append(allam_rewritten_text)
    rewrites_data.extend(input_samples)

100%|██████████| 10/10 [03:40<00:00, 22.09s/it]


In [36]:
len(rewrites_data)

30

In [37]:
# Transpose the per-experiment rows into five parallel column tuples.
(
    tgt_author_names,
    tgt_author_examples,
    input_author_names,
    inputs,
    allam_outputs,
) = zip(*rewrites_data)

In [54]:
# Collect the experiments into one frame, one row per experiment:
# the target author and their sample texts, the original author and text of the
# input, and the rewrite returned by the model.
rewritten_df = pd.DataFrame({
    'target_author_name': tgt_author_names,
    'target_author_examples': tgt_author_examples,
    'input_author_name': input_author_names,
    'input': inputs,
    'rewrite': allam_outputs
})

In [55]:
# Run the author classifier on every rewritten text; one class index per row.
predicted_classes = classify(rewritten_df['rewrite'].tolist())

In [56]:
len(predicted_classes)

30

In [61]:
# Convert the class indices to author names and store them on the frame.
# Note: this adds a column to `rewritten_df` in place.
predicted_authors = [idx_to_author_mapping[idx] for idx in predicted_classes]
rewritten_df['predicted_author_name'] = predicted_authors

In [71]:
rewritten_df.sample(n=5)

Unnamed: 0,target_author_name,target_author_examples,input_author_name,input,rewrite,predicted_author_name
2,أبو حيان التوحيدي,[قال لي ليلة أخرى: ألا تتمِّم ما كنا به بدأْنا...,سليم حسن,إن أقدم عهد إقطاعي معلوم لنا من النقوش المصرية...,\nفي زمان هؤلاءِ من يُلْحَق بهم، ويَدخُل في زم...,محمد لطفي جمعة
8,زكي نجيب محمود,[في أوائل عام ١٩٥٦م تلقيت عرضًا من دار كبرى لل...,أبو حيان التوحيدي,هذه المسألة مكررة وقد مضى الجواب عنها مستقصًى ...,\nفي أوائل عام ١٩٥٦م، تلقيت عرضًا من دار نشر أ...,زكي نجيب محمود
3,ثورنتون دبليو برجس,[لَمْ يَكُنْ طَائِرُ الْقَرْقَفِ تومي وَنَقَّا...,زكي نجيب محمود,تسعة قُرَّاء من كل عشرة، لم يسمعوا من الفلسفة ...,\nفي هذا المقال، سنستكشف أسلوب الكتابة الفريد ...,سليم حسن
22,نقولا حداد,[في ربيع سنة ١٨٩٠ كان الإمبراطور فرنسوا جوزف م...,ثورنتون دبليو برجس,عِنْدَمَا صَادَفَ أَنْ سَمِعَ سامي الْقُنْدُسَ...,\nفي ربيع سنة ١٨٩٠، كان الإمبراطور فرنسوا جوزف...,نقولا حداد
28,يوسف إدريس,[(يُفتح الستار على عوض وسعد زغلول يَلعبان على ...,نقولا حداد,التمدن كما نعرفه الآن وكما عرفنا تاريخه الماضي...,\nفي هذا العمل المسرحي، الجمهور ليس مجرد متفرج...,يوسف إدريس


In [None]:
# True where the classifier attributes the rewrite to the intended target author.
tgt_eq_pred_mask = rewritten_df['target_author_name'].eq(rewritten_df['predicted_author_name'])
print('EXACTLY TARGET AUTHOR', int(tgt_eq_pred_mask.sum()))       # 13
print('NOT EXACTLY TARGET AUTHOR', int((~tgt_eq_pred_mask).sum()))  # 17

In [73]:
# Probability the classifier assigns to the target author, first for the
# original input text and then for the rewrite of that text.
unmodified_input_tgt_author_probs = classify_and_get_prob(rewritten_df['input'].tolist(), rewritten_df['target_author_name'].tolist())
rewritten_input_tgt_author_probs = classify_and_get_prob(rewritten_df['rewrite'].tolist(), rewritten_df['target_author_name'].tolist())

In [80]:
# Wrap the probability lists in Series so they can be compared element-wise.
# NOTE(review): this rebinds the same names from list to Series; distinct names
# would make the cell order easier to follow.
unmodified_input_tgt_author_probs = pd.Series(unmodified_input_tgt_author_probs)
rewritten_input_tgt_author_probs = pd.Series(rewritten_input_tgt_author_probs)

In [85]:
# Number of experiments in which the rewrite scores a higher target-author
# probability than the unmodified input.
(rewritten_input_tgt_author_probs > unmodified_input_tgt_author_probs).sum()

29

In [3]:
# Fraction of experiments in which the rewrite raised the target-author
# probability. Computed from the data instead of hand-typed counts, so it stays
# correct when the experiments are re-run.
(rewritten_input_tgt_author_probs > unmodified_input_tgt_author_probs).mean()

0.9666666666666667

In [None]:
# TODO: make some generations using ALLaM,
# calculate the probability of the target author for each generation,
# and define loss = (1 - probability)

# generations = model(input)
# 
