### **1 - Importing Libraries**

In [None]:
from unsloth import FastLanguageModel
import pandas as pd
import re
import json

### **2 - Loading Configuration**

In [3]:
# Read every tunable setting for this notebook from a single JSON file.
with open('/kaggle/input/configure/config.json', 'r') as file:
    config = json.load(file)

# General settings: HGF is passed as `token=` when the models are loaded below
# (presumably a Hugging Face access token - confirm against config.json).
HGF = config['general']['HGF']

# Model identifiers handed to `from_pretrained(model_name=...)`:
# one for the description model, one for the recommendation model.
output_model_online_Desc, output_model_online_Rec = (
    config['outputs']['output_model_online_Desc'],
    config['outputs']['output_model_online_Rec'],
)

# Loading options shared by both models.
max_seq_length, load_in_4bit = (
    config['model']['max_seq_length'],
    config['model']['load_in_4bit'],
)

### **3 - Loading Framework Components**

In [None]:
# Load the model/tokenizer pair used by generateDescription (history -> description).
model_desc, tokenizer_desc = FastLanguageModel.from_pretrained(
    model_name=output_model_online_Desc,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    token=HGF,
)

In [None]:
# Load the model/tokenizer pair used by recommendNews (description + candidates -> ranking).
model_rec, tokenizer_rec = FastLanguageModel.from_pretrained(
    model_name=output_model_online_Rec,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    token=HGF,
)

### **4 - Defining Prompt Template**

In [17]:
# Prompt template for the description model. It has two positional `{}` slots:
#   1. the user's reading history (under "### History:");
#   2. the text following "Description :" - generateDescription fills it with ""
#      so that the model writes the description itself.
# generateDescription locates the generated text by splitting the decoded output on
# "### Response:" and then on the literal "\nDescription : \n\n\n\n", which is exactly
# the whitespace this template yields when slot 2 is empty - keep the two in sync.
# NOTE(review): the wording, including the typos ("provied", "appropiately"), the
# leading space and the trailing whitespace, is part of the runtime prompt. Presumably
# it mirrors the template the model was fine-tuned with - confirm before "fixing" it.
promptDesc = """ Below is an instruction that describes a task, paired with an input that provied further context.
Write a response that appropiately completes the request.

### Instruction:
You are an interests analyzer. Based on the following user history, analyze their reading habits and generate a description of what kind of news articles they might be interested in reading next. 

### History:
{}

### Response:
Description : \n
{}

"""

In [18]:
# Prompt template for the recommendation model. It has three positional `{}` slots:
#   1. the user's preference description (under "### Preferences Description:");
#   2. the candidate articles (under "### Candidates:");
#   3. the text following the opening "<think>" tag - recommendNews fills it with ""
#      so that the model continues the reasoning itself.
# recommendNews parses the decoded output after "### Response:", reading the reasoning
# from between "<think>" and "</think>" and the ranking from the line that starts with
# "Ranked News Articles".
# NOTE(review): the exact wording and whitespace are part of the runtime prompt;
# presumably they mirror the template used for fine-tuning - confirm before editing.
promptRec = """Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request. 
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You serve as a personalized news article recommendation system. Based on the user's preference descriptions below and the candidate articles, rank the candidates using their labels.
Output Format:
Ranked News Articles: <START> C#, C#, ..., C# <END>

### Preferences Description:
{}

### Candidates:
{}

### Response:
<think>{} """

### **5 - Assembling the Framework**

In [45]:
def generateDescription(history, model, tokenizer):
    """Generate a description of the user's reading interests from their history.

    The history is inserted into ``promptDesc``, the prompt is tokenized and moved
    to the CUDA device, and up to 1000 new tokens are generated. The decoded text
    is then reduced to the part the model wrote after the "Description :" header.

    Args:
        history: Text of the user's reading history, placed under "### History:".
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            max_new_tokens=...)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.

    Returns:
        str: The generated description with the literal end-of-sentence marker
        removed.
    """
    inputs = tokenizer([promptDesc.format(history, "")], return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1000,
    )
    decoded = tokenizer.batch_decode(outputs)[0]

    # The decoded text contains the prompt followed by the completion; keep the
    # segment that follows the response header. If the header is missing, fall
    # back to the whole text instead of raising IndexError.
    segments = decoded.split("### Response:")
    completion = segments[1] if len(segments) > 1 else decoded

    # Fast path: the prompt's exact "Description :" header (with the whitespace
    # produced by promptDesc) survived the tokenize/decode round trip.
    pieces = completion.split("\nDescription : \n\n\n\n")
    if len(pieces) > 1:
        description = pieces[1]
    else:
        # The whitespace did not round-trip exactly: strip the header loosely
        # rather than failing on a missing marker.
        description = re.sub(r'^\s*Description\s*:\s*', '', completion)

    return description.replace("<｜end▁of▁sentence｜>", "")

def recommendNews(description, candidates, model, tokenizer):
    """Rank candidate articles for a user and return the model's reasoning.

    The description and candidates are inserted into ``promptRec``, the prompt is
    tokenized and moved to the CUDA device, and up to 1000 new tokens are
    generated. The decoded text after "### Response:" is then parsed.

    Args:
        description: Text describing the user's preferences.
        candidates: Text listing the candidate articles; labels are read from it
            with the pattern ``C<digits>``.
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            max_new_tokens=...)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.

    Returns:
        tuple: ``(cot, rankedArticles)`` where

        * ``cot`` is the stripped text between ``<think>`` and ``</think>``, or
          ``""`` when the output has no complete think block;
        * ``rankedArticles`` is a duplicate-free list of labels: first the ones
          found on the "Ranked News Articles" line, in that order, then every
          label from ``candidates`` that was not already listed. Labels emitted
          by the model are kept even if they do not occur in ``candidates``.
    """
    inputs = tokenizer([promptRec.format(description, candidates, "")], return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=1000,
    )
    response = tokenizer.batch_decode(outputs)

    # Keep the segment that follows the response header; fall back to the whole
    # text instead of raising IndexError if the header is missing.
    segments = response[0].split("### Response:")
    result = segments[1] if len(segments) > 1 else response[0]

    # Generation may stop before "</think>" is produced; default to an empty
    # chain of thought rather than leaving `cot` unbound.
    cot_match = re.search(r'<think>(.*?)</think>', result, re.DOTALL)
    cot = cot_match.group(1).strip() if cot_match else ""

    # Likewise, a missing ranking line yields no model-ranked labels, so the
    # final list degrades to the candidates in their given order.
    news_match = re.search(r'Ranked News Articles\s*:\s*(.*)', result)
    after_phrase = news_match.group(1) if news_match else ""

    # dict.fromkeys removes duplicates while preserving first-seen order.
    rankedArticles = list(dict.fromkeys(
        re.findall(r'C\d+', after_phrase) + re.findall(r'C\d+', candidates)
    ))

    return cot, rankedArticles

def FrameWork(history, candidates):
    """Run the two-stage pipeline: describe the user, then rank the candidates.

    Stage one calls ``generateDescription`` with the description model and its
    tokenizer; stage two passes that description and the candidates to
    ``recommendNews`` with the recommendation model and its tokenizer.

    Args:
        history: The user's reading history.
        candidates: The candidate articles to rank.

    Returns:
        tuple: ``(description, cot, rankedArticles)``.
    """
    profile = generateDescription(history, model_desc, tokenizer_desc)
    reasoning, ranking = recommendNews(profile, candidates, model_rec, tokenizer_rec)
    return profile, reasoning, ranking

### **6 - Using the Framework**

In [38]:
# Load the evaluation examples; the cells below read the 'history', 'Candidates'
# and 'Labels' columns of each row.
df = pd.read_csv("/kaggle/input/datasettest/test.csv")

In [43]:
# Pick a single example (row position 2) and unpack the fields the framework needs:
# the user's history, the candidate list, and the label printed later as the
# article the user actually read next.
instance = df.iloc[2]
history, candidates, label = (
    instance['history'],
    instance['Candidates'],
    instance['Labels'],
)

In [46]:
# Run the full pipeline (description generation, then ranking) on the selected example.
description, cot, ranked_Articles = FrameWork(history, candidates)

# Show the inputs, both intermediate outputs, and the final ranking next to the
# ground-truth label so the result can be inspected by eye.
print(f"User history: \n {history}\n")
print(f"List of candidates: \n {candidates}\n")
print(f"Description generated: \n {description}\n")
print(f"Chain of thought generated: \n {cot}\n")
print(f"Ranked articles: \n {ranked_Articles}\n")
print(f"The article the user actually read next: {label}")

User history: 
 H1: The 2019 NFL midseason MVP rankings which belongs to the category of sports and subcategory of football_nfl
H2: Week 9 winners, losers: Russell Wilson taking lead in MVP race; Adam Gase in trouble which belongs to the category of sports and subcategory of football_nfl
H3: Tom Brady explains why he's angry at this point in the football season which belongs to the category of sports and subcategory of football_nfl

List of candidates: 
 C1: Police find 26 children behind false wall at Colorado day care which belongs to the category of news and subcategory of crime
C2: Meghan Markle's Lawyers Debunk Multiple False Tabloid Stories in New Court Documents which belongs to the category of lifestyle and subcategory of lifestyleroyals
C3: This stately home is having the ultimate yard sale which belongs to the category of finance and subcategory of finance-real-estate
C4: 66 Cool Tech Gifts Anyone Would Be Thrilled to Receive which belongs to the category of lifestyle and sub