### **Importing Libraries**

In [None]:
import json
import openai
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
from transformers import pipeline
from fuzzywuzzy import fuzz
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

### **Dataset Loading**

In [None]:
# Load the fact-check database. The encoding is pinned to UTF-8 so that
# non-ASCII text decodes the same way regardless of the platform's default
# locale encoding.
with open('./Dataset/fact_checks.json', 'r', encoding='utf-8') as f:
    fact_check_db = json.load(f)

# Work on the first 10% of the fact checks only.
num_facts_to_load = int(len(fact_check_db) * 0.1)
facts_subset = fact_check_db[:num_facts_to_load]
print(len(facts_subset))

# Load the posts with the same explicit encoding.
with open('./Dataset/posts.json', 'r', encoding='utf-8') as f:
    posts = json.load(f)
# Work on the first 30% of the posts only.
num_posts_to_load = int(len(posts) * 0.3)
posts_subset = posts[:num_posts_to_load]
print(len(posts_subset))



### **Model** Implementation-1

In [None]:
# def load_model(multi=True):
#     if multi:
#         model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
#     else:
#         model = SentenceTransformer('all-MiniLM-L6-v2')
#     return model

# def semantic_clustering(corpus, model, k=10):
#     corpus_embeddings = model.encode(corpus)
    
#     kmeans = KMeans(n_clusters=k, random_state=0)
#     clusters = kmeans.fit_predict(corpus_embeddings)

#     cluster_groups = {i: [] for i in range(k)}
#     for idx, cluster_id in enumerate(clusters):
#         cluster_groups[cluster_id].append(corpus[idx])
    
#     labels = assign_labels(cluster_groups)
    
#     return clusters, labels

# def assign_labels(cluster_groups):
#     generator = pipeline('text-generation', model='gpt2')
#     labels = []
#     for cluster_id, items in cluster_groups.items():
#         prompt = f"Assign a semantic label to the following claims:\n"
#         for item in items[:5]:  # Limit to first 5 items to avoid exceeding max length
#             prompt += f"- {item}\n"
#         prompt += "Label:"
        
#         response = generator(prompt, max_length=len(prompt) + 10, num_return_sequences=1)
#         label = response[0]['generated_text'].split("Label:")[-1].strip()
#         labels.append(label)
#     return labels

# def fuzzymatch(query, labels, cluster_groups):
#     matched_clusters = []
#     for idx, label in enumerate(labels):
#         score = fuzz.ratio(query, label)
#         if score > 60:  # Adjust threshold
#             matched_clusters.append(cluster_groups[idx])
#     return matched_clusters

# def evaluate_supportiveness(query, cluster):
#     classifier = pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')
    
#     supportive_claims = []
#     for claim in cluster:
#         text = f"Query: {query}\nClaim: {claim}"
#         result = classifier(text)[0]
#         if result['label'] == 'POSITIVE' and result['score'] > 0.6:  # Adjust threshold as needed
#             supportive_claims.append(claim)
    
#     return supportive_claims

# # Main function
# def fasttrack_algorithm(queries, corpus, multi=True):
#     model = load_model(multi)
#     Dsel = []
    
#     # Stage 1: Semantic Clustering
#     clusters, labels = semantic_clustering(corpus, model, k=10)
    
#     # Stage 2: Tracing (for each query)
#     for query in queries:
#         Dq = [] 
        
#         matched_clusters = fuzzymatch(query, labels, clusters)
        
#         for cluster in matched_clusters:
#             supportive_claims = evaluate_supportiveness(query, cluster)
#             Dq.extend(supportive_claims)
        
#         Dsel.extend(Dq)
    
#     return Dsel


# queries = [
#     f"{post['translated_ocr']}"
#     for post in posts_subset
# ]

# corpus = [
#     f"{fact['translation_sentence']}"
#     for fact in facts_subset
# ]
# fasttrack_algorithm(queries,corpus,False)


### **Model** Implementation-2

In [None]:
def load_model(device):
    """Create the ``xlm-roberta-base`` SentenceTransformer and move it to ``device``.

    Args:
        device: Target device passed to ``.to()`` (e.g. ``'cuda'`` or ``'cpu'``).

    Returns:
        The value returned by calling ``.to(device)`` on the new encoder.
    """
    encoder = SentenceTransformer('xlm-roberta-base')
    return encoder.to(device)

def semantic_clustering(corpus, model, k=10, device='cuda'):
    """Cluster ``corpus`` with K-Means over its sentence embeddings.

    Args:
        corpus: Sequence of claim strings to cluster.
        model: Encoder exposing ``encode(sentences, device=...)``.
        k: Number of K-Means clusters.
        device: Device string forwarded to ``model.encode``.

    Returns:
        A ``(clusters, labels)`` tuple. ``clusters`` is the array returned by
        ``KMeans.fit_predict`` (one cluster id per corpus entry, in corpus
        order). ``labels`` is the list returned by ``assign_labels``, one
        entry per cluster id ``0..k-1``.
    """
    corpus_embeddings = model.encode(corpus, device=device)

    # SentenceTransformer.encode returns a NumPy array by default, which has
    # no ``.cpu()`` method. Only tensors need to be moved and converted.
    if isinstance(corpus_embeddings, torch.Tensor):
        corpus_embeddings = corpus_embeddings.detach().cpu().numpy()
    else:
        corpus_embeddings = np.asarray(corpus_embeddings)

    kmeans = KMeans(n_clusters=k, random_state=0)
    clusters = kmeans.fit_predict(corpus_embeddings)

    # Group the original texts by cluster id so each cluster can be labelled.
    cluster_groups = {i: [] for i in range(k)}
    for idx, cluster_id in enumerate(clusters):
        cluster_groups[int(cluster_id)].append(corpus[idx])

    labels = assign_labels(cluster_groups, device)

    return clusters, labels

def assign_labels(cluster_groups, device):
    """Generate one free-text label per cluster with a GPT-2 text generator.

    Args:
        cluster_groups: Mapping of cluster id to the list of claims in it.
        device: ``'cuda'`` selects pipeline device ``0``; anything else
            selects ``-1`` (CPU).

    Returns:
        A list of label strings in the iteration order of ``cluster_groups``.
        Each label is the text after the last ``"Label:"`` in the generated
        output, stripped of surrounding whitespace.
    """
    generator = pipeline('text-generation', model='gpt2', device=0 if device == 'cuda' else -1)
    labels = []
    for items in cluster_groups.values():
        # Limit to first 5 items to avoid exceeding max length
        claim_lines = "".join(f"- {item}\n" for item in items[:5])
        prompt = "Assign a semantic label to the following claims:\n" + claim_lines + "Label:"

        # ``max_length`` is measured in tokens, so ``len(prompt) + 10`` (a
        # character count) was not a meaningful budget. ``max_new_tokens``
        # bounds only the generated continuation.
        response = generator(prompt, max_new_tokens=10, num_return_sequences=1)
        label = response[0]['generated_text'].split("Label:")[-1].strip()
        labels.append(label)
    return labels

def cross_lingual_similarity(query, claim, model, device):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        query: Query text to embed.
        claim: Claim text to embed.
        model: Encoder exposing ``encode(text, device=..., convert_to_tensor=...)``.
        device: Device string forwarded to ``model.encode``.

    Returns:
        The cosine similarity of the two embeddings as a Python float.
    """
    # ``torch.cosine_similarity`` needs tensors, but ``encode`` returns NumPy
    # arrays unless ``convert_to_tensor=True`` is passed. ``torch.as_tensor``
    # additionally covers encoders that still return an array.
    query_embedding = torch.as_tensor(
        model.encode(query, device=device, convert_to_tensor=True)
    )
    claim_embedding = torch.as_tensor(
        model.encode(claim, device=device, convert_to_tensor=True)
    )
    return torch.cosine_similarity(query_embedding, claim_embedding, dim=0).item()

def evaluate_relevance(query, claim, tokenizer, model, device):
    """Score a (query, claim) pair with a sequence-classification model.

    Args:
        query: First text of the pair passed to the tokenizer.
        claim: Second text of the pair passed to the tokenizer.
        tokenizer: Callable that returns an encoding with a ``.to(device)`` method.
        model: Callable that accepts the encoding as keyword arguments and
            returns an object with a ``logits`` attribute.
        device: Device the encoded inputs are moved to.

    Returns:
        The softmax probability at class index 1 for the first row, as a float.
    """
    encoded = tokenizer(query, claim, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(**encoded).logits

    class_probabilities = torch.softmax(logits, dim=1)
    # Assuming 1 is the relevant class
    return class_probabilities[0][1].item()

def fasttrack_algorithm(queries, corpus, device='cuda'):
    """Retrieve and rank corpus claims for each query.

    Stage 1 clusters the corpus with ``semantic_clustering``. Stage 2 scores
    every (query, claim) pair with ``cross_lingual_similarity`` and
    ``evaluate_relevance``, keeping the pairs where both scores exceed 0.5.

    Args:
        queries: Iterable of query strings.
        corpus: Sequence of claim strings.
        device: ``'cuda'`` or ``'cpu'``. Falls back to ``'cpu'`` with a
            printed notice when CUDA is requested but unavailable.

    Returns:
        A list with one entry per query. Each entry is a list of at most 10
        ``(claim, similarity * relevance)`` tuples, sorted by score descending.
    """
    if device == 'cuda' and not torch.cuda.is_available():
        print("CUDA is not available. Using CPU instead.")
        device = 'cpu'

    model = load_model(device)
    # NOTE(review): this model identifier looks like a placeholder -- confirm
    # it resolves to a real local path or hub model before running.
    tokenizer = AutoTokenizer.from_pretrained("fine-tuned-fact-check-relevance-model")
    relevance_model = AutoModelForSequenceClassification.from_pretrained("fine-tuned-fact-check-relevance-model").to(device)

    results = []

    # Stage 1: Semantic Clustering
    clusters, _labels = semantic_clustering(corpus, model, k=10, device=device)

    # ``clusters`` holds one cluster id per corpus entry, not the claims
    # themselves. Iterating it directly yields integers, so the claim groups
    # are rebuilt here from the ids.
    cluster_groups = {}
    for claim, cluster_id in zip(corpus, clusters):
        cluster_groups.setdefault(int(cluster_id), []).append(claim)

    # Stage 2: Retrieval and Ranking
    for query in queries:
        relevant_claims = []

        for cluster_claims in cluster_groups.values():
            for claim in cluster_claims:
                similarity = cross_lingual_similarity(query, claim, model, device)
                relevance = evaluate_relevance(query, claim, tokenizer, relevance_model, device)

                if similarity > 0.5 and relevance > 0.5:  # Adjust thresholds as needed
                    relevant_claims.append((claim, similarity * relevance))

        # Rank the relevant claims
        ranked_claims = sorted(relevant_claims, key=lambda x: x[1], reverse=True)
        results.append(ranked_claims[:10])  # Top 10 most relevant claims

    return results

# Query texts: the 'translated_ocr' field of each selected post, as a string.
queries = [str(post['translated_ocr']) for post in posts_subset]

# Claim texts: the 'translation_sentence' field of each selected fact check.
corpus = [str(fact['translation_sentence']) for fact in facts_subset]

# Use the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
fasttrack_algorithm(queries, corpus, device)
