In [1]:
import pandas as pd

# Define the CSV file path
csv_file_path = "/Users/anirudhhegde/Desktop/Northeastern University/Natural Language Processing/Project/RAG-Based-Multilingual-News-Retrieval/data.csv"

# Load the CSV as a DataFrame
try:
    df = pd.read_csv(csv_file_path)
    print("DataFrame loaded successfully!")
    print(df.head())  # Display the first few rows of the DataFrame
except Exception as e:
    print(f"Error loading the CSV file: {e}")

DataFrame loaded successfully!
                                                text  \
0  Transport im Viehwaggon, Fleischgeruch in der ...   
1  Marmorner Zebrastreifen, pompöse Gebäude: Sind...   
2  Wenn an diesem Montag die Landesvorsitzenden d...   
3  Das Portrait von 1791 zeigt Haitis Nationalhel...   
4  Neue Köpfe und alte Bekannte: Die neue Regieru...   

                                             summary    topic  \
0  Transport im Viehwaggon, Fleischgeruch in der ...  politik   
1  Marmorner Zebrastreifen, pompöse Gebäude: Sind...  politik   
2  Oskar Lafontaine gibt den Parteivorsitz der Li...  politik   
3  Die Wurzeln des Elends liegen in der Vergangen...  politik   
4  Schwarz-Gelb ist noch nicht jene Traumkoalitio...  politik   

                                               title        date language  \
0  So war Auschwitz: Erinnerungen einer Holocaust...  00/01/2010       de   
1  Kommunen in Not (3): Sindelfingen - Jenseits g...  00/01/2010       de   
2  Persona

In [2]:
df.head()

Unnamed: 0,text,summary,topic,title,date,language,translated_text
0,"Transport im Viehwaggon, Fleischgeruch in der ...","Transport im Viehwaggon, Fleischgeruch in der ...",politik,So war Auschwitz: Erinnerungen einer Holocaust...,00/01/2010,de,"Transport in a cattle carriage, smell of meat ..."
1,"Marmorner Zebrastreifen, pompöse Gebäude: Sind...","Marmorner Zebrastreifen, pompöse Gebäude: Sind...",politik,Kommunen in Not (3): Sindelfingen - Jenseits g...,00/01/2010,de,"Marble zebra stripes, pompous buildings: Sinde..."
2,Wenn an diesem Montag die Landesvorsitzenden d...,Oskar Lafontaine gibt den Parteivorsitz der Li...,politik,Personaldebatte bei der Linken - Wer kommt nac...,00/01/2010,de,"This Monday, when the country’s left-wing lead..."
3,Das Portrait von 1791 zeigt Haitis Nationalhel...,Die Wurzeln des Elends liegen in der Vergangen...,politik,Geschichte von Haiti - Napoleons Schmach,00/01/2010,de,The portrait of 1791 shows Haiti’s national he...
4,Neue Köpfe und alte Bekannte: Die neue Regieru...,Schwarz-Gelb ist noch nicht jene Traumkoalitio...,politik,Schwarz-gelbes Kabinett - Merkels Mannschaft i...,00/01/2010,de,New heads and old acquaintances: Angela Merkel...


## RAG_1

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load the SentenceTransformer model for multilingual embeddings
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings for the 'translated_text'
print("Generating embeddings for the articles...")
df['embedding'] = df['translated_text'].apply(lambda x: embedding_model.encode(x))

# Convert embeddings into a numpy array
embeddings = np.vstack(df['embedding'].values)

# Initialize FAISS index for vector storage and search
dimension = embeddings.shape[1]  # Embedding dimension
faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance metric
faiss_index.add(embeddings)  # Add embeddings to the FAISS index
print(f"FAISS index contains {faiss_index.ntotal} items.")

  from .autonotebook import tqdm as notebook_tqdm


Generating embeddings for the articles...
FAISS index contains 6000 items.


In [4]:
def encode_query(query):
    """Generate embedding for a user query."""
    return embedding_model.encode(query)

# Example user query
query = "What were the effects of Napoleonic wars in Haiti?"
query_embedding = encode_query(query)

In [None]:
def search_faiss(query_embedding, top_k=5):
    """Retrieve top-k most similar articles from FAISS index."""
    query_embedding = np.array([query_embedding])  # Convert query to 2D array
    distances, indices = faiss_index.search(query_embedding, top_k)
    return indices[0], distances[0]

# Search for the most relevant articles
top_k = 5
indices, distances = search_faiss(query_embedding, top_k=top_k)

# Retrieve articles based on indices
retrieved_articles = df.iloc[indices]
print("Retrieved Articles:")
print(retrieved_articles[['title', 'translated_text', 'summary']])

Retrieved Articles:
                                                  title  \
1847              La cruel historia de un país olvidado   
3566  Haïti dévasté après un violent tremblement de ...   
3              Geschichte von Haiti - Napoleons Schmach   
5298                      Aynı adada cennet ve cehennem   
3590  Cyclones, émeutes de la faim… un pays qui vit ...   

                                        translated_text  \
1847  The earthquake that has destroyed Haiti only c...   
3566  The earthquake that devastated Haiti at the en...   
3     The portrait of 1791 shows Haiti’s national he...   
5298  Haiti, one of the world's poorest nations, has...   
3590  The earthquake that struck Haiti on Tuesday, J...   

                                                summary  
1847  Constantes crisis políticas y sociales azotan ...  
3566  La mobilisation s'organise pour secourir le pa...  
3     Die Wurzeln des Elends liegen in der Vergangen...  
5298  Aynı adayı paylaşıyorlar ama hay

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Automatically select MPS if available, otherwise fallback to CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

from transformers import T5Tokenizer, T5ForConditionalGeneration

generative_model_name = "t5-base"
generative_tokenizer = T5Tokenizer.from_pretrained(generative_model_name)
generative_model = T5ForConditionalGeneration.from_pretrained(generative_model_name).to(device)

def generate_answer(query, retrieved_texts):
    input_text = f"question: {query} context: {retrieved_texts}"
    inputs = generative_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = generative_model.generate(
        inputs.input_ids, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True
    )
    return generative_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Prepare retrieved texts for the generative model
retrieved_texts = retrieved_articles['summary'].tolist()

# Generate an answer
final_answer = generate_answer(query, retrieved_texts)
print("\nGenerated Answer:")
print(final_answer)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Generated Answer:
Haiti bezahlt immer noch für seine Befreiung vor 200 Jahren. Auch damals nahmen die Wichtigen der Welt den Insel-Staat nicht ernst.' 'Ayn aday paylaşyorlar ama hayatlar çok farkl.


## RAG_2


In [14]:
df_2 = pd.read_csv("/Users/anirudhhegde/Desktop/Northeastern University/Natural Language Processing/Project/RAG-Based-Multilingual-News-Retrieval/datasets_translated_dataset_v3/dataset.csv")

In [12]:
df_2.head()

Unnamed: 0,summary,topic,title,date,translated_text
0,"Transport in a cattle carriage, smell of meat ...",Politics,Auschwitz: Memories of a Holocaust Survivor,00/01/2010,"Transport in a cattle carriage, smell of meat ..."
1,"Marble zebra stripes, pompous buildings: Sinde...",Politics,Municipalities in Need (3): Sindelfingen - Bey...,00/01/2010,"Marble zebra stripes, pompous buildings: Sinde..."
2,Oskar Lafontaine resigns as party chairman of ...,Politics,Staff debate on the left - who is coming to La...,00/01/2010,"This Monday, when the country’s left-wing lead..."
3,The roots of poverty lie in the past. Haiti is...,Politics,History of Haiti - Napoleon's disgrace,00/01/2010,The portrait of 1791 shows Haiti’s national he...
4,Black-yellow is not yet the dream coalition th...,Politics,Black-and-yellow cabinet - Merkel's team in th...,00/01/2010,New heads and old acquaintances: Angela Merkel...


In [31]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   summary          3000 non-null   object
 1   topic            3000 non-null   object
 2   title            3000 non-null   object
 3   date             3000 non-null   object
 4   translated_text  3000 non-null   object
 5   embedding        3000 non-null   object
dtypes: object(6)
memory usage: 140.8+ KB


In [42]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load the SentenceTransformer model for multilingual embeddings
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Generate embeddings for the 'translated_text'
print("Generating embeddings for the articles...")
df_2['embedding'] = df_2['translated_text'].apply(lambda x: embedding_model.encode(x, normalize_embeddings = True))

# Convert embeddings into a numpy array
embeddings = np.vstack(df_2['embedding'].values)

# Initialize FAISS index for vector storage and search
dimension = embeddings.shape[1]  # Embedding dimension
faiss_index = faiss.IndexFlatL2(dimension)  # L2 distance metric
faiss_index.add(embeddings)  # Add embeddings to the FAISS index
print(f"FAISS index contains {faiss_index.ntotal} items.")

Generating embeddings for the articles...
FAISS index contains 3000 items.


In [43]:
def encode_query(query):
    """Generate embedding for a user query."""
    return embedding_model.encode(query, normalize_embeddings =True)

# Example user query
# query = "America donald trump election"
query = 'terrorist islam pakistan afghanistan bangladesh'
query_embedding = encode_query(query)

In [44]:
def search_faiss(query_embedding, top_k=5):
    """Retrieve top-k most similar articles from FAISS index."""
    query_embedding = np.array([query_embedding])  # Convert query to 2D array
    distances, indices = faiss_index.search(query_embedding, top_k)
    return indices[0], distances[0]

# Search for the most relevant articles
top_k = 5
indices, distances = search_faiss(query_embedding, top_k=top_k)

# Retrieve articles based on indices
retrieved_articles = df_2.iloc[indices]
print("Retrieved Articles:")
print(retrieved_articles[['title','topic', 'translated_text', 'summary']])

Retrieved Articles:
                                                  title  \
1293  In this regard, I would like to draw the atten...   
529   Pakistan: Many dead in Lahore - Two attacks in...   
2474  Pakistan: suicide bombing in the western part ...   
2788  Nine alleged Al Qaeda members soon to be tried...   
1660  The Committee recommends that the State party ...   

                                                  topic  \
1293                                     diario portada   
529                                            Politics   
2474                                       Asia-Pacific   
2788  europe europe, europe, europe, europe, europe,...   
1660                           International Journalism   

                                        translated_text  \
1293  Al Qaeda in the Islamic Maghreb, the branch of...   
529   Bomb terror in the "Heart of Pakistan": Two su...   
2474  Is the sanctuary of the Pakistani Taliban and ...   
2788  Nine suspected members of th

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between query and retrieved articles
retrieved_embeddings = embeddings[indices]
cosine_similarities = cosine_similarity([query_embedding], retrieved_embeddings).flatten()

# Add similarity scores to the retrieved articles for analysis
retrieved_articles = retrieved_articles.copy()
retrieved_articles['similarity'] = cosine_similarities

# Sort articles by similarity score
retrieved_articles = retrieved_articles.sort_values(by='similarity', ascending=False)
print("Retrieved Articles with Similarity Scores:")
print(retrieved_articles[['title', 'topic', 'translated_text', 'summary','similarity']])

Retrieved Articles with Similarity Scores:
                                                  title  \
2474  Pakistan: suicide bombing in the western part ...   
2515  Yemen: Imam allegedly linked to Fort Hood and ...   
2538    """Terrorism is not the main problem in Yemen""   
2775         "Tehran bans ""any contact"" with 60 NGOs"   
529   Pakistan: Many dead in Lahore - Two attacks in...   

                                                  topic  \
2474                                       Asia-Pacific   
2515  . . . . . . . . . . . . . . . . . . . . . . . ...   
2538                                        Middle East   
2775                                        Middle East   
529                                            Politics   

                                        translated_text  \
2474  Is the sanctuary of the Pakistani Taliban and ...   
2515  President Barack Obama's anti-terrorism advise...   
2538  Omar Faouk Abdulmutallab, the perpetrator of t...   
2775  On Mo

In [45]:
from bert_score import score
import pandas as pd

# Assuming df_2 contains the retrieved articles
# Rename df_2 to df_retrieved for consistency with the code
df_retrieved = retrieved_articles.copy()

# Ensure columns exist in the DataFrame
if "summary" not in df_retrieved.columns or "translated_text" not in df_retrieved.columns:
    raise ValueError("DataFrame must contain 'summary' and 'translated_text' columns for similarity comparison.")

# Task 1: Similarity Between Query and Summaries
query_list = [query] * len(df_retrieved)  # Repeat the query for all summaries
P, R, F1_query_summary = score(
    df_retrieved["summary"].tolist(),
    query_list,
    lang="en",
    model_type="bert-base-uncased"
)
df_retrieved["query_summary_similarity"] = F1_query_summary.tolist()

# Task 2: Similarity Between News Articles (translated_text) and Summaries
P, R, F1_news_summary = score(
    df_retrieved["summary"].tolist(),
    df_retrieved["translated_text"].tolist(),
    lang="en",
    model_type="bert-base-uncased"
)
df_retrieved["news_summary_similarity"] = F1_news_summary.tolist()

# Display the DataFrame with similarity scores
df_bertscore = df_retrieved[["title", "translated_text", "summary", "query_summary_similarity", "news_summary_similarity"]]
print(df_bertscore)

                                                  title  \
1293  In this regard, I would like to draw the atten...   
529   Pakistan: Many dead in Lahore - Two attacks in...   
2474  Pakistan: suicide bombing in the western part ...   
2788  Nine alleged Al Qaeda members soon to be tried...   
1660  The Committee recommends that the State party ...   

                                        translated_text  \
1293  Al Qaeda in the Islamic Maghreb, the branch of...   
529   Bomb terror in the "Heart of Pakistan": Two su...   
2474  Is the sanctuary of the Pakistani Taliban and ...   
2788  Nine suspected members of the Al-Qaida network...   
1660  The Jordanian suicide bomber who killed seven ...   

                                                summary  \
1293  In addition, the Committee recommends that the...   
529   Bomb terror in the "Heart of Pakistan": Two su...   
2474  The Taliban, pressed by the Pakistani army, ar...   
2788  Nine alleged members of the Al Qaeda network, ..