In [None]:
!pip install -U langchain-community


In [None]:
!pip install langchain transformers faiss-cpu datasets torchvision torchaudio sentence-transformers


In [None]:
!pip install -U langchain-huggingface


In [None]:
import pandas as pd
import torch
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel
import faiss
import numpy as np

# Read the pre-cleaned BBC news articles from the notebook's /content
# directory into a DataFrame.
data_path = '/content/cleaned_bbc_news_articless.csv'
df = pd.read_csv(data_path)

# Choose the compute device once: CUDA when torch reports a usable GPU,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Combine the columns
def combine_columns(row, include_date=True):
    """Build the single text string that represents one article.

    The cleaned title and cleaned body are joined with a space and, when
    ``include_date`` is true, ``" Published on: <pubDate>"`` is appended.

    Missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``) are skipped instead
    of being interpolated as the literal text ``"nan"`` / ``"None"``, which
    would otherwise leak into the embedded text and the generated prompts.
    For rows where every field is present the result is identical to plain
    string interpolation of the fields.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``'cleaned_title'`` and ``'cleaned_text'`` and, when
            ``include_date`` is true, ``'pubDate'``.
        include_date: Whether to append the publication date suffix.

    Returns:
        The combined article text.

    Raises:
        KeyError: If a required key is absent from ``row``.
    """
    def _present(column):
        # None signals "missing" so that a genuine empty string is preserved.
        value = row[column]
        return None if pd.isna(value) else f"{value}"

    parts = [_present('cleaned_title'), _present('cleaned_text')]
    combined = " ".join(part for part in parts if part is not None)

    # 'pubDate' is only looked up when the caller asked for the date suffix.
    if include_date:
        published = _present('pubDate')
        if published is not None:
            combined = f"{combined} Published on: {published}"

    return combined

# Build one text per article; this string is what gets embedded and what
# retrieval later returns as the document content.
df['combined_text'] = df.apply(combine_columns, axis=1)
articles = df['combined_text'].tolist()

# SentenceTransformer for embedding
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name, device=device)
embeddings = model.encode(articles, convert_to_tensor=True)  # Encoded on `device`

# FAISS works on host-side arrays: copy the tensor off the device once and
# reuse the same matrix for both indexes below.
embedding_matrix = embeddings.cpu().numpy()

# Raw FAISS index (exact L2 search); this is the index written to disk below.
embedding_dim = embedding_matrix.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embedding_matrix)

# LangChain embedding wrapper around the same model; the vector store uses it
# to embed incoming queries.
hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)

# Create Document objects for each article. The metadata carries a source
# identifier and, when the row has one, the publish date, so consumers do not
# have to parse the date back out of the text.
documents = []
for i, (article, pub_date) in enumerate(zip(articles, df['pubDate'])):
    metadata = {'source': f'Article {i}'}
    if pd.notna(pub_date):
        metadata['pubDate'] = str(pub_date)
    documents.append(Document(page_content=article, metadata=metadata))

# Build the LangChain vector store from the vectors computed above instead of
# calling FAISS.from_documents, which would encode the whole corpus a second
# time with the same model.
# NOTE(review): some HuggingFaceEmbeddings versions normalise newlines before
# encoding; confirm the articles contain none if exact parity with
# from_documents matters.
vector_store = FAISS.from_embeddings(
    text_embeddings=zip(articles, embedding_matrix.tolist()),
    embedding=hf_embeddings,
    metadatas=[doc.metadata for doc in documents],
)

# Save the FAISS index and embeddings
torch.save(embeddings.cpu(), 'bbc_embeddings.pt')  # Ensure embeddings are on CPU for saving
faiss.write_index(faiss_index, 'bbc_faiss_index.faiss')

# Load GPT-2 model and tokenizer.
# NOTE: `model_name` is rebound here; it previously held the name of the
# sentence-embedding model and from this point on names the GPT-2 checkpoint.
model_name = "gpt2-large"  # Use GPT-2 large
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2LMHeadModel.from_pretrained(model_name)
gpt_model.to(device)  # Move the model to the appropriate device

# Define the pipeline for GPT-2 text generation. The generation keyword
# arguments given here act as defaults; arguments passed when the pipeline is
# called (see get_generative_answer) take precedence over them.
generative_pipeline = pipeline(
    "text-generation",
    model=gpt_model,
    tokenizer=tokenizer,
    max_length=300,  # Larger max_length for more detailed answers
    min_length=100,  # Ensure the answer isn't too short
    length_penalty=1.2,
    num_beams=4,
    truncation=True,
    no_repeat_ngram_size=2,  # Prevent repetitive answers
    do_sample=True,  # Sampling for creative answers
    # GPT-2 ships without a pad token. Naming EOS explicitly selects the same
    # value generation falls back to, without the
    # "Setting `pad_token_id` to `eos_token_id`" warning on every call.
    pad_token_id=tokenizer.eos_token_id,
    device=0 if torch.cuda.is_available() else -1  # Adjust for GPU or CPU
)

#  function to retrieve relevant documents and generate an answer with article publish dates
def get_generative_answer(query, vector_store, num_docs=3):
    """Answer ``query`` with GPT-2, using retrieved articles as context.

    The ``num_docs`` most similar documents are fetched from ``vector_store``.
    Each one contributes a header with its source label and publish date plus
    a snippet of at most 300 characters. The prompt built from these is passed
    to the module-level ``generative_pipeline``.

    The publish date is taken from ``metadata['pubDate']`` when present,
    otherwise from the trailing ``" Published on: <date>"`` suffix of the
    document text. (``metadata['source']`` is only a label such as
    ``"Article 26"``; using it as the date produced headers like
    ``"Article published on Article 26:"``.)

    Args:
        query: The user's question.
        vector_store: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with ``page_content`` and ``metadata``.
        num_docs: Number of documents to retrieve for the context.

    Returns:
        The generated answer text only; the prompt is not echoed back.
    """
    # Retrieve the top `num_docs` relevant documents
    docs = vector_store.similarity_search(query, k=num_docs)

    date_marker = " Published on: "
    sections = []
    for doc in docs:
        # Split the date suffix off the full text *before* truncating, so a
        # long article does not lose its date to the 300-character cut.
        body, found, trailing_date = doc.page_content.rpartition(date_marker)
        if not found:
            body, trailing_date = doc.page_content, ""

        publish_date = (
            doc.metadata.get('pubDate')
            or trailing_date.strip()
            or "unknown date"
        )
        source = doc.metadata.get('source', 'Article')
        snippet = body[:300]  # Keep each snippet short so the prompt stays small
        sections.append(f"{source} published on {publish_date}:\n{snippet}\n\n")
    context = "".join(sections)

    # Prepare the input prompt for GPT-2
    prompt = (
        f"Question: {query}\n"
        f"Context: Please provide a detailed answer based on the following articles. Include references to the publishing dates of the articles mentioned.\n"
        f"{context}\nAnswer:"
    )

    # NOTE(review): in transformers, max_length / min_length count the prompt
    # tokens as well as the generated ones, so a long prompt leaves less room
    # for the answer. Consider max_new_tokens / min_new_tokens if num_docs or
    # the snippet size is increased.
    outputs = generative_pipeline(
        prompt,
        max_length=600,  # Upper bound on prompt + answer tokens
        min_length=200,  # Lower bound on prompt + answer tokens
        num_return_sequences=1,
        temperature=0.6,  # Lower temperature for more focused generation
        num_beams=6,  # Wider beam than the pipeline default
        no_repeat_ngram_size=2,  # Prevent repetition
        return_full_text=False,  # Return only the continuation, not the prompt
    )

    return outputs[0]['generated_text'].strip()


# Interactive entry point: read a question from standard input, run retrieval
# plus generation against the article vector store, and show the result.
user_query = input("Please enter your query: ")
answer = get_generative_answer(user_query, vector_store=vector_store)
print("Answer:", answer)




Please enter your query: tell me about russia ukraine war


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Question: tell me about russia ukraine war
Context: Please provide a detailed answer based on the following articles. Include references to the publishing dates of the articles mentioned.
Article published on Article 26:
ukraine war kherson mariupol key russian success strategic historical reason offensive southern ukraine vital russia Published on: 04/03/2022 12:54

Article published on Article 302:
ukraine happened day russia invasion vladimir putin pave way foreign fighter join war russia widens attack Published on: 11/03/2022 21:12

Article published on Article 8256:
ukraine war happening russia total fear lithuania russian opposing vladimir putin war say home country like huge prison Published on: 22/09/2022 23:06


Answer: The war in Ukraine began on April 12, 2014, when Russia annexed the Crimean peninsula from Ukraine. The conflict has since claimed the lives of more than 6,000 people and displaced millions of others. Russia's annexation of Crimea has been condemned by 