<a href="https://colab.research.google.com/github/ArpitRawat07/Query_Expansion/blob/main/Query_Expansion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.neighbors import NearestCentroid
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from collections import Counter

# Loading the dataset containing news.
# The path is relative to the working directory; later cells read the
# 'tags' and 'descr' columns of this frame.
news_df = pd.read_csv('BBCNews.csv')

In [None]:

# Define synonyms for the different types of news.
# The first entry of every list is the base category name itself, so mapping an
# already-mapped tag is a no-op.
# Tags are lemmatized before they are looked up here, which turns plural nouns
# into their singular form ('sports' -> 'sport'). Each plural keyword is
# therefore listed together with its singular lemma; without the lemma the
# plural spelling can never match a cleaned tag.
synonyms = {
    'sports': ['sports', 'sport', 'athletics', 'games', 'game', 'competitions', 'competition',
               'matches', 'match', 'cricket', 'football', 'soccer', 'hockey'],
    'politics': ['politics', 'government', 'elections', 'election', 'democracy', 'parliament',
                 'congress', 'senate', 'leadership'],
    'entertainment': ['entertainment', 'celebrities', 'celebrity', 'movies', 'movie', 'films', 'film',
                      'television', 'music', 'arts', 'art', 'culture'],
    'technology': ['technology', 'science', 'innovation', 'computers', 'computer', 'gadgets', 'gadget',
                   'internet', 'artificial intelligence', 'cybersecurity'],
    'business': ['business', 'economy', 'finance', 'markets', 'market', 'investment', 'commerce',
                 'trade', 'industry']
}



In [None]:
# Downloading necessary NLTK resources
nltk.download('omw-1.4')  # Open Multilingual WordNet
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' tables
# instead of the pickled 'punkt' models; without this download a fresh kernel
# fails with a LookupError on the first tokenize call.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:

# Text-cleaning helper applied to the dataset columns.
# One lemmatizer instance is shared by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is tokenized, tokens that are not purely alphabetic are dropped,
    the rest are lower-cased, English stopwords are removed and every
    remaining token is lemmatized.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # Load the stopword list once per call and keep it as a set: calling
    # stopwords.words() inside the comprehension re-read the corpus and did a
    # linear list scan for every single token.
    stop_words = set(stopwords.words('english'))
    tokens = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)

# Clean the two text columns used by the rest of the notebook.
# Missing values are replaced with '' *before* the string cast: astype(str)
# on a missing value yields the literal text 'nan', which would then be
# tokenized and indexed as if it were a real word.
for column in ('tags', 'descr'):
    news_df[column] = news_df[column].fillna('').astype(str).apply(preprocess)



In [None]:
# Show the cleaned tag strings (pandas abbreviates long Series when printing)
print(news_df['tags'])

0       sport stamford bridge football association fif...
1       sport madrid birmingham france scotland united...
2       sport derby brazil tunnel fracasedu food footb...
3       sport bbc united kingdom ireland brian eddie r...
4       sport liverpool daily sport millennium stadium...
                              ...                        
2405    business zurich fiat reuters financial time ge...
2406    agroflora reuters vestey group venezuela unite...
2407    business jacksonville kraft food gim credit u ...
2408    environment business yangtze electric power ya...
2409    politics environment algiers el watan algeria ...
Name: tags, Length: 2410, dtype: object


In [None]:
# Define a function to map each category to its base type
def get_base_category(tag, synonym_map=None):
    """Return the base category of the first recognised keyword in ``tag``.

    The tag string is scanned left to right. At every position the synonyms
    are tried in the order they appear in the mapping, so the earliest keyword
    in the tag decides the category. Synonyms made of several words
    (e.g. 'artificial intelligence') are matched against consecutive tokens;
    comparing single tokens only, as before, could never match them.

    Parameters
    ----------
    tag : str
        Whitespace-separated tag words.
    synonym_map : dict[str, list[str]], optional
        Mapping of base category -> synonyms. Defaults to the notebook-level
        ``synonyms`` dictionary.

    Returns
    -------
    str
        The matching base category, or 'other' when no synonym occurs.
    """
    if synonym_map is None:
        synonym_map = synonyms

    # Index every synonym by its first word; insertion order keeps the
    # mapping's category order as the tie-break at a given position.
    by_first_word = {}
    for base_category, syn_list in synonym_map.items():
        for synonym in syn_list:
            words = synonym.lower().split()
            if words:
                by_first_word.setdefault(words[0], []).append((words, base_category))

    tokens = tag.lower().split()
    for start, token in enumerate(tokens):
        for words, base_category in by_first_word.get(token, ()):
            if tokens[start:start + len(words)] == words:
                return base_category
    return 'other'


In [None]:
from collections import Counter

def _predict_by_neighbour_vote(similarity_matrix, labels, k, n_classes):
    """Label every document with the majority label of its k most similar *other* documents."""
    predicted = []
    for i in range(len(similarity_matrix)):
        # argsort is ascending, so reverse it to rank the most similar documents first
        ranked = similarity_matrix[i].argsort()[::-1]
        # A non-empty document has cosine similarity 1 with itself. Leaving it
        # in the neighbourhood would let its own true label take part in the
        # vote and inflate the accuracy computed from these predictions.
        neighbours = ranked[ranked != i][:k]
        votes = np.bincount(labels[neighbours], minlength=n_classes)
        predicted.append(votes.argmax())
    return np.asarray(predicted, dtype=int)


def _print_expanded_queries(news_df, top_docs_idx, predicted_labels, le, num_expanded_queries=10):
    """Print the categories found among the retrieved documents and the distinct articles of each."""
    label_counts = Counter(predicted_labels[idx] for idx in top_docs_idx)
    print("Expanded Queries:")
    for label, count in label_counts.most_common(num_expanded_queries):
        print(le.inverse_transform([label])[0], ":", count)
        print("Relevant News Articles:")
        # The same article text can occur more than once; print it only once per category
        seen_texts = set()
        for idx in top_docs_idx:
            text = news_df['descr'].iloc[idx]
            if predicted_labels[idx] == label and text not in seen_texts:
                seen_texts.add(text)
                print(text)
                print('-' * 50)


def _evaluate_nearest_centroid(similarity_matrix, labels, random_state=None):
    """Fit a NearestCentroid classifier on a random 80/20 split of the similarity rows and print its scores."""
    # Fall back to NumPy's global generator when no seed is given
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_docs = len(labels)
    train_idx = rng.choice(n_docs, int(n_docs * 0.8), replace=False)
    test_idx = np.setdiff1d(np.arange(n_docs), train_idx)

    clfr = NearestCentroid()
    clfr.fit(similarity_matrix[train_idx, :], labels[train_idx])

    y_test = labels[test_idx]
    yPred = clfr.predict(similarity_matrix[test_idx, :])

    print(classification_report(y_test, yPred))
    finalAccuracy = accuracy_score(y_test, yPred)
    print("Accuracy: ", finalAccuracy*100, "%")


def preprocess_news(news_df, query=None, k=10, n_components=100, random_state=None):
    """Categorise the news articles, answer one query and evaluate a classifier.

    Steps, in order:
      1. Drop rows with missing values and map each tag string to a base category.
      2. Vectorise 'descr' with TF-IDF, reduce it with LSA and compute the
         pairwise cosine similarity between documents.
      3. Predict each document's category by majority vote of its ``k`` nearest
         neighbours (the document itself excluded) and print the accuracy.
      4. Retrieve the documents most similar to ``query`` and print them
         grouped by predicted category.
      5. Train and evaluate a NearestCentroid classifier on an 80/20 split.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Frame with a 'tags' and a 'descr' column. The caller's frame is not modified.
    query : str, optional
        Search query. When omitted it is read interactively with ``input``.
    k : int, default 10
        Neighbourhood size for the vote and maximum number of retrieved documents.
    n_components : int, default 100
        Requested LSA dimensionality; reduced automatically for small corpora.
    random_state : int, optional
        Seed for the train/test split. ``None`` uses NumPy's global generator.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or no rows remain after dropping nulls.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # dropna() may hand back an object pandas treats as derived from the
    # caller's frame; copy it so the column assignment below is unambiguous.
    news_df = news_df.dropna().copy()
    if news_df.empty:
        raise ValueError("news_df has no complete rows to process")

    # Map each tag to its base category (once; a second pass would be a no-op)
    news_df['tags'] = news_df['tags'].apply(get_base_category)
    print(news_df['tags'])

    # Fit + transform the descriptions
    tfidf_Vectorizer = TfidfVectorizer()
    X_train_vec = tfidf_Vectorizer.fit_transform(news_df['descr'])

    # Latent Semantic Analysis. The arpack solver needs strictly fewer
    # components than min(n_documents, n_features), so cap the request.
    n_components = min(n_components, min(X_train_vec.shape) - 1)
    lsa = TruncatedSVD(n_components=n_components, algorithm='arpack')
    X_train_lsa = lsa.fit_transform(X_train_vec)

    # Cosine similarity between all pairs of documents
    similarity_matrix = cosine_similarity(X_train_lsa)

    # Encode the category names as integers
    le = LabelEncoder()
    labels = le.fit_transform(news_df['tags'].values)

    predicted_labels = _predict_by_neighbour_vote(similarity_matrix, labels, k, len(le.classes_))
    accuracy = (predicted_labels == labels).mean()
    print("Accuracy: ", accuracy * 100, "%")

    # User query
    if query is None:
        query = input("Enter your query: ")

    # In this notebook 'descr' is cleaned with preprocess() before it reaches
    # this function, so the query is cleaned the same way; otherwise inflected
    # words (e.g. "sports") never hit the lemmatized vocabulary.
    query_vector = tfidf_Vectorizer.transform([preprocess(query)])
    query_scores = cosine_similarity(query_vector, X_train_vec).ravel()

    # Best-scoring documents first; documents sharing no term with the query
    # score 0 and are not relevant, so they are left out.
    ranked_docs = np.argsort(-query_scores)[:k]
    top_k_docs_idx = [idx for idx in ranked_docs if query_scores[idx] > 0]

    if top_k_docs_idx:
        _print_expanded_queries(news_df, top_k_docs_idx, predicted_labels, le)
    else:
        print("No articles match the query.")

    _evaluate_nearest_centroid(similarity_matrix, labels, random_state)


In [None]:

# # Clean and preprocess the news data
# def preprocess_news(news_df):
#     # Remove null values
#     news_df = news_df.dropna()

#     # Map each tag to its base category
#     news_df['tags'] = news_df['tags'].apply(get_base_category)
#     print(news_df['tags'])
#     # Creating an object of TF-IDF vectorizer
#     tfidf_Vectorizer = TfidfVectorizer()

#     # Fit + Transform the description
#     X_train_vec = tfidf_Vectorizer.fit_transform(news_df['descr'])

#     # Apply dimensionality reduction using Latent Semantic Analysis (LSA)
#     lsa = TruncatedSVD(n_components=100, algorithm='arpack')
#     X_train_lsa = lsa.fit_transform(X_train_vec)

#     # Compute cosine similarity between all pairs of documents
#     similarity_matrix = cosine_similarity(X_train_lsa)


#     # Define a label encoder to convert categories to integers
#     le = LabelEncoder()

#     # Fit the label encoder on the unique categories in the dataset
#     le.fit(news_df['tags'].unique())

#     # Map each tag to its base category
#     news_df['tags'] = news_df['tags'].apply(get_base_category)

#     # Convert the categories to integers using the label encoder
#     labels = le.transform(news_df['tags'].values)
#     predicted_labels = []
#     for i in range(len(similarity_matrix)):
#         # Sort the similarity scores for the ith document in descending order and get the top k indices
#         k = 10
#         top_k_idx = similarity_matrix[i].argsort()[::-1][:k]
        # similarity_matrix[i].argsort()-> sorts the i'th row of similarity matrix
        # and then returns an array containing the indices of every element
        # (i.e. returns the indices of the sorted i'th row)
        # [::-1]->reverses the retrieved indexes(i.e. we get the indexes in descending order)
        # [:k]-> slicing and getting top k indices

        # Get the categories for the top k documents
#         top_k_categories = labels[top_k_idx]
#         # Find the most common category among the top k categories
#         predicted_category = np.bincount(top_k_categories, minlength=len(synonyms)).argmax()

#         predicted_labels.append(predicted_category)

#     # Calculating the accuracy of the model before Rocchio's feedback
#     accuracy = (predicted_labels == labels).mean()
#     # print(f"Accuracy: {accuracy}")
#     print("Accuracy: ",accuracy*100,"%")
#     print("------------------------------------------------------------------------------")


#     # Splitting the labels into training and testing sets
#     train_idx = np.random.choice(len(labels), int(len(labels)*0.8), replace=False)
#     test_idx = np.setdiff1d(range(len(labels)), train_idx)
#     X_train = similarity_matrix[train_idx, :]
#     y_train = labels[train_idx]
#     X_test = similarity_matrix[test_idx, :]
#     y_test = labels[test_idx]

#     # Fit a Nearest Centroid classifier to the training data
#     clfr = NearestCentroid()
#     clfr.fit(X_train, y_train)

#     # Test the classifier on new data
#     yPred = clfr.predict(X_test)

#     # Evaluating the performance of the classifier using y_test and y_pred
#     print(classification_report(y_test, yPred))


#     # Calculate the accuracy of the classifier
#     finalAccuracy = accuracy_score(y_test, yPred)
#     print("Accuracy: ",finalAccuracy*100,"%")



In [None]:
# Run the whole pipeline on the loaded frame; the call waits for a query typed on stdin
preprocess_news(news_df)

0         sports
1         sports
2         sports
3          other
4         sports
          ...   
2405    business
2406    politics
2407    business
2408    business
2409    politics
Name: tags, Length: 2410, dtype: object
Accuracy:  88.09128630705393 %
Enter your query: football
Expanded Queries:
sports : 9
Relevant News Articles:
legendary dutch bos michels dy legendary dutch coach rinus michels man credited developing total football died aged referred netherlands general michels led dutch world cup reached final lose germany however guided side european championship title win soviet union final michels played ajax coached side four national title european cup dutch team built around johan cruyff johan neeskens introduced concept total football world strategy foster team coherence individual imagination player possessing skill play part pitch cruyff onfield organiser team whose player rotated defence encouraged play creative attacking football michels recently undergone heart sur