In [None]:
# A basic search engine built with TF-IDF (scikit-learn), using pandas for data manipulation.
# It prints the top 5 articles whose abstracts are most similar to a given query.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the article table. The 'AB' column is vectorized below and the 'TI'
# column is what search() prints for each hit.
data = pd.read_csv('articles.csv')

# Fit a TF-IDF model on the 'AB' column using scikit-learn's built-in English
# stop-word list. Missing values are replaced by '' first, so every row still
# gets a vector and row i of tfidf_matrix corresponds to data.iloc[i].
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['AB'].fillna(''))

# Function to search for queries
def search(query, tfidf_matrix, data, top_n=5, vectorizer=None):
    """Print the titles and scores of the articles most similar to ``query``.

    Args:
        query: Free-text search string.
        tfidf_matrix: TF-IDF matrix with one row per row of ``data``.
        data: DataFrame holding the article titles in its ``'TI'`` column.
        top_n: Maximum number of results to print (default 5, the value that
            used to be hard-coded). Values <= 0 print nothing.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
            Defaults to the module-level ``tfidf_vectorizer``; pass it
            explicitly when ``tfidf_matrix`` was built by a different
            vectorizer, otherwise the feature dimensions will not match.

    Returns:
        None. Results are printed, best match first.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer

    query_vector = vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vector, tfidf_matrix).flatten()

    # sorted() is stable, so documents with equal scores keep their row order.
    ranked = sorted(enumerate(cosine_similarities), key=lambda pair: pair[1], reverse=True)

    # Clamp at zero: a negative slice bound would otherwise print almost everything.
    for idx, score in ranked[:max(top_n, 0)]:
        print(f"Title: {data['TI'].iloc[idx]}, Score: {score}")


In [None]:
# Example query: prints the title and similarity score of the best-matching articles.
# NOTE: later cells rebind `tfidf_matrix` to a character-level matrix, so this call
# must run while `tfidf_matrix` is still the one fitted by `tfidf_vectorizer`.
search("IQ scores", tfidf_matrix, data)

In [None]:
# Search engine with spell checking: misspelled query words are corrected before searching

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords

# Load the dataset (change `dataset_path` if your CSV lives elsewhere)
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)


# Fetch the NLTK stop-word corpus so that stopwords.words() below can read it
nltk.download('stopwords')

# Use English stopwords from NLTK; kept as a set for fast membership tests
# in remove_stopwords()
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the words listed in the global ``stop_words``.

    The text is split on whitespace, each word is compared case-insensitively
    against ``stop_words``, and the surviving words are re-joined with single
    spaces (so the original spacing is not preserved).
    """
    kept_words = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Preprocess the abstracts: lower-case them and strip stop words.
# Missing abstracts are replaced by '' *before* the string conversion;
# astype(str) alone would turn NaN into the literal text 'nan', which would
# then be indexed and could be returned as an "answer sentence".
df['AB'] = df['AB'].fillna('').astype(str).str.lower()
df['AB'] = df['AB'].apply(remove_stopwords)


# Create a TF-IDF vectorizer over character n-grams (1 to 3 characters).
# Row i of tfidf_matrix corresponds to df.iloc[i].
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# SpellChecker initialization (used by correct_spelling)
spell = SpellChecker()

def correct_spelling(query):
    """Return ``query`` with each whitespace-separated token spell-corrected.

    Uses the module-level pyspellchecker instance ``spell``. Tokens for which
    the checker has no suggestion are kept unchanged: ``spell.correction`` can
    return ``None`` in that case (pyspellchecker >= 0.7), and joining a
    ``None`` would raise ``TypeError``.

    Args:
        query: Raw query string.

    Returns:
        The corrected tokens joined by single spaces ('' for an empty query).
    """
    corrected_tokens = []
    for token in query.split():
        suggestion = spell.correction(token)
        corrected_tokens.append(suggestion if suggestion else token)

    return ' '.join(corrected_tokens)

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that contains a query keyword.

    Args:
        query: Query string; it is lower-cased and split on whitespace.
        abstract: Text to search. Anything that is not a ``str`` yields None.

    Returns:
        The first matching sentence (as it appears in ``abstract``), or None
        when no sentence contains any keyword.

    Matching is a case-insensitive substring test, so short or very common
    query words (e.g. "is", "the") match almost any sentence.
    """
    if not isinstance(abstract, str):
        return None

    # Strip punctuation stuck to the ends of each token so that "cancer?"
    # is searched as "cancer". Tokens that end up empty are dropped because
    # '' is a substring of every sentence.
    stripped_tokens = (re.sub(r'^\W+|\W+$', '', token) for token in query.lower().split())
    query_tokens = [token for token in stripped_tokens if token]

    # Split after '.' or '?' followed by whitespace, skipping abbreviation-like
    # patterns such as "e.g." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        lowered_sentence = sentence.lower()
        if any(token in lowered_sentence for token in query_tokens):
            return sentence

    return None


def search_engine(query, df, tfidf_matrix, num_results=5):
    """Rank the articles in ``df`` against a spell-corrected ``query``.

    Relies on the module-level ``vectorizer``, which must be the fitted
    vectorizer that produced ``tfidf_matrix``.

    Args:
        query: Raw query string; it is spell-corrected before vectorizing.
        df: DataFrame with titles in ``'TI'`` and abstracts in ``'AB'``,
            row-aligned with ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the abstracts.
        num_results: Maximum number of articles to return. Values <= 0 give
            an empty list.

    Returns:
        A list of ``(title, score, answer_sentence)`` tuples, best match
        first. ``answer_sentence`` is None when no sentence of the abstract
        contains a query keyword.
    """
    # Correct misspellings in the query
    corrected_query = correct_spelling(query)

    # Vectorize the corrected query using the same TF-IDF vectorizer
    query_vector = vectorizer.transform([corrected_query])

    # Cosine similarity between the query and every abstract
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Guard explicitly: argsort()[-0:] is the whole array, so num_results=0
    # would otherwise return every article instead of none.
    if num_results <= 0:
        return []
    top_indices = similarities.argsort()[-num_results:][::-1]

    top_articles = []
    for i in top_indices:
        article = df.iloc[i]
        # Use the corrected query here too; otherwise a misspelled keyword
        # that was fixed for ranking could never match a sentence.
        answer_sentence = extract_answer_sentence(corrected_query, article['AB'])
        top_articles.append((article['TI'], similarities[i], answer_sentence))

    return top_articles

# Example usage: run one query and print, for each returned article, its
# title, similarity score and the sentence picked by extract_answer_sentence
# (printed as "None" when no sentence matched).
query = "What is the treatment for cancer?"
top_articles = search_engine(query, df, tfidf_matrix)
for title, score, answer_sentence in top_articles:
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Search engine with spell checking (misspelled query words are corrected) on top of a character-level TF-IDF vectorizer,
# optionally enriching the query by replacing its nouns with WordNet synonyms

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import spacy

# # Load spaCy English language model
# nlp = spacy.load("en_core_web_sm")

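# Download NLTK resources (uncomment and run once per environment)
# import nltk
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')  # tokenizer data used by word_tokenize
# NOTE: recent NLTK releases may ask for 'punkt_tab' / 'averaged_perceptron_tagger_eng' instead.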

# Load the dataset (change `dataset_path` if your CSV lives elsewhere)
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Preprocess the dataset: lower-case the abstracts.
# Missing abstracts are replaced by '' *before* the string conversion;
# astype(str) alone would turn NaN into the literal text 'nan', which would
# then be indexed and could be returned as an "answer sentence".
df['AB'] = df['AB'].fillna('').astype(str).str.lower()

# Create a TF-IDF vectorizer over character n-grams (1 to 3 characters).
# Row i of tfidf_matrix corresponds to df.iloc[i].
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# SpellChecker initialization (used by correct_spelling and replace_with_synonyms)
spell = SpellChecker()

# NLTK WordNet synonym extraction
def get_synonyms(word, pos=None):
    """Return the WordNet lemma names found for ``word``.

    Args:
        word: Word to look up. None or '' yields an empty list.
        pos: Optional two-letter tag prefix. 'NN' maps to WordNet nouns and
            'VB' to verbs; any other non-empty value falls back to nouns.
            When omitted, all parts of speech are searched.

    Returns:
        A list of unique lemma names in the order WordNet yields them.
        Multi-word lemmas keep WordNet's underscore form (e.g. 'lung_cancer').
        The list may contain ``word`` itself.
    """
    if not word:
        return []

    # Map POS tags to WordNet POS tags
    pos_mapping = {'NN': 'n', 'VB': 'v'}
    # Named wordnet_pos so it does not shadow nltk's imported pos_tag function.
    wordnet_pos = pos_mapping.get(pos, 'n') if pos else None

    # Collect into a list guarded by a set: a plain set has arbitrary
    # iteration order, which made callers' synonyms[:n] slices pick different
    # synonyms from one run to the next.
    synonyms = []
    seen = set()
    for syn in wordnet.synsets(word, pos=wordnet_pos):
        for lemma in syn.lemmas():
            name = lemma.name()
            if name not in seen:
                seen.add(name)
                synonyms.append(name)
    return synonyms

# Function to replace words in a sentence with their synonyms
def replace_with_synonyms(sentence, pos_tags, max_synonyms=1):
    """Replace each noun in ``sentence`` with up to ``max_synonyms`` synonyms.

    Args:
        sentence: Text to enrich; it is tokenized with ``word_tokenize``.
        pos_tags: ``(token, tag)`` pairs for the same sentence, aligned by
            position with the tokens. Tokens without a tag are left alone.
        max_synonyms: How many synonyms to substitute per noun. With a value
            <= 0 the sentence's words are left as they are.

    Returns:
        The tokens re-joined with single spaces.

    Only tokens whose tag starts with 'NN' are touched. Each one is
    spell-corrected with the module-level ``spell`` before the lookup.
    """
    tokens = word_tokenize(sentence)
    for i, word in enumerate(tokens):
        # Check if the word has a corresponding POS tag
        pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
        if not (pos_tag_word and pos_tag_word.startswith('NN')):
            continue

        # spell.correction can return None when it has no suggestion;
        # fall back to the original word rather than looking up None.
        corrected_word = spell.correction(word) or word
        synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])  # first two characters of the tag

        chosen = synonyms[:max_synonyms] if max_synonyms > 0 else []
        if chosen:
            # WordNet joins multi-word lemmas with '_'; the query is plain
            # text, so turn those back into spaces.
            tokens[i] = ' '.join(synonym.replace('_', ' ') for synonym in chosen)
    return ' '.join(tokens)

# # Function to replace words in a sentence with their synonyms
# def replace_with_synonyms(sentence, max_synonyms=1):
#     tokens = word_tokenize(sentence)
    
#     # Use spaCy for part-of-speech tagging
#     pos_tags = [(token.text, token.pos_) for token in nlp(sentence)]
    
#     for i in range(len(tokens)):
#         word = tokens[i]
#         # Check if the word has a corresponding POS tag
#         pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
#         # If the word has a specific POS tag (e.g., noun), get synonyms
#         if pos_tag_word in ['NOUN']:
#             corrected_word = spell.correction(word)
#             synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])
#             print(synonyms)
#             if synonyms:
#                 # Replace the word with up to max_synonyms synonyms
#                 tokens[i] = ' '.join(synonyms[:max_synonyms])
#     return ' '.join(tokens)

def correct_spelling(query):
    """Return ``query`` with each whitespace-separated token spell-corrected.

    Uses the module-level pyspellchecker instance ``spell``. Tokens for which
    the checker has no suggestion are kept unchanged: ``spell.correction`` can
    return ``None`` in that case (pyspellchecker >= 0.7), and joining a
    ``None`` would raise ``TypeError``.

    Args:
        query: Raw query string.

    Returns:
        The corrected tokens joined by single spaces ('' for an empty query).
    """
    corrected_tokens = []
    for token in query.split():
        suggestion = spell.correction(token)
        corrected_tokens.append(suggestion if suggestion else token)

    return ' '.join(corrected_tokens)

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that contains a query keyword.

    Args:
        query: Query string; it is lower-cased and split on whitespace.
        abstract: Text to search. Anything that is not a ``str`` yields None.

    Returns:
        The first matching sentence (as it appears in ``abstract``), or None
        when no sentence contains any keyword.

    Matching is a case-insensitive substring test, so short or very common
    query words (e.g. "is", "the") match almost any sentence.
    """
    if not isinstance(abstract, str):
        return None

    # Strip punctuation stuck to the ends of each token so that "cancer?"
    # is searched as "cancer". Tokens that end up empty are dropped because
    # '' is a substring of every sentence.
    stripped_tokens = (re.sub(r'^\W+|\W+$', '', token) for token in query.lower().split())
    query_tokens = [token for token in stripped_tokens if token]

    # Split after '.' or '?' followed by whitespace, skipping abbreviation-like
    # patterns such as "e.g." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        lowered_sentence = sentence.lower()
        if any(token in lowered_sentence for token in query_tokens):
            return sentence

    return None

def search_engine(query, df, tfidf_matrix, num_results=5, use_synonyms=False, max_synonyms=1):
    """Rank the articles in ``df`` against a spell-corrected, optionally enriched query.

    Relies on the module-level ``vectorizer``, which must be the fitted
    vectorizer that produced ``tfidf_matrix``.

    Args:
        query: Raw query string; it is spell-corrected before anything else.
        df: DataFrame with titles in ``'TI'`` and abstracts in ``'AB'``,
            row-aligned with ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the abstracts.
        num_results: Maximum number of articles to return. Values <= 0 give
            an empty list.
        use_synonyms: When True, nouns of the corrected query are replaced by
            WordNet synonyms before vectorizing.
        max_synonyms: Number of synonyms substituted per noun.

    Returns:
        A list of ``(title, score, answer_sentence)`` tuples, best match
        first. ``answer_sentence`` is None when no sentence of the abstract
        contains a query keyword.
    """
    # Correct misspellings in the query
    corrected_query = correct_spelling(query)

    # Optionally replace nouns with synonyms
    if use_synonyms:
        pos_tags = pos_tag(word_tokenize(corrected_query))
        enriched_query = replace_with_synonyms(corrected_query, pos_tags, max_synonyms=max_synonyms)
    else:
        enriched_query = corrected_query

    # Vectorize the enriched query using the same TF-IDF vectorizer
    query_vector = vectorizer.transform([enriched_query])

    # Cosine similarity between the query and every abstract
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Guard explicitly: argsort()[-0:] is the whole array, so num_results=0
    # would otherwise return every article instead of none.
    if num_results <= 0:
        return []
    top_indices = similarities.argsort()[-num_results:][::-1]

    top_articles = []
    for i in top_indices:
        article = df.iloc[i]
        # Look up the answer sentence with the corrected (not the raw) query,
        # so that a misspelled keyword fixed for ranking can still match.
        answer_sentence = extract_answer_sentence(corrected_query, article['AB'])
        top_articles.append((article['TI'], similarities[i], answer_sentence))

    return top_articles

# Example usage: run one query with synonym replacement switched on and print,
# for each returned article, its title, similarity score and answer sentence.
query = "What is the treatment for cancer?"
use_synonyms = True  # Set this flag to control whether nouns are replaced by synonyms
max_synonyms = 2  # Maximum number of synonyms substituted for each noun
top_articles = search_engine(query, df, tfidf_matrix, use_synonyms=use_synonyms, max_synonyms=max_synonyms)
for title, score, answer_sentence in top_articles:
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Run the same search three ways to compare spelling correction and synonym
# replacement. Each entry is (query text, extra keyword arguments for search_engine).
comparison_runs = [
    # without synonyms, query contains misspellings
    ("What is the treatent for lung caner?", {}),
    # without synonyms, correctly spelled query
    ("What is the treatment for lung cancer?", {}),
    # with synonyms.
    # NOTE(review): this case was labelled "using synonyms and misspelling", but
    # the query has no typos - confirm whether a misspelled query was intended.
    ("What is the treatment for lung cancer?", {"use_synonyms": True}),
]

for run_index, (query, extra_kwargs) in enumerate(comparison_runs):
    # A separator line goes between runs, not before the first one.
    if run_index > 0:
        print("="*100)
    top_articles = search_engine(query, df, tfidf_matrix, **extra_kwargs)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Search engine over a character-level TF-IDF vectorizer in which misspelled query words are corrected
# with an edit-distance approach (Levenshtein distance to the words of the corpus),
# optionally enriching the query by replacing its nouns with WordNet synonyms

import pandas as pd
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import Levenshtein as lev

# Load the dataset (change `dataset_path` if your CSV lives elsewhere)
dataset_path = 'articles.csv'
df = pd.read_csv(dataset_path)

# Preprocess the dataset: lower-case the abstracts.
# Missing abstracts are replaced by '' *before* the string conversion;
# astype(str) alone would turn NaN into the literal text 'nan', which would
# then be indexed and would also end up as a word of the spelling vocabulary.
df['AB'] = df['AB'].fillna('').astype(str).str.lower()

# Create a TF-IDF vectorizer over character n-grams (1 to 3 characters).
# Row i of tfidf_matrix corresponds to df.iloc[i].
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(df['AB'])

# NLTK WordNet synonym extraction
def get_synonyms(word, pos=None):
    """Return the WordNet lemma names found for ``word``.

    Args:
        word: Word to look up. None or '' yields an empty list.
        pos: Optional two-letter tag prefix. 'NN' maps to WordNet nouns and
            'VB' to verbs; any other non-empty value falls back to nouns.
            When omitted, all parts of speech are searched.

    Returns:
        A list of unique lemma names in the order WordNet yields them.
        Multi-word lemmas keep WordNet's underscore form (e.g. 'lung_cancer').
        The list may contain ``word`` itself.
    """
    if not word:
        return []

    pos_mapping = {'NN': 'n', 'VB': 'v'}
    # Named wordnet_pos so it does not shadow nltk's imported pos_tag function.
    wordnet_pos = pos_mapping.get(pos, 'n') if pos else None

    # Collect into a list guarded by a set: a plain set has arbitrary
    # iteration order, which made callers' synonyms[:n] slices pick different
    # synonyms from one run to the next.
    synonyms = []
    seen = set()
    for syn in wordnet.synsets(word, pos=wordnet_pos):
        for lemma in syn.lemmas():
            name = lemma.name()
            if name not in seen:
                seen.add(name)
                synonyms.append(name)
    return synonyms

# Function to replace words in a sentence with their synonyms
def replace_with_synonyms(sentence, pos_tags, max_synonyms=1):
    """Replace each noun in ``sentence`` with up to ``max_synonyms`` synonyms.

    Args:
        sentence: Text to enrich; it is tokenized with ``word_tokenize``.
        pos_tags: ``(token, tag)`` pairs for the same sentence, aligned by
            position with the tokens. Tokens without a tag are left alone.
        max_synonyms: How many synonyms to substitute per noun. With a value
            <= 0 the sentence's words are left as they are.

    Returns:
        The tokens re-joined with single spaces. The same string is also
        printed so the enriched query is visible in the notebook output.

    Only tokens whose tag starts with 'NN' are touched. Each one is
    spell-corrected with the module-level ``spell`` before the lookup.
    """
    tokens = word_tokenize(sentence)
    for i, word in enumerate(tokens):
        pos_tag_word = pos_tags[i][1] if i < len(pos_tags) else None
        if not (pos_tag_word and pos_tag_word.startswith('NN')):
            continue

        # spell.correction can return None when it has no suggestion;
        # fall back to the original word rather than looking up None.
        corrected_word = spell.correction(word) or word
        synonyms = get_synonyms(corrected_word, pos=pos_tag_word[:2])

        chosen = synonyms[:max_synonyms] if max_synonyms > 0 else []
        if chosen:
            # WordNet joins multi-word lemmas with '_'; the query is plain
            # text, so turn those back into spaces.
            tokens[i] = ' '.join(synonym.replace('_', ' ') for synonym in chosen)

    enriched_sentence = ' '.join(tokens)
    print(enriched_sentence)
    return enriched_sentence

def correct_spelling_edit_distance(query):
    """Return ``query`` with every whitespace-separated token passed through
    ``correct_with_edit_distance`` and the results joined by single spaces."""
    return ' '.join(correct_with_edit_distance(token) for token in query.split())

def correct_with_edit_distance(token):
    """Return the vocabulary word closest to ``token`` by Levenshtein distance.

    Reads the module-level ``vocabulary`` (a set of lower-cased words) and
    ``max_edit_distance``.

    Args:
        token: A single query word.

    Returns:
        The closest vocabulary word within ``max_edit_distance`` edits of the
        lower-cased token, or ``token`` unchanged when no word is that close.
        Among equally close words the alphabetically first one is chosen.
    """
    # The vocabulary is lower-cased, so compare in lower case; otherwise a
    # capital letter alone would count as one edit.
    lowered = token.lower()
    if lowered in vocabulary:
        return lowered

    # Compute each distance once and keep (distance, word) pairs. Taking the
    # min of the tuples breaks ties by the word itself instead of by set
    # iteration order, so the result is the same on every run.
    scored = ((lev.distance(lowered, word), word) for word in vocabulary)
    candidates = [pair for pair in scored if pair[0] <= max_edit_distance]

    # min() of an empty list raises ValueError; an unknown token with no
    # close neighbour is simply kept as typed.
    if not candidates:
        return token

    return min(candidates)[1]

def extract_answer_sentence(query, abstract):
    """Return the first sentence of ``abstract`` that contains a query keyword.

    Args:
        query: Query string; it is lower-cased and split on whitespace.
        abstract: Text to search. Anything that is not a ``str`` yields None.

    Returns:
        The first matching sentence (as it appears in ``abstract``), or None
        when no sentence contains any keyword.

    Matching is a case-insensitive substring test, so short or very common
    query words (e.g. "is", "the") match almost any sentence.
    """
    if not isinstance(abstract, str):
        return None

    # Strip punctuation stuck to the ends of each token so that "cancer?"
    # is searched as "cancer". Tokens that end up empty are dropped because
    # '' is a substring of every sentence.
    stripped_tokens = (re.sub(r'^\W+|\W+$', '', token) for token in query.lower().split())
    query_tokens = [token for token in stripped_tokens if token]

    # Split after '.' or '?' followed by whitespace, skipping abbreviation-like
    # patterns such as "e.g." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', abstract)

    for sentence in sentences:
        lowered_sentence = sentence.lower()
        if any(token in lowered_sentence for token in query_tokens):
            return sentence

    return None

def search_engine(query, df, tfidf_matrix, num_results=5, use_synonyms=False, max_synonyms=1):
    """Rank the articles in ``df`` against an edit-distance-corrected query.

    Relies on the module-level ``vectorizer``, which must be the fitted
    vectorizer that produced ``tfidf_matrix``.

    Args:
        query: Raw query string; each word is first mapped to its closest
            vocabulary word by ``correct_spelling_edit_distance``.
        df: DataFrame with titles in ``'TI'`` and abstracts in ``'AB'``,
            row-aligned with ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the abstracts.
        num_results: Maximum number of articles to return. Values <= 0 give
            an empty list.
        use_synonyms: When True, nouns of the corrected query are replaced by
            WordNet synonyms before vectorizing.
        max_synonyms: Number of synonyms substituted per noun.

    Returns:
        A list of ``(title, score, answer_sentence)`` tuples, best match
        first. ``answer_sentence`` is None when no sentence of the abstract
        contains a query keyword.
    """
    corrected_query = correct_spelling_edit_distance(query)

    if use_synonyms:
        pos_tags = pos_tag(word_tokenize(corrected_query))
        enriched_query = replace_with_synonyms(corrected_query, pos_tags, max_synonyms=max_synonyms)
    else:
        enriched_query = corrected_query

    query_vector = vectorizer.transform([enriched_query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Guard explicitly: argsort()[-0:] is the whole array, so num_results=0
    # would otherwise return every article instead of none.
    if num_results <= 0:
        return []
    top_indices = similarities.argsort()[-num_results:][::-1]

    top_articles = []
    for i in top_indices:
        article = df.iloc[i]
        # Look up the answer sentence with the corrected (not the raw) query,
        # so that a misspelled keyword fixed for ranking can still match.
        answer_sentence = extract_answer_sentence(corrected_query, article['AB'])
        top_articles.append((article['TI'], similarities[i], answer_sentence))

    return top_articles

# Configuration and example usage.
# NOTE: max_edit_distance, spell and vocabulary are module-level names read by
# correct_with_edit_distance() and replace_with_synonyms(), so these
# assignments must run before the first search_engine() call.
query = "What is the treatment for cancer?"
use_synonyms = True
max_synonyms = 2
max_edit_distance = 2  # Largest Levenshtein distance at which a vocabulary word still counts as a correction
spell = SpellChecker(distance=max_edit_distance)
# Every distinct whitespace-separated token of the abstracts. Because the text
# is only split on whitespace, punctuation attached to a word (e.g. a trailing
# '.') stays part of the vocabulary entry.
vocabulary = set(df['AB'].str.cat(sep=' ').lower().split())

top_articles = search_engine(query, df, tfidf_matrix, use_synonyms=use_synonyms, max_synonyms=max_synonyms)
for title, score, answer_sentence in top_articles:
    print(f"Title: {title}, Score: {score}")
    print(f"Answer Sentence: {answer_sentence}\n")


In [None]:
# Run the same search three ways to compare edit-distance spelling correction
# and synonym replacement. Each entry is (query text, extra keyword arguments
# for search_engine).
comparison_runs = [
    # without synonyms, query contains misspellings
    ("What is the treatent for lung caner?", {}),
    # without synonyms, correctly spelled query
    ("What is the treatment for lung cancer?", {}),
    # with synonyms.
    # NOTE(review): this case was labelled "using synonyms and misspelling", but
    # the query has no typos - confirm whether a misspelled query was intended.
    ("What is the treatment for lung cancer?", {"use_synonyms": True}),
]

for run_index, (query, extra_kwargs) in enumerate(comparison_runs):
    # A separator line goes between runs, not before the first one.
    if run_index > 0:
        print("="*100)
    top_articles = search_engine(query, df, tfidf_matrix, **extra_kwargs)
    for title, score, answer_sentence in top_articles:
        print(f"Title: {title}, Score: {score}")
        print(f"Answer Sentence: {answer_sentence}\n")