In [430]:
import nltk
import pandas as pd
import numpy as np
import re
import string
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from bs4 import BeautifulSoup
from io import StringIO
from PyPDF2 import PdfReader
from docx import Document


In [431]:
# Make sure the required NLTK data packages are available locally.
# nltk.download reports packages that are already installed as up to date.
for nltk_package in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(nltk_package)

[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HP
[nltk_data]     EliteBook\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [432]:
# NLTK's built-in English stop-word list, printed in full for inspection.
stopwords_list = stopwords.words('english')
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [433]:
# Stop words used throughout the notebook: NLTK's English list plus a
# hand-picked set of filler words (all lower-case).
extra_stopwords = {
    "things", "that's", "something", "take", "don't", "may", "want", "you're",
    "set", "might", "says", "including", "lot", "much", "said", "know",
    "good", "step", "often", "going", "thing", "think",
    "back", "actually", "better", "look", "find", "right", "example",
    "verb", "verbs",
}
english_stopset = set(stopwords.words('english')) | extra_stopwords

In [434]:
def clean_text(text):
    """Normalise raw text into lower-case, ASCII-only, space-separated words.

    The steps run in this order:

    1. each run of non-ASCII characters becomes a single space;
    2. ``@`` followed by word characters (handles, and the ``@domain`` part
       of an e-mail address) is removed;
    3. the text is lower-cased;
    4. every character that is neither a word character nor whitespace is
       replaced by a space (underscores are word characters and survive);
    5. digits are removed;
    6. runs of two or more whitespace characters collapse to one space and
       the ends are stripped.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    without_handles = re.sub(r'@\w+', '', ascii_only)
    lowered = without_handles.lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    without_digits = re.sub(r'[0-9]', '', without_punctuation)
    collapsed = re.sub(r'\s{2,}', ' ', without_digits)
    return collapsed.strip()


In [435]:
def vectorize_text(docs):
    """Fit a TF-IDF vectorizer on ``docs`` and return the matrix with the model.

    The vectorizer is configured for word-level unigrams and bigrams
    (``ngram_range=(1, 2)``), document-frequency bounds of 0.002 and 0.99,
    at most 10000 features, lower-casing, and the notebook-wide
    ``english_stopset`` as the stop-word list.

    Parameters
    ----------
    docs : iterable of str
        The (pre-processed) documents to fit on.

    Returns
    -------
    tuple
        ``(X, vectorizer)``: the result of ``fit_transform(docs)`` and the
        fitted ``TfidfVectorizer``.
    """
    tfidf_options = {
        'analyzer': 'word',
        'ngram_range': (1, 2),
        'min_df': 0.002,
        'max_df': 0.99,
        'max_features': 10000,
        'lowercase': True,
        # TfidfVectorizer is given a list here, so the set is converted.
        'stop_words': list(english_stopset),
    }
    tfidf = TfidfVectorizer(**tfidf_options)
    tfidf_matrix = tfidf.fit_transform(docs)
    return tfidf_matrix, tfidf


In [436]:
# Load the spaCy "en_core_web_sm" English pipeline once at module level;
# lemmatize_text_spacy() uses this shared `nlp` object to tokenize and lemmatize.
nlp = spacy.load("en_core_web_sm")

In [437]:
def custom_lemmatize(word):
    """Apply hand-written lemma overrides to a single word.

    The lookup is case-insensitive. A word with an override is replaced by
    the override value; any other word is returned exactly as given
    (original casing included).

    Parameters
    ----------
    word : str
        The word (typically a spaCy lemma) to post-process.

    Returns
    -------
    str
        The overriding lemma, or ``word`` unchanged.
    """
    # Overrides are keyed by the lower-cased word form.
    overrides = {
        'ethiopia': 'ethiopia',
        'ethiopian': 'ethiopia',
    }
    lookup_key = word.lower()
    if lookup_key in overrides:
        return overrides[lookup_key]
    return word

def lemmatize_text_spacy(text):
    """Lemmatize ``text`` with the shared spaCy pipeline and drop stop words.

    Each token whose text is not a stop word is replaced by its spaCy lemma,
    passed through ``custom_lemmatize`` for the hand-written overrides, and
    the results are joined with single spaces.

    Parameters
    ----------
    text : str
        The text to lemmatize.

    Returns
    -------
    str
        The space-joined lemmas of the non-stop-word tokens.
    """
    doc = nlp(text)
    lemmas = [
        custom_lemmatize(token.lemma_)
        for token in doc
        # english_stopset holds lower-case entries only, so compare the
        # lower-cased token text; otherwise capitalised stop words ("The",
        # "And") in text that was not lower-cased beforehand slip through.
        if token.text.lower() not in english_stopset
    ]
    return " ".join(lemmas)


In [438]:
def get_similar_articles(q, df, vectorizer, k, query, docs):
    """Print the documents most similar to a query, ranked by cosine similarity.

    Parameters
    ----------
    q : str
        The pre-processed query text; it is wrapped in a one-element list and
        passed to ``vectorizer.transform``.
    df : pandas.DataFrame
        Term-document matrix with one row per vocabulary term and one column
        per document; column label ``i`` must belong to ``docs[i]``.
    vectorizer
        A fitted vectorizer whose ``transform([q]).toarray()`` yields a single
        row with ``df.shape[0]`` entries.
    k : int
        Maximum number of articles to print. At most 5 candidates are ever
        considered, so the effective limit is ``min(k, 5)``.
    query : str
        The raw query, echoed in the printed header.
    docs : sequence of str
        The original documents, printed for each match.

    Returns
    -------
    None
        Results are written to standard output only. Documents whose
        similarity is exactly zero are not listed.
    """
    top_results = 5

    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    sim = {}
    for i in range(len(docs)):
        doc_vec = df.loc[:, i].values
        # Cosine similarity divides by the PRODUCT of both norms. The
        # parentheses matter: `dot / norm_a * norm_q` would multiply by the
        # query norm instead of dividing by it.
        denominator = np.linalg.norm(doc_vec) * q_norm
        # An all-zero vector (e.g. a query with no known terms) has no
        # direction; treat it as "no similarity" instead of dividing by zero.
        sim[i] = float(np.dot(doc_vec, q_vec) / denominator) if denominator > 0 else 0.0

    ranked = sorted(sim.items(), key=lambda item: item[1], reverse=True)
    ranked = ranked[:min(len(sim), top_results)]

    print("Done Searching. Full Result: \n")
    print("searched items : ", query)
    print("Article with the Highest Cosine Similarity Values: ")
    print("----------------------------------------------------")

    shown = 0
    for i, score in ranked:    # Print the articles and their similarity values
        if score == 0.0:
            # Not a match: do not present it as a "most similar" article.
            continue
        print("Similaritas score: ", score)
        print(docs[i])
        print('\n')
        shown += 1
        if shown == k:
            break



In [439]:
# Sample corpus: six short English passages used to demonstrate the search.
# Note: a backslash at the end of a line inside a string literal continues the
# string, so the next line's leading spaces become part of the text.
docs = ['i loved you ethiopian, stored elements in Compress find Sparse Ethiopia is the greatest country in the world of nation at universe',

        'also, sometimes, the same words can have multiple different ‘lemma’s. So, based on the context it’s used, you should identify the \
        part-of-speech (POS) tag for the word in that specific context and extract the appropriate lemma. Examples of implementing this comes \
        in the following sections countries.ethiopia With a planned.The name that the Blue Nile river loved took in Ethiopia is derived from the \
        Geez word for great to imply its being the river of rivers The word Abay still exists in ethiopia major languages',

        'With more than  million people, ethiopia is the second most populous nation in Africa after Nigeria, and the fastest growing \
         economy in the region. However, it is also one of the poorest, with a per capita income',

        'The primary purpose of the dam ethiopia is electricity production to relieve Ethiopia’s acute energy shortage and for electricity export to neighboring\
         countries.ethiopia With a planned.',

        'The name that the Blue Nile river loved takes in Ethiopia "abay" is derived from the Geez blue loved word for great to imply its being the river of rivers The \
         word Abay still exists in Ethiopia major languages to refer to anything or anyone considered to be superior.',

        'Two non-upgraded loved turbine-generators with MW each are the first loveto go into operation with loved MW delivered to the national power grid. This early power\
         generation will start well before the completion']


In [440]:

# Pre-process the corpus: normalise each document, then lemmatize it and drop
# stop words.
cleaned_docs = [clean_text(doc) for doc in docs]
lemmatized_docs = [lemmatize_text_spacy(doc) for doc in cleaned_docs]

# Fit TF-IDF on the pre-processed corpus.
X, vectorizer = vectorize_text(lemmatized_docs)
k = 3  # maximum number of articles printed per query

# Transpose so that rows are vocabulary terms and columns are documents, the
# layout get_similar_articles() indexes with df.loc[:, i].
df = pd.DataFrame(X.T.toarray())

query1 = 'loved'
query2 = 'love'
# Queries go through the same clean -> lemmatize pipeline as the documents so
# that both sides are normalised identically (case, punctuation, digits).
q1 = lemmatize_text_spacy(clean_text(query1))
q2 = lemmatize_text_spacy(clean_text(query2))

get_similar_articles(q1, df, vectorizer, k, query1, docs)
get_similar_articles(q2, df, vectorizer, k, query2, docs)


Done Searching. Full Result: 

searched items :  loved
Article with the Highest Cosine Similarity Values: 
----------------------------------------------------
Similaritas score:  0.19797091359850721
Generator
Two non-upgraded loved turbine-generators with MW each are the first loveto go into operation with loved MW delivered to the national power grid. This early power         generation will start well before the completion


Similaritas score:  0.19507975074710232
Power Grid
The name that the Blue Nile river loved takes in Ethiopia "abay" is derived from the Geez blue loved word for great to imply its being the river of rivers The          word Abay still exists in Ethiopia major languages to refer to anything or anyone considered to be superior.


Similaritas score:  0.15545752122343945
Two upgraded
i loved you ethiopian, stored elements in Compress find Sparse Ethiopia is the greatest country in the world of nation at universe


Done Searching. Full Result: 

searched items :  lov