In [64]:
!pip install wikipedia-api nltk spacy scikit-learn

zsh:1: command not found: pip


In [65]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [66]:
# Helper that fetches the text of a Wikipedia page through the wikipedia-api client
import wikipediaapi

def get_wiki(pageName):
    """Fetch the text of an English Wikipedia page.

    Args:
        pageName: Title of the Wikipedia page to look up.

    Returns:
        The page text as a string, or None when the page does not exist
        (in that case "Page Not Found" is printed). Callers must check for
        None before using the result.
    """
    wiki = wikipediaapi.Wikipedia(
        language="en",
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        # Identifies this client to Wikipedia; the original value carried a
        # stray leading apostrophe.
        user_agent="nlpProject (castr385@umn.edu)",
    )

    page = wiki.page(pageName)
    if not page.exists():
        print("Page Not Found")
        return None
    return page.text
        
#Test the function / api
# title = "Albert Einstein"
# content = get_wiki(title)
# print(content)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download NLTK resources.
# "punkt_tab" is the tokenizer data that newer NLTK releases (the same ones that
# use "averaged_perceptron_tagger_eng") load for sent_tokenize/word_tokenize;
# "punkt" is kept for older releases.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger_eng')

# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The first prefix in the table that the tag starts with decides the result;
    any tag that matches none of them is treated as a noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),   # Adjective
        ('V', wordnet.VERB),  # Verb
        ('N', wordnet.NOUN),  # Noun
        ('R', wordnet.ADV),   # Adverb
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

# Main text processing function
def process_text(text):
    """Turn raw text into per-sentence lists of lemmatized content words.

    Steps: sentence segmentation, word tokenization, POS tagging, removal of
    stop words and non-alphabetic tokens, then WordNet lemmatization.

    POS tagging runs on the complete tokenized sentence, before any tokens are
    dropped, so the tagger sees each word in its original context. The tags
    are then used to lemmatize only the tokens that survive the filter.

    Args:
        text: Raw input text.

    Returns:
        A list with one entry per sentence; each entry is the list of lemmas
        of that sentence's alphabetic, non-stop-word tokens (possibly empty).
    """
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    lemmatized_tokens = []
    for sentence in sent_tokenize(text):
        # Tag first, filter afterwards: tagging a sentence that has already
        # lost its stop words and punctuation gives the tagger no context.
        pos_tagged = nltk.pos_tag(word_tokenize(sentence))
        lemmatized_sentence = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in pos_tagged
            if word.isalpha() and word.lower() not in stop_words
        ]
        lemmatized_tokens.append(lemmatized_sentence)

    return lemmatized_tokens

# Test the process_text function
# sample_text = "Albert Einstein was a physicist. He developed the theory of relativity."
# processed = process_text(sample_text)
# print(processed)

[['Albert', 'Einstein', 'physicist'], ['develop', 'theory', 'relativity']]


[nltk_data] Downloading package punkt to /Users/castro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/castro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/castro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/castro/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [68]:
#Dependency Parsing to analyze grammatical relationships between words
import spacy

nlp = spacy.load("en_core_web_sm")

def parse_dependencies(text1):
    """Print each token of text1 with its dependency label and head token.

    The text is parsed with the module-level spaCy pipeline `nlp`; one line
    per token is written in the form "token -> relation -> head".
    """
    parsed = nlp(text1)
    print("Dependency Parsing:")
    for token in parsed:
        relation_line = f"{token.text} -> {token.dep_} -> {token.head.text}"
        print(relation_line)
# text = "Albert Einstein developed the theory of relativity."
# parse_dependencies(text)

In [69]:
# Calculate the TF-IDF cosine similarity between two documents
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def similarity_calculator(doc1, doc2):
    """Return the cosine similarity of the TF-IDF vectors of two documents.

    A new TfidfVectorizer is fitted on just the two given strings, so the
    vocabulary and IDF weights come from this pair alone.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([doc1, doc2])
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]
    pairwise = cosine_similarity(first_row, second_row)
    # The result is a 1x1 matrix; unwrap its single entry.
    return pairwise[0][0]

# doc1 = "Albert Einstein was a physicist who developed the theory of relativity."
# doc2 = "Isaac Newton was a mathematician who developed the laws of motion."
# similarity = similarity_calculator(doc1, doc2)
# print(f"Similarity Score: {similarity:.2f}")

In [None]:
def main():
    """Interactively compare two Wikipedia pages.

    Prompts for two page titles, fetches both pages, prints a short excerpt
    and its dependency parse for each, and prints the TF-IDF cosine similarity
    of the preprocessed texts. If either page cannot be found, a message is
    printed and the function returns without computing a score. A ValueError
    raised during processing is printed instead of propagated.
    """
    # Number of leading characters of each page that are shown and parsed.
    preview_chars = 50

    print("Welcome to the Wikipedia Text Similarity Tool!")

    # Input Wikipedia Titles
    title1 = input("Enter the title of the first Wikipedia page: ")
    title2 = input("Enter the title of the second Wikipedia page: ")

    try:
        # Fetch Wikipedia Content
        text1 = get_wiki(title1)
        text2 = get_wiki(title2)

        # get_wiki returns None for a missing page; slicing None below would
        # raise a TypeError that the ValueError handler does not catch.
        missing_titles = [
            title for title, text in ((title1, text1), (title2, text2)) if text is None
        ]
        if missing_titles:
            print(f"Cannot compare pages; not found: {', '.join(missing_titles)}")
            return

        # Print the first few characters of each text as a sanity check
        print(f"\nFirst {preview_chars} characters of Text 1 ({title1}):\n{text1[:preview_chars]}")
        print(f"\nFirst {preview_chars} characters of Text 2 ({title2}):\n{text2[:preview_chars]}")

        # Preprocess the Texts
        processed_text1 = process_text(text1)
        processed_text2 = process_text(text2)

        # Flatten Tokenized Sentences for Similarity Calculation
        flat_text1 = " ".join(" ".join(sentence) for sentence in processed_text1)
        flat_text2 = " ".join(" ".join(sentence) for sentence in processed_text2)

        # Dependency Parsing on the raw-text excerpt (Optional)
        print("\nDependency Parsing for Text 1:")
        parse_dependencies(text1[:preview_chars])

        print("\nDependency Parsing for Text 2:")
        parse_dependencies(text2[:preview_chars])

        # Calculate Similarity
        similarity_score = similarity_calculator(flat_text1, flat_text2)
        print(f"\nSimilarity Score between the two pages: {similarity_score:.2f}")

    except ValueError as e:
        print(e)

if __name__ == "__main__":
    main()

