<a href="https://colab.research.google.com/github/Chinni-Akanksha01/NLP/blob/main/NLP_ASSIGNMENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Install the spellchecker library
!pip install pyspellchecker

from spellchecker import SpellChecker

def correct_query(query):
    spell = SpellChecker()

    # Add custom words to prioritize certain corrections
    spell.word_frequency.load_words(['weather', 'tomorrow', 'new york'])

    words = query.split()

    # Correct each word if it's misspelled
    corrected_words = []
    for word in words:
        if word.lower() == "wether":  # Explicit rule for this common mistake
            corrected_words.append("weather")
        else:
            corrected_words.append(spell.correction(word) if word in spell.unknown(words) else word)

    # Join the corrected words and ensure proper capitalization
    corrected_query = ' '.join(corrected_words).capitalize()
    return corrected_query

query = "wether tommorrow in new yrok"
corrected_query = correct_query(query)
print(corrected_query)
# Output: "Weather tomorrow in new york"


Weather tomorrow in new york


In [8]:
import re

# Dictionary of known words for segmentation
DICTIONARY = set([
    "best", "day", "ever", "get", "started", "now", "example", "com",
    "deterministic", "url", "hashtag", "segmentation"
])

def segment_text(text, dictionary):
    """
    Segments text deterministically using a dictionary of words.
    """
    text = text.lower()  # Case normalization
    segments = []
    while text:
        match_found = False
        for i in range(len(text), 0, -1):
            prefix = text[:i]
            if prefix in dictionary:
                segments.append(prefix)
                text = text[i:]
                match_found = True
                break
        if not match_found:  # If no match, consider the first character as a word
            segments.append(text[0])
            text = text[1:]
    return segments

def split_hashtag_or_url(input_string):
    """
    Splits hashtags or URLs deterministically.
    """
    # Remove special characters for hashtags
    if input_string.startswith('#'):
        input_string = input_string[1:]

    # For URLs, strip the protocol
    if input_string.startswith(('http://', 'https://')):
        input_string = re.sub(r'^https?://', '', input_string)

    # Split based on non-alphanumeric characters
    parts = re.split(r'[\W_]', input_string)
    segments = []
    for part in parts:
        if part:  # Skip empty parts
            segments.extend(segment_text(part, DICTIONARY))
    return segments

# Examples
hashtag = "#BestDayEver"
url = "http://example.com/GetStartedNow"

print("Hashtag Segmentation:", split_hashtag_or_url(hashtag))
print("URL Segmentation:", split_hashtag_or_url(url))


Hashtag Segmentation: ['best', 'day', 'ever']
URL Segmentation: ['example', 'com', 'get', 'started', 'now']


In [12]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

# Ensure necessary NLTK data is downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

def get_sense(word, context):
    """
    Disambiguates the sense of a word based on its context using WordNet.
    """
    # Tokenize the context
    context_tokens = set(word_tokenize(context.lower()))

    # Get all synsets (senses) for the word
    synsets = wn.synsets(word)

    if not synsets:
        return None  # No senses found in WordNet

    best_sense = None
    max_overlap = 0

    for synset in synsets:
        # Get the definition, examples, and related terms for the synset
        definition = synset.definition()
        examples = synset.examples()
        related_terms = set()

        # Include synonyms and hypernyms for better matching
        related_terms.update(lemma.name() for lemma in synset.lemmas())
        related_terms.update(hypernym.name().split('.')[0] for hypernym in synset.hypernyms())

        # Combine definition, examples, and related terms
        sense_words = set(word_tokenize(definition.lower()))
        for example in examples:
            sense_words.update(word_tokenize(example.lower()))
        sense_words.update(related_terms)

        # Calculate overlap between sense words and context tokens
        overlap = len(sense_words & context_tokens)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset

    return best_sense

# Example usage
word = "mouse"
context1 = "The mouse ran across the field to avoid the cat."
context2 = "I bought a new wireless mouse for my computer."

sense1 = get_sense(word, context1)
sense2 = get_sense(word, context2)

print("Context 1 Sense:", sense1)
print("Definition:", sense1.definition() if sense1 else "No definition found")
print()
print("Context 2 Sense:", sense2)
print("Definition:", sense2.definition() if sense2 else "No definition found")


Context 1 Sense: Synset('shiner.n.01')
Definition: a swollen bruise caused by a blow to the eye

Context 2 Sense: Synset('mouse.n.04')
Definition: a hand-operated electronic device that controls the coordinates of a cursor on your computer screen as you move it around on a pad; on the bottom of the device is a ball that rolls on the surface of the pad


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
!pip install langdetect langid

# Import libraries
from langdetect import detect, detect_langs
import langid

# Example text samples
text1 = "The quick brown fox jumps over the lazy dog."  # English
text2 = "El zorro marrón rápido salta sobre el perro perezoso."  # Spanish
text3 = "快速的棕色狐狸跳过了懒狗。"  # Chinese

# Using Langdetect
print("Langdetect:")
print(f"Text 1: {text1} -> {detect(text1)}")
print(f"Text 2: {text2} -> {detect(text2)}")
print(f"Text 3: {text3} -> {detect(text3)}")

# Using Langid
print("\nLangid:")
print(f"Text 1: {text1} -> {langid.classify(text1)[0]}")
print(f"Text 2: {text2} -> {langid.classify(text2)[0]}")
print(f"Text 3: {text3} -> {langid.classify(text3)[0]}")


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect, langid
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=edd646d0a496fc3bd767db9527acf1079cd2675ee00d4b886a3c1fed9f584483
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-an

In [14]:
import re

# Contraction patterns and possessive fixes
contraction_patterns = {
    "dont": "don't",
    "cant": "can't",
    "isnt": "isn't",
    "wont": "won't",
    "didnt": "didn't",
    "hasnt": "hasn't",
    "havent": "haven't",
    "doesnt": "doesn't",
    "arent": "aren't",
    "shouldnt": "shouldn't",
    "wouldnt": "wouldn't",
    "couldnt": "couldn't",
    "im": "I'm",
    "youre": "you're",
    "its": "it's",  # Special handling for 'its' vs. 'it's'
}

def fix_contractions(text):
    """
    Fixes common contractions by adding missing apostrophes.
    """
    words = text.split()
    corrected_words = []

    for word in words:
        word_lower = word.lower()
        if word_lower in contraction_patterns:
            corrected_words.append(contraction_patterns[word_lower])
        else:
            corrected_words.append(word)

    return " ".join(corrected_words)

def fix_possessives(text):
    """
    Fix possessive forms by adding apostrophes where needed.
    """
    # Simple possessive fixes, e.g., dogs -> dog's
    text = re.sub(r"(\b\w+)(s)\b", r"\1's", text)
    return text

def correct_apostrophes(text):
    """
    Correct both contractions and possessive forms in the text.
    """
    text = fix_contractions(text)
    text = fix_possessives(text)
    return text

# Example usage
text = "The dogs didnt know what to do with its ball. She cant believe its happening."
corrected_text = correct_apostrophes(text)
print("Original Text:", text)
print("Corrected Text:", corrected_text)


Original Text: The dogs didnt know what to do with its ball. She cant believe its happening.
Corrected Text: The dog's didn't know what to do with it's ball. She can't believe it's happening.


In [15]:
!pip install wordninja
import wordninja

# Function to segment hashtags
def segment_hashtag(hashtag):
    # Remove the '#' symbol before segmentation
    hashtag_text = hashtag.lstrip('#')

    # Use wordninja to split the hashtag into words
    segmented = wordninja.split(hashtag_text)

    # Rebuild the hashtag with spaces between words
    return ' '.join(segmented)

# Example usage
hashtags = ['#MachineLearningIsAwesome', '#DeepLearningRocks', '#AIRevolution']

segmented_hashtags = [segment_hashtag(tag) for tag in hashtags]

for original, segmented in zip(hashtags, segmented_hashtags):
    print(f"Original: {original} -> Segmented: {segmented}")


Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/541.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/541.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.6/541.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541530 sha256=7f41dca4eab33e295bf2a5910b5b89d4529a2e8690985abc3baa2e2be2f5affa
  Stored in directory: /root/.cache/pip/wheels/e6/66/9c/712044a983337f5d44f90abcd244bd4b8ad28ee64750404b50
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0
Original: #

In [16]:
import re

# Define a dictionary of acronyms
acronym_dict = {
    'AI': 'Artificial Intelligence',
    'ML': 'Machine Learning',
    'NLP': 'Natural Language Processing',
    'CNN': 'Convolutional Neural Network',
    'RNN': 'Recurrent Neural Network',
    'API': 'Application Programming Interface',
    'IoT': 'Internet of Things',
    'SQL': 'Structured Query Language',
    'GPU': 'Graphics Processing Unit',
    'CPU': 'Central Processing Unit',
    'VPN': 'Virtual Private Network',
    'USB': 'Universal Serial Bus',
    'HTTP': 'HyperText Transfer Protocol',
    'HTTPS': 'HyperText Transfer Protocol Secure',
    'JSON': 'JavaScript Object Notation',
    'XML': 'Extensible Markup Language'
}

# Function to expand acronyms with regex
def expand_acronyms_regex(text):
    def replace(match):
        acronym = match.group(0).upper()  # Get matched acronym
        return acronym_dict.get(acronym, acronym)  # Replace with expanded form

    # Regex to find all uppercase acronyms of length 2 or more
    pattern = r'\b[A-Z]{2,}\b'
    return re.sub(pattern, replace, text)

# Example usage
input_text = "I am learning AI and ML. The API for this NLP model is simple."
expanded_text = expand_acronyms_regex(input_text)

print("Original Text:", input_text)
print("Expanded Text:", expanded_text)


Original Text: I am learning AI and ML. The API for this NLP model is simple.
Expanded Text: I am learning Artificial Intelligence and Machine Learning. The Application Programming Interface for this Natural Language Processing model is simple.


In [22]:
# Install the spellchecker library
!pip install pyspellchecker

from spellchecker import SpellChecker

def correct_query(query):
    spell = SpellChecker()

    # Add custom words to prioritize certain corrections
    spell.word_frequency.load_words(['weather', 'tomorrow', 'new york'])

    words = query.split()

    # Correct each word if it's misspelled
    corrected_words = []
    for word in words:
        if word.lower() == "wether":  # Explicit rule for this common mistake
            corrected_words.append("weather")
        else:
            corrected_words.append(spell.correction(word) if word in spell.unknown(words) else word)

    # Join the corrected words and ensure proper capitalization
    corrected_query = ' '.join(corrected_words).capitalize()
    return corrected_query

query = "wether tommorrow in new yrok"
corrected_query = correct_query(query)
print(corrected_query)
# Output: "Weather tomorrow in new york"


Weather tomorrow in new york


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Initialize the stemmer
stemmer = PorterStemmer()

# Example text
text = "Hello! I'm learning Natural Language Processing (NLP), it's amazing. #AI #MachineLearning"

# Function to clean and process the text
def process_text(text):
    # Remove special characters and numbers (keeping only words and spaces)
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize the text into words
    tokens = word_tokenize(cleaned_text)

    # Convert all tokens to lowercase
    tokens = [word.lower() for word in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stem the words
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return stemmed_tokens

# Process the text
processed_text = process_text(text)

# Print the results
print("Original Text:", text)
print("Processed Text:", processed_text)


Original Text: Hello! I'm learning Natural Language Processing (NLP), it's amazing. #AI #MachineLearning
Processed Text: ['hello', 'im', 'learn', 'natur', 'languag', 'process', 'nlp', 'amaz', 'ai', 'machinelearn']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
import re

def resolve_pronouns(text, entities):
    # Extract all pronouns and their positions (modified to work with ** for pronouns)
    pronoun_pattern = r'\*\*(\w+)\*\*'  # Looking for **pronoun**
    pronouns = [(match.group(1), match.start()) for match in re.finditer(pronoun_pattern, text)]

    # Clean the text by removing ** markers
    clean_text = re.sub(r'\*\*(\w+)\*\*', r'\1', text)

    # Initialize a list to store the resolved entities
    resolved = []

    # For each pronoun, find the corresponding entity
    for pronoun, pos in pronouns:
        closest_entity = None
        closest_distance = float('inf')

        # Iterate through all entities to find the best match for the pronoun
        for entity in entities:
            entity_pos = clean_text.rfind(entity, 0, pos)  # Find the last occurrence of the entity before the pronoun
            if entity_pos != -1:
                distance = pos - (entity_pos + len(entity))
                if distance < closest_distance:
                    closest_distance = distance
                    closest_entity = entity

        # Append the resolved entity to the list
        resolved.append(closest_entity)

    return resolved

def main():
    # Hardcoded input (replace with your own values)
    text_snippet = "**he** went to the store. **he** bought some milk. **it** was fresh. **he** drank it."
    entities = ["John", "store", "milk"]

    # Resolve pronouns
    result = resolve_pronouns(text_snippet, entities)

    # Output the resolved entities
    for entity in result:
        print(entity)

if __name__ == "__main__":
    main()


None
store
milk
milk
