In [1]:
#Tokenization....................
#  Step 1: Import spaCy
import spacy

# Step 2: Load the language model
nlp = spacy.load("en_core_web_sm")

# Step 3: Define the text to be tokenized
text = "SpaCy is a popular library for NLP tasks. It is fast and efficient."

# Step 4: Process the text using the NLP model
doc = nlp(text)

# Step 5: Tokenize and print tokens
print("Tokens:")
for token in doc:
    print(token)


Tokens:
SpaCy
is
a
popular
library
for
NLP
tasks
.
It
is
fast
and
efficient
.


In [9]:
#lemmatization..................
# Step 1: Import spaCy
import spacy

# Step 2: Load the language model
nlp = spacy.load("en_core_web_sm")

# Step 3: Define the text
text = "The children are playing with their toys. They played all day yesterday."

# Step 4: Process the text with the NLP model
doc = nlp(text)

# Step 5: Perform lemmatization
print("Lemmatized Tokens:")
for token in doc:
    print(f"Original: {token.text}, Lemma: {token.lemma_}")


Lemmatized Tokens:
Original: The, Lemma: the
Original: children, Lemma: child
Original: are, Lemma: be
Original: playing, Lemma: play
Original: with, Lemma: with
Original: their, Lemma: their
Original: toys, Lemma: toy
Original: ., Lemma: .
Original: They, Lemma: they
Original: played, Lemma: play
Original: all, Lemma: all
Original: day, Lemma: day
Original: yesterday, Lemma: yesterday
Original: ., Lemma: .


In [3]:
#Stop word Removal
# Step 1: Import spaCy
import spacy

# Step 2: Load the language model
nlp = spacy.load("en_core_web_sm")

# Step 3: Define the text
text = "This is a simple example demonstrating stop word removal using spaCy."

# Step 4: Process the text
doc = nlp(text)

# Step 5: Filter out stop words
filtered_tokens = [token.text for token in doc if not token.is_stop]

# Step 6: Display the result
print("Original Text:", text)
print("Text after Stop Word Removal:", " ".join(filtered_tokens))


Original Text: This is a simple example demonstrating stop word removal using spaCy.
Text after Stop Word Removal: simple example demonstrating stop word removal spaCy .


In [23]:
#Assignment 2............Assignment to implement Bag of Words and TFIDF using Gensim library.
# Importing required libraries
from gensim.corpora.dictionary import Dictionary
from gensim.models import TfidfModel


# Sample corpus (list of documents)
corpus = [
    "Natural Language Processing is fascinating",
    "I love learning about NLP",
    "The Bag of Bag Words model is a fundamental concept in NLP",
    "Gensim makes NLP easier and more efficient",
]

# Preprocessing: Tokenization
# Convert each document into a list of words
tokenized_corpus = [doc.lower().split() for doc in corpus]

# Step 1: Create a dictionary
dictionary = Dictionary(tokenized_corpus)

# Step 2: Convert the documents into a bag-of-words format
# (list of tuples (token_id, token_count) for each document)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_corpus]

# Display the dictionary and the BoW representation
print("Dictionary:")
print(dictionary.token2id)
print("\nBag-of-Words Corpus:")
for i, bow in enumerate(bow_corpus):
    print(f"Document {i+1}: {bow}")

# Step 3: Compute the term frequency-inverse document frequency (TF-IDF) scores (optional)
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = [tfidf[doc] for doc in bow_corpus]

# Display the TF-IDF representation
print("\nTF-IDF Corpus:")
for i, tfidf_doc in enumerate(tfidf_corpus):
    print(f"Document {i+1}: {tfidf_doc}")

# Additional Information:
# The `dictionary` maps words to their integer IDs.
# The `bow_corpus` represents each document as a sparse vector of (word_id, frequency).
# The `tfidf_corpus` shows TF-IDF weights for words in each document.


Dictionary:
{'fascinating': 0, 'is': 1, 'language': 2, 'natural': 3, 'processing': 4, 'about': 5, 'i': 6, 'learning': 7, 'love': 8, 'nlp': 9, 'a': 10, 'bag': 11, 'concept': 12, 'fundamental': 13, 'in': 14, 'model': 15, 'of': 16, 'the': 17, 'words': 18, 'and': 19, 'easier': 20, 'efficient': 21, 'gensim': 22, 'makes': 23, 'more': 24}

Bag-of-Words Corpus:
Document 1: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
Document 2: [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
Document 3: [(1, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]
Document 4: [(9, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]

TF-IDF Corpus:
Document 1: [(0, 0.48507125007266594), (1, 0.24253562503633297), (2, 0.48507125007266594), (3, 0.48507125007266594), (4, 0.48507125007266594)]
Document 2: [(5, 0.49733003742454074), (6, 0.49733003742454074), (7, 0.49733003742454074), (8, 0.49733003742454074), (9, 0.10320530752446756)]
Document 3: [(1, 0.1426067007488231), (9, 0.0591871

In [5]:
#Assignment 3.................... Name Entity Recognition in python with spacy
# Step 1: Install spaCy if you haven't already
# pip install spacy

# Step 2: Download the English language model
# python -m spacy download en_core_web_sm

import spacy

# Load the pre-trained spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Textual data to analyze
text = """
Albert Einstein was a theoretical physicist who developed the theory of relativity. 
He was born in Ulm, Germany, on March 14, 1879. Einstein won the Nobel Prize in Physics in 1921. 
Google was founded in September 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University.
"""

# Process the text with spaCy
doc = nlp(text)

# Print named entities, their labels, and positions
print("Named Entities, Phrases, and Concepts:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")




Named Entities, Phrases, and Concepts:
Albert Einstein (PERSON)
Ulm (GPE)
Germany (GPE)
March 14, 1879 (DATE)
the Nobel Prize in Physics (WORK_OF_ART)
1921 (DATE)
Google (ORG)
September 1998 (DATE)
Larry Page (PERSON)
Sergey Brin (PERSON)
Ph.D. (WORK_OF_ART)
Stanford University (ORG)


In [40]:
from nltk.util import ngrams
from nltk import ngrams

# Input text
sentence = 'Earth is the third planet from the Sun in our solar system and the only known celestial body to support life. With a diverse range of ecosystems, it is home to a vast array of plant and animal species, including humans.'

# Unigram model
n = 1
unigrams = ngrams(sentence.split(), n)
print(f"\n***********   UNIGRAM    ************************")
for item in unigrams:
    print(item)

# Bigram model
n = 2
bigrams = ngrams(sentence.split(), n)
print(f"\n***********   BIGRAM    ************************")
for item in bigrams:
    print(item)

# Trigram model
n = 3
trigrams = ngrams(sentence.split(), n)
print(f"\n***********   TRIGRAM    ************************")
for item in trigrams:
    print(item)



***********   UNIGRAM    ************************
('Earth',)
('is',)
('the',)
('third',)
('planet',)
('from',)
('the',)
('Sun',)
('in',)
('our',)
('solar',)
('system',)
('and',)
('the',)
('only',)
('known',)
('celestial',)
('body',)
('to',)
('support',)
('life.',)
('With',)
('a',)
('diverse',)
('range',)
('of',)
('ecosystems,',)
('it',)
('is',)
('home',)
('to',)
('a',)
('vast',)
('array',)
('of',)
('plant',)
('and',)
('animal',)
('species,',)
('including',)
('humans.',)

***********   BIGRAM    ************************
('Earth', 'is')
('is', 'the')
('the', 'third')
('third', 'planet')
('planet', 'from')
('from', 'the')
('the', 'Sun')
('Sun', 'in')
('in', 'our')
('our', 'solar')
('solar', 'system')
('system', 'and')
('and', 'the')
('the', 'only')
('only', 'known')
('known', 'celestial')
('celestial', 'body')
('body', 'to')
('to', 'support')
('support', 'life.')
('life.', 'With')
('With', 'a')
('a', 'diverse')
('diverse', 'range')
('range', 'of')
('of', 'ecosystems,')
('ecosystems,', 'i

In [41]:
#Assignment 5.............Implement regular expression function to find URL, IP address, Date,
#PAN number in textual data using python libraries
import re

def find_urls(text):
    url_pattern = r"https?://(?:www\.)?\S+|www\.\S+"
    return re.findall(url_pattern, text)

def find_ip_addresses(text):
    ip_pattern = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b"
    return re.findall(ip_pattern, text)

def find_dates(text):
    date_pattern = r"\b(?:\d{1,2}[-/]){2}\d{2,4}\b|\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b"
    return re.findall(date_pattern, text)

def find_pan_numbers(text):
    pan_pattern = r"\b[A-Z]{5}[0-9]{4}[A-Z]\b"
    return re.findall(pan_pattern, text)

# Example usage
if __name__ == "__main__":
    sample_text = """Here are some examples:
    - URL: https://www.example.com and http://testsite.com
    - IP: 192.168.0.1 and 8.8.8.8
    - Dates: 12/09/2023, 2023-09-12, and 31-12-2023
    - PAN: ABCDE1234F and XYWZP9876L
    """

    urls = find_urls(sample_text)
    ip_addresses = find_ip_addresses(sample_text)
    dates = find_dates(sample_text)
    pan_numbers = find_pan_numbers(sample_text)

    print("URLs:", urls)
    print("IP Addresses:", ip_addresses)
    print("Dates:", dates)
    print("PAN Numbers:", pan_numbers)


URLs: ['https://www.example.com', 'http://testsite.com']
IP Addresses: ['192.168.0.1', '8.8.8.8']
Dates: ['12/09/2023', '2023-09-12', '31-12-2023']
PAN Numbers: ['ABCDE1234F', 'XYWZP9876L']


In [43]:
#Assignment 6..................
###  Assignment No 6 ###
#Name : Shital Rahane
#Batch : B3
#Roll No : 48
"""Assignment Title : : Implement and visualize Dependency Parsing of Textual Input
using Stan- ford CoreNLP and Spacy library"""


import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

multiline_text = """
I prefer the morning flight through denver
"""

multiline_doc = nlp(multiline_text)

for token in multiline_doc:
    print(
        f"""
TOKEN: {token.text}
=====
{token.tag_ = }
{token.head.text = }
{token.dep_ = }"""
    )

displacy.serve(multiline_doc, style="dep")






TOKEN: 

=====
token.tag_ = '_SP'
token.head.text = 'I'
token.dep_ = 'dep'

TOKEN: I
=====
token.tag_ = 'PRP'
token.head.text = 'prefer'
token.dep_ = 'nsubj'

TOKEN: prefer
=====
token.tag_ = 'VBP'
token.head.text = 'prefer'
token.dep_ = 'ROOT'

TOKEN: the
=====
token.tag_ = 'DT'
token.head.text = 'flight'
token.dep_ = 'det'

TOKEN: morning
=====
token.tag_ = 'NN'
token.head.text = 'flight'
token.dep_ = 'compound'

TOKEN: flight
=====
token.tag_ = 'NN'
token.head.text = 'prefer'
token.dep_ = 'dobj'

TOKEN: through
=====
token.tag_ = 'IN'
token.head.text = 'flight'
token.dep_ = 'prep'

TOKEN: denver
=====
token.tag_ = 'NN'
token.head.text = 'through'
token.dep_ = 'pobj'

TOKEN: 

=====
token.tag_ = '_SP'
token.head.text = 'denver'
token.dep_ = 'dep'



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
