In [8]:

import spacy
import nltk
import re

## Question 1

In [11]:
# URL handling: run NLTK's word_tokenize on a sentence that ends with a URL
# and display the resulting token list.
from nltk import word_tokenize

url_sentence = "I love spending time at https://www.xy123z.com/"
token = word_tokenize(url_sentence)
token

['I', 'love', 'spending', 'time', 'at', 'https', ':', '//www.xy123z.com/']

In [22]:
# Tokenize the same URL sentence with spaCy and print one token per line,
# to compare against the word_tokenize result.
nlp = spacy.load("en_core_web_sm")
text = "I love spending time at https://www.xy123z.com/"
doc = nlp(text)
print("\n".join(tok.text for tok in doc))

I
love
spending
time
at
https://www.xy123z.com/


In [13]:
# Regex alternative: collect every run of non-whitespace characters that
# starts with "http://" or "https://".
url_pattern = re.compile(r'https?://\S+')
urls = url_pattern.findall(text)
print("URLs:", urls)

URLs: ['https://www.xy123z.com/']


In [5]:
# Identifying email IDs: run word_tokenize on a sentence containing an
# address and display the token list.
email_sentence = "My email ID is xyz111@gmail.com"
token = word_tokenize(email_sentence)
token

['My', 'email', 'ID', 'is', 'xyz111', '@', 'gmail.com']

In [16]:
# Tokenize the email sentence with spaCy and print one token per line,
# to compare against the word_tokenize result.
text = "My email ID is xyz111@gmail.com"
doc = nlp(text)
for tok in doc:
    print(tok.text)

My
email
ID
is
xyz111@gmail.com


In [17]:
# Regex alternative for emails: local part, "@", domain, then a dot and a
# top-level domain of at least two letters.
email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
emails = email_pattern.findall(text)
print("Emails:", emails)

Emails: ['xyz111@gmail.com']


In [18]:
# Hashtag extraction: start by checking how word_tokenize treats a
# sentence that begins with a hashtag.
text = "#Sushant is trending now in the world."
token = word_tokenize(text)
token

['#', 'Sushant', 'is', 'trending', 'now', 'in', 'the', 'world', '.']

In [19]:

# Pull out every "#" followed by one or more word characters.
hashtag_pattern = re.compile(r"#\w+")
hashtags = hashtag_pattern.findall(text)
hashtags

['#Sushant']

In [10]:
# Mentions in text: run word_tokenize on a sentence that starts with an
# @mention and display the token list.
mention_sentence = "@Ajit, please help me"
token = word_tokenize(mention_sentence)
token

['@', 'Ajit', ',', 'please', 'help', 'me']

In [11]:
# Tokenize the mention sentence with spaCy and print one token per line.
text = "@Ajit, please help me"
doc = nlp(text)
print("\n".join(tok.text for tok in doc))

@Ajit
,
please
help
me


In [12]:
# Extract @mentions with a regex. The negative lookbehind rejects an "@" that
# directly follows a character allowed in an email local part, so an address
# such as xyz111@gmail.com is not reported as the mention "@gmail".
mention_pattern = r"(?<![\w.%+-])@\w+"
mentions = re.findall(mention_pattern, text)
mentions

['@Ajit']

In [13]:
# Extracting numbers: tokenize a sentence that starts with a number and
# print one token per line.
text = "8853147 sq. km of area washed away in floods."
doc = nlp(text)
for tok in doc:
    print(tok.text)

8853147
sq
.
km
of
area
washed
away
in
floods
.


In [14]:
# Keep the text of every token that spaCy flags as number-like.
numbers = [tok.text for tok in doc if tok.like_num]
numbers

['8853147']

In [15]:
# Punctuation: tokenize a sentence containing "#", parentheses and full
# stops, then print one token per line.
text = "Corona vius killed #24506 people. #Corona is un(tolerbable)."
doc = nlp(text)
print("\n".join(tok.text for tok in doc))

Corona
vius
killed
#
24506
people
.
#
Corona
is
un(tolerbable
)
.


In [16]:
# Keep the text of every token that spaCy flags as punctuation.
punctuations = [tok.text for tok in doc if tok.is_punct]
punctuations

['#', '.', '#', ')', '.']

In [17]:
# PAN numbers: run word_tokenize on a sentence holding one well-formed and
# one malformed PAN, and display the token list.
pan_sentence = "Valid PAN: ABCED3193P, Invalid: lEcGD012eg"
token = word_tokenize(pan_sentence)
token

['Valid', 'PAN', ':', 'ABCED3193P', ',', 'Invalid', ':', 'lEcGD012eg']

In [18]:
text="Valid PAN: ABCED3193P, Invalid: lEcGD012eg"
# PAN layout checked here: five uppercase letters, four digits, one uppercase
# letter. The \b anchors make the pattern match whole tokens only, so a
# PAN-shaped substring buried inside a longer alphanumeric string is rejected.
pan_pattern=r"\b[A-Z]{5}[0-9]{4}[A-Z]\b"
valid_pans=re.findall(pan_pattern,text)
valid_pans

['ABCED3193P']

In [19]:
# Repetitive characters: run word_tokenize on a sentence with stretched
# words and display the token list.
stretched_sentence = "heyy this is a verrrry loong texttt"
token = word_tokenize(stretched_sentence)
token

['heyy', 'this', 'is', 'a', 'verrrry', 'loong', 'texttt']

In [20]:

text="heyy this is a verrrry loong texttt"
# Collapse any run of the same character down to a single occurrence.
# Digits are excluded so that numbers (e.g. 9990001796) are not corrupted;
# newlines are excluded to match the behaviour of "." in the earlier pattern.
# NOTE: legitimate double letters (as in "book") are still collapsed.
repeat_pattern=r'([^\d\n])\1+'
cleaned_text=re.sub(repeat_pattern,r'\1',text)
print(cleaned_text)

hey this is a very long text


In [21]:
# Indian mobile numbers: tokenize a sentence that starts with a 10-digit
# number and print one token per line.
text = "9990001796 is a phone number of the PMO office."
doc = nlp(text)
for tok in doc:
    print(tok.text)

9990001796
is
a
phone
number
of
the
PMO
office
.


In [22]:
# A token counts as an Indian mobile number only when it is exactly ten ASCII
# digits and the first digit is 6-9. A bare length check on number-like tokens
# would also accept ten-character values such as "12345.6789".
mobile_nums=[token.text for token in doc if re.fullmatch(r"[6-9][0-9]{9}", token.text)]
mobile_nums

['9990001796']

In [23]:
# Capital words: tokenize a sentence with several capitalised words and
# print one token per line.
text = "Ajit Doval is the National Security Advisor in India."
doc = nlp(text)
print("\n".join(tok.text for tok in doc))

Ajit
Doval
is
the
National
Security
Advisor
in
India
.


In [24]:
# Keep the text of every token that spaCy flags as title-cased.
capital_words = [tok.text for tok in doc if tok.is_title]
capital_words

['Ajit', 'Doval', 'National', 'Security', 'Advisor', 'India']

## Question 2

In [26]:
def calculate_edit_distance(str1,str2):
    """Return the edit distance between two strings and one edit script.

    Insertions, deletions and substitutions each cost 1. The distance is
    computed with a full dynamic-programming table, then the table is walked
    backwards from the bottom-right corner to recover a sequence of edits
    that turns ``str1`` into ``str2``.

    Args:
        str1: Source string.
        str2: Target string.

    Returns:
        A tuple ``(edit_distance, operations)``: the distance as an int and a
        list of human-readable operation strings ordered from the start of
        the strings to the end.
    """
    rows, cols = len(str1), len(str2)

    # table[r][c] holds the cost of turning str1[:r] into str2[:c].
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for r in range(rows + 1):
        table[r][0] = r  # empty target: delete every source character
    for c in range(cols + 1):
        table[0][c] = c  # empty source: insert every target character

    for r, src_ch in enumerate(str1, start=1):
        for c, dst_ch in enumerate(str2, start=1):
            if src_ch == dst_ch:
                table[r][c] = table[r - 1][c - 1]
            else:
                table[r][c] = 1 + min(
                    table[r][c - 1],      # insert
                    table[r - 1][c],      # delete
                    table[r - 1][c - 1],  # substitute
                )

    # Walk back from the corner. On ties the preference order is
    # substitute, then delete, then insert.
    steps = []
    r, c = rows, cols
    while r > 0 and c > 0:
        src_ch, dst_ch = str1[r - 1], str2[c - 1]
        cost = table[r][c]
        if src_ch == dst_ch:
            r, c = r - 1, c - 1
        elif cost == table[r - 1][c - 1] + 1:
            steps.append(f"Substitute '{src_ch}' with '{dst_ch}'")
            r, c = r - 1, c - 1
        elif cost == table[r - 1][c] + 1:
            steps.append(f"Delete '{src_ch}'")
            r -= 1
        elif cost == table[r][c - 1] + 1:
            steps.append(f"Insert '{dst_ch}'")
            c -= 1

    # One string is exhausted: the rest of the other is pure deletes/inserts.
    steps.extend(f"Delete '{str1[k - 1]}'" for k in range(r, 0, -1))
    steps.extend(f"Insert '{str2[k - 1]}'" for k in range(c, 0, -1))

    # Steps were collected from the end of the strings; flip to reading order.
    steps.reverse()

    return table[rows][cols], steps

In [27]:
# Word pairs used to exercise calculate_edit_distance.
test_cases = [
    ("kitten", "sitting"),
    ("flaw", "lawn"),
    ("distance", "editing"),
]
separator = "-" * 60

# Table header, then one section per pair: a summary row followed by the
# individual edit operations, each on its own indented line.
print(f"{'String 1':<10} | {'String 2':<10} | {'Dist':<4} | {'Operations'}")
print(separator)

for source, target in test_cases:
    distance, edits = calculate_edit_distance(source, target)
    print(f"{source:<10} | {target:<10} | {distance:<4} |")
    for edit in edits:
        print(f"{'':<28} + {edit}")
    print(separator)

String 1   | String 2   | Dist | Operations
------------------------------------------------------------
kitten     | sitting    | 3    |
                             + Substitute 'k' with 's'
                             + Substitute 'e' with 'i'
                             + Insert 'g'
------------------------------------------------------------
flaw       | lawn       | 2    |
                             + Delete 'f'
                             + Insert 'n'
------------------------------------------------------------
distance   | editing    | 5    |
                             + Insert 'e'
                             + Delete 's'
                             + Substitute 'a' with 'i'
                             + Delete 'c'
                             + Substitute 'e' with 'g'
------------------------------------------------------------
