# Tokenization

In [31]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

In [32]:
text='Real madrid is set to win the UCL for the season . Benzema might win Balon dor . Salah might be the runner up'

In [33]:
sent_tkn = sent_tokenize(text)
word_tkn = word_tokenize(text)

print("Sentence Tokenization: ", sent_tkn)
print("Word Tokenization: ",word_tkn)

Sentence Tokenization:  ['Real madrid is set to win the UCL for the season .', 'Benzema might win Balon dor .', 'Salah might be the runner up']
Word Tokenization:  ['Real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'UCL', 'for', 'the', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'be', 'the', 'runner', 'up']


# POS(Part of Speech) Tagging

In [34]:
from nltk import pos_tag

In [35]:
print("Tokenized words with tags: ")
pos_tag(word_tkn)

Tokenized words with tags: 


[('Real', 'JJ'),
 ('madrid', 'NN'),
 ('is', 'VBZ'),
 ('set', 'VBN'),
 ('to', 'TO'),
 ('win', 'VB'),
 ('the', 'DT'),
 ('UCL', 'NNP'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('season', 'NN'),
 ('.', '.'),
 ('Benzema', 'NNP'),
 ('might', 'MD'),
 ('win', 'VB'),
 ('Balon', 'NNP'),
 ('dor', 'NN'),
 ('.', '.'),
 ('Salah', 'NNP'),
 ('might', 'MD'),
 ('be', 'VB'),
 ('the', 'DT'),
 ('runner', 'NN'),
 ('up', 'RP')]

# Stopwords

In [36]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhishek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhishek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [37]:
stop_words = set(stopwords.words('english'))
print("Stop Words: ",stop_words)

Stop Words:  {'isn', 'me', 'hasn', 'hers', 'whom', 'this', 'out', 'haven', 'both', 'my', "shouldn't", 'weren', 'ma', 'have', 'between', "don't", 'll', 'so', 'yourselves', 'yours', 'he', 'i', 'needn', 'these', 'been', 'at', 'she', "she's", 'here', 'from', 'we', 'doing', 'our', 'a', 'did', "you'll", 'didn', 'can', 'having', "mightn't", 'the', 'doesn', 'what', 'if', 'had', 'are', 'is', 'each', 'aren', 'wasn', 'has', 'shouldn', "shan't", 'once', 'm', 'myself', 'as', 've', 'was', 'through', 'after', 'to', 'all', 'how', 'ain', 'were', 'same', "needn't", 'below', 'her', 'themselves', 'himself', 'ours', 'ourselves', 'only', 'don', 'too', 'not', 'or', 'further', 'while', 'again', 'over', 'by', "haven't", 'his', 'against', 's', 'will', 'which', 'you', "aren't", "weren't", 'couldn', 'won', 'it', 'then', 'such', "couldn't", 'than', 'itself', 'for', 'there', 'mustn', 'shan', 'of', 'with', 'd', "wasn't", 'about', 'its', 'hadn', 'their', 'who', "that'll", 'herself', 'most', 'nor', 'because', 'why', "

In [38]:
# Removal of stopwords

filtered_tokens = []
for token in word_tkn:
    if token.lower() not in stop_words:
        filtered_tokens.append(token)

print(filtered_tokens)

['Real', 'madrid', 'set', 'win', 'UCL', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'runner']


# Stemming with NLTK

In [39]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhishek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
words = ["better", "Playing", "Studies", "Gaming", "Played", "the", "Teaching", "Play", "Run", "Running", "Coding", "Laughing"]

stemmed_word = []

ps = PorterStemmer()

for word in words:
    stemmed_word.append(ps.stem(word))

print("Stemmed Words: ", stemmed_word)

Stemmed Words:  ['better', 'play', 'studi', 'game', 'play', 'the', 'teach', 'play', 'run', 'run', 'code', 'laugh']


# Lemmatization with NLTK

In [58]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
 
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
print("risks:", lemmatizer.lemmatize("risks"))

rocks : rock
corpora : corpus
risks: risk


In [78]:
words = ["corpora", "risks", "studies", "bottles", "rocks", "the", "teachers"]

lemmatized_word = []

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in words:
    lemmatized_word.append(lemmatizer.lemmatize(word))

print("Lemmatized Words: ", lemmatized_word)

Lemmatized Words:  ['corpus', 'risk', 'study', 'bottle', 'rock', 'the', 'teacher']


# TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = ["My name is saish and Abhishek", "why My name is saish", "agaye maut ka tamasha dekhne"]

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Compute TF-IDF values
tfidf_matrix = vectorizer.fit_transform(documents)
print(tfidf_matrix)

# Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()
print("\n", feature_names, "\n\n") 

# Display TF-IDF values for each document
for i in range(len(feature_names)):
  print("{} : {} ".format(feature_names[i],tfidf_matrix[0,i]))


  (0, 0)	0.4814821314936913
  (0, 2)	0.4814821314936913
  (0, 9)	0.3661795714211074
  (0, 4)	0.3661795714211074
  (0, 8)	0.3661795714211074
  (0, 7)	0.3661795714211074
  (1, 11)	0.5493512310263033
  (1, 9)	0.41779577097245885
  (1, 4)	0.41779577097245885
  (1, 8)	0.41779577097245885
  (1, 7)	0.41779577097245885
  (2, 3)	0.4472135954999579
  (2, 10)	0.4472135954999579
  (2, 5)	0.4472135954999579
  (2, 6)	0.4472135954999579
  (2, 1)	0.4472135954999579

 ['abhishek' 'agaye' 'and' 'dekhne' 'is' 'ka' 'maut' 'my' 'name' 'saish'
 'tamasha' 'why'] 


abhishek : 0.4814821314936913 
agaye : 0.0 
and : 0.4814821314936913 
dekhne : 0.0 
is : 0.3661795714211074 
ka : 0.0 
maut : 0.0 
my : 0.3661795714211074 
name : 0.3661795714211074 
saish : 0.3661795714211074 
tamasha : 0.0 
why : 0.0 
