<a href="https://colab.research.google.com/github/Adharsh0001/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Required Packages for NLP (Natural Language Processing)

In [1]:
import nltk
nltk.download("punkt")                        # Punkt tokenizer models (stored under tokenizers/), needed by sent_tokenize / word_tokenize
nltk.download("wordnet")                      # WordNet data, needed by WordNetLemmatizer
nltk.download("stopwords")                    # stop-word corpus (stored under corpora/)
nltk.download("averaged_perceptron_tagger")   # tagger model (stored under taggers/), needed by nltk.pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Sentence Tokenizer

In [2]:
# Split a short paragraph into sentences and print each one followed by a blank line.
text = (
    "Backgammon is one of the oldest known board games. "
    "Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. "
    "It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
)
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    # end="\n\n" emits the sentence plus the separating empty line in one call.
    print(sentence, end="\n\n")

Backgammon is one of the oldest known board games.

Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.

It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.



Word Tokenizer

In [3]:
# Break every sentence into word/punctuation tokens and show one token list per sentence,
# each followed by a blank line.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words, end="\n\n")

['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games', '.']

['Its', 'history', 'can', 'be', 'traced', 'back', 'nearly', '5,000', 'years', 'to', 'archeological', 'discoveries', 'in', 'the', 'Middle', 'East', '.']

['It', 'is', 'a', 'two', 'player', 'game', 'where', 'each', 'player', 'has', 'fifteen', 'checkers', 'which', 'move', 'between', 'twenty-four', 'points', 'according', 'to', 'the', 'roll', 'of', 'two', 'dice', '.']



Stemmer and Lemmatizer

In [4]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """Print the stem and the lemma of ``word`` side by side.

    Args:
        stemmer: object exposing ``stem(word)``.
        lemmatizer: object exposing ``lemmatize(word, pos)``.
        word: the token to reduce.
        pos: part-of-speech value forwarded unchanged to ``lemmatizer.lemmatize``.

    Returns:
        None. Two labelled lines and a trailing blank line are written to stdout.
    """
    stemmed = stemmer.stem(word)
    print(f"Stemmer: {stemmed}")
    lemma = lemmatizer.lemmatize(word, pos)
    # The extra newline reproduces the blank separator line after each comparison.
    print(f"Lemmatizer: {lemma}", end="\n\n")

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# (word, part of speech) pairs to run through both reducers, in display order.
comparison_cases = (
    ("seen", wordnet.VERB),
    ("drove", wordnet.VERB),
    ("better", wordnet.ADJ),
    ("improvised", wordnet.VERB),
)
for case_word, case_pos in comparison_cases:
    compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word=case_word, pos=case_pos)

Stemmer: seen
Lemmatizer: see

Stemmer: drove
Lemmatizer: drive

Stemmer: better
Lemmatizer: good

Stemmer: improvis
Lemmatizer: improvise



Stop word removal

In [5]:
from nltk.corpus import stopwords
import os
# Load the English stop-word list through NLTK's corpus reader. No filesystem path is
# needed here: the reader locates the downloaded data itself, so the cell is not tied
# to Colab's /root/nltk_data layout. (`stopwords.fileids()` lists the other languages.)
stop_words = set(stopwords.words("english"))
print(stop_words)

{'is', "you're", 'while', 'weren', 'here', 'below', 'any', 'll', 'didn', 'themselves', 'can', 'yourselves', 'other', 'the', "should've", 'during', 'wouldn', 'with', 'as', 'own', 'ma', 'before', 'himself', 'of', 'theirs', 'from', 'few', 'all', "doesn't", "you'll", "needn't", 'there', 'so', 'won', 'under', 'no', 'had', 'an', 'him', 'does', 'only', 'yourself', "won't", 'your', 'do', 'did', 'very', "she's", 'and', 'aren', "shan't", 'o', 'each', 'mightn', 'because', 'if', 'by', "didn't", 'then', 'be', 'haven', "wasn't", 'these', 'after', 'most', 'hasn', 'her', 're', 'don', 'y', 'its', "mustn't", 'down', 'shouldn', 'once', 'against', 'them', 'now', "you've", 'ourselves', 'yours', 'were', 'to', "haven't", 'same', "hasn't", 'she', "couldn't", "you'd", 'why', 'both', 'in', 'being', 'me', 'out', 'shan', 'such', 'up', 'again', 'myself', 'been', 'or', 'm', 'further', "don't", 'having', 'but', 'should', "shouldn't", 'not', "that'll", 'over', 'a', 'on', 'ours', 'our', 'which', 'off', 't', 'mustn', '

In [6]:
# Remove English stop words from a tokenized sentence.
stop_words = set(stopwords.words("english"))
sentence = "Backgammon is one of the oldest known board games."
words = nltk.word_tokenize(sentence)
# The stop-word list is entirely lower-case, so compare on the lower-cased token;
# otherwise capitalised stop words (e.g. a sentence-initial "The") would be kept.
# The original token, with its original casing, is what goes into the result.
without_stop_words = [word for word in words if word.lower() not in stop_words]
print(without_stop_words)

['Backgammon', 'one', 'oldest', 'known', 'board', 'games', '.']


Punctuation Removal

In [7]:
from nltk.tokenize import RegexpTokenizer
# Keep only runs of word characters, which drops punctuation (and splits on hyphens).
# The pattern is a raw string: "\w" is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape-sequence warning on current Python versions.
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize("Eight-seven miles to go, yet. Onward")

['Eight', 'seven', 'miles', 'to', 'go', 'yet', 'Onward']

Part of Speech Tagging

In [8]:
# Part-of-speech tagging: tokenize the sentence, then tag every token.
sentence = "Backgammon is one of the oldest known board games."
words = nltk.word_tokenize(sentence)
# pos_tag yields one (token, tag) pair per input token, in order.
tagged = nltk.pos_tag(words)
print(tagged)

[('Backgammon', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('oldest', 'JJS'), ('known', 'VBN'), ('board', 'NN'), ('games', 'NNS'), ('.', '.')]


Bag of Words

In [9]:
# Toy corpus shared by the Bag of Words and TF-IDF cells: one string per document.
documents = [
    "good movie good screenplay",
    "good movie",
    "bad movie sleep",
]

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Bag of words over single terms.
# binary = False keeps occurrence counts rather than 0/1 presence flags
# (e.g. "good" is counted twice in the first document).
count_vectorizer = CountVectorizer(binary = False)
# Learn the vocabulary from `documents` and build the document-term matrix in one step.
bag_of_words = count_vectorizer.fit_transform(documents)
feature_names = count_vectorizer.get_feature_names_out()
# Show the matrix as a table: one row per document, one column per vocabulary term.
pd.DataFrame(bag_of_words.toarray(),columns = feature_names)

Unnamed: 0,bad,good,movie,screenplay,sleep
0,0,2,1,1,0
1,0,1,1,0,0
2,1,0,1,0,1


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Bag of words over word n-grams instead of single terms.
# ngram_range = (2,3) makes the features 2-word and 3-word sequences
# (e.g. "good movie", "movie good screenplay"); binary = False keeps counts.
count_vectorizer = CountVectorizer(ngram_range = (2,3) ,binary = False)
# Learn the n-gram vocabulary from `documents` and build the document-term matrix.
bag_of_words = count_vectorizer.fit_transform(documents)
feature_names = count_vectorizer.get_feature_names_out()
# Show the matrix as a table: one row per document, one column per n-gram.
pd.DataFrame(bag_of_words.toarray(),columns = feature_names)

Unnamed: 0,bad movie,bad movie sleep,good movie,good movie good,good screenplay,movie good,movie good screenplay,movie sleep
0,0,0,1,1,1,1,1,0
1,0,0,1,0,0,0,0,0
2,1,1,0,0,0,0,0,1


TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# TF-IDF weighting of the same toy corpus, using the vectorizer's default settings.
tfidf = TfidfVectorizer()
# Learn the vocabulary from `documents` and compute the weighted document-term matrix.
values = tfidf.fit_transform(documents)
feature_names = tfidf.get_feature_names_out()
# Show the weights as a table: one row per document, one column per term.
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,bad,good,movie,screenplay,sleep
0,0.0,0.794803,0.308618,0.522535,0.0
1,0.0,0.789807,0.613356,0.0,0.0
2,0.652491,0.0,0.385372,0.0,0.652491
