In [1]:
# Extract Sample document and apply following document preprocessing methods:
# Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize, sent_tokenize
import pandas as pd
import sklearn as sk
import math 


[nltk_data] Downloading package punkt to /home/TE/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
sent = "GeeksforGeeks is a great learning platform . It is one of the best for Computer Science students. play playing played communication "
words = (word_tokenize(sent))
sentences = (sent_tokenize(sent))

In [5]:
print(words)
print(sentences)

['GeeksforGeeks', 'is', 'a', 'great', 'learning', 'platform', '.', 'It', 'is', 'one', 'of', 'the', 'best', 'for', 'Computer', 'Science', 'students', '.', 'play', 'playing', 'played', 'communication']
['GeeksforGeeks is a great learning platform .', 'It is one of the best for Computer Science students.', 'play playing played communication']


In [6]:
from nltk.stem import PorterStemmer

In [7]:
ps = PorterStemmer()
stem = []
for i in words:
    stem_word = ps.stem(i);
    stem.append(stem_word);
    
print(stem)
    

['geeksforgeek', 'is', 'a', 'great', 'learn', 'platform', '.', 'it', 'is', 'one', 'of', 'the', 'best', 'for', 'comput', 'scienc', 'student', '.', 'play', 'play', 'play', 'commun']


In [8]:

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lem = []
for i in words:
    lem_word = lemmatizer.lemmatize(i,'v');
    lem.append(lem_word);
    
print(lem)


['GeeksforGeeks', 'be', 'a', 'great', 'learn', 'platform', '.', 'It', 'be', 'one', 'of', 'the', 'best', 'for', 'Computer', 'Science', 'students', '.', 'play', 'play', 'play', 'communication']


In [9]:
from nltk import pos_tag


In [10]:
from nltk.corpus import stopwords


In [11]:
tags = pos_tag(words)

In [12]:
print(tags)
sw_nltk = stopwords.words('english')

[('GeeksforGeeks', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('great', 'JJ'), ('learning', 'JJ'), ('platform', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('for', 'IN'), ('Computer', 'NNP'), ('Science', 'NNP'), ('students', 'NNS'), ('.', '.'), ('play', 'VB'), ('playing', 'VBG'), ('played', 'JJ'), ('communication', 'NN')]


In [13]:
sw_nltk = stopwords.words('english')
print(sw_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:

wordAbs = [word for word in words if word.lower() not in sw_nltk]
new_text = " ".join(wordAbs)
print(new_text)

GeeksforGeeks great learning platform . one best Computer Science students . play playing played communication


In [15]:
# Create representation of document by calculating Term Frequency and Inverse Document
# Frequency.

In [16]:
first_sentence = "Data Science is the sexiest job of the 21st century"
second_sentence = "machine learning is the key for data science"
#split so each word have their own string
first_sentence = first_sentence.split(" ")
second_sentence = second_sentence.split(" ")#join them to remove common duplicate words
total= set(first_sentence).union(set(second_sentence))
print(total)

{'Science', 'is', 'data', 'century', 'job', 'the', 'Data', 'of', '21st', 'sexiest', 'machine', 'key', 'science', 'learning', 'for'}


In [17]:
wordDictA = dict.fromkeys(total, 0) 
wordDictB = dict.fromkeys(total, 0)
for word in first_sentence:
    wordDictA[word]+=1
    
for word in second_sentence:
    wordDictB[word]+=1

In [18]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,Science,is,data,century,job,the,Data,of,21st,sexiest,machine,key,science,learning,for
0,1,1,0,1,1,2,1,1,1,1,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,1,1,1,1,1


In [19]:
def computeTF(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)
#running our sentences through the tf function:
tfFirst = computeTF(wordDictA, first_sentence)
tfSecond = computeTF(wordDictB, second_sentence)
#Converting to dataframe for visualization
tf = pd.DataFrame([tfFirst, tfSecond])

In [20]:
tf

Unnamed: 0,Science,is,data,century,job,the,Data,of,21st,sexiest,machine,key,science,learning,for
0,0.1,0.1,0.0,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.0,0.0,0.0,0.0,0.0
1,0.0,0.125,0.125,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.125,0.125,0.125,0.125


In [21]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1))
        
    return(idfDict)
#inputing our sentences in the log file
idfs = computeIDF([wordDictA, wordDictB])

In [22]:
idfs

{'Science': 0.3010299956639812,
 'is': 0.3010299956639812,
 'data': 0.3010299956639812,
 'century': 0.3010299956639812,
 'job': 0.3010299956639812,
 'the': 0.3010299956639812,
 'Data': 0.3010299956639812,
 'of': 0.3010299956639812,
 '21st': 0.3010299956639812,
 'sexiest': 0.3010299956639812,
 'machine': 0.3010299956639812,
 'key': 0.3010299956639812,
 'science': 0.3010299956639812,
 'learning': 0.3010299956639812,
 'for': 0.3010299956639812}