# Generate the output (changes or transformations in the data) manually when the following tasks are applied on the input text. Show your output in details.

## Installing the Library

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

## Using five documents

In [2]:
documentA = 'the man went out for a walk'
documentB = 'the children sat around the fire'
documentC = 'the family went for a picnic in the park'
documentD = 'the children were running in the park'
documentE = 'the elder people sat around the fire'

## Extracting features from the texts in a bucket

In [3]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')
bagOfWordsC = documentC.split(' ')
bagOfWordsD = documentD.split(' ')
bagOfWordsE = documentE.split(' ')

## Removing any duplicate words.

In [5]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB)).union(set(bagOfWordsC)).union(set(bagOfWordsD)).union(set(bagOfWordsE))

##  Creating a dictionary of words

In [6]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1
numOfWordsC = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsC:
    numOfWordsC[word] += 1
numOfWordsD = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsD:
    numOfWordsD[word] += 1
numOfWordsE = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsE:
    numOfWordsE[word] += 1

## Removing useless words using the stopwords module
## Term Frequency (TF)

- The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

In [7]:
from nltk.corpus import stopwords
stopwords.words('english')
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [8]:
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)
tfC = computeTF(numOfWordsC, bagOfWordsC)
tfD = computeTF(numOfWordsD, bagOfWordsD)
tfE = computeTF(numOfWordsE, bagOfWordsE)

In [9]:
print(tfA)

{'running': 0.0, 'man': 0.14285714285714285, 'park': 0.0, 'children': 0.0, 'fire': 0.0, 'in': 0.0, 'the': 0.14285714285714285, 'elder': 0.0, 'family': 0.0, 'went': 0.14285714285714285, 'were': 0.0, 'sat': 0.0, 'a': 0.14285714285714285, 'walk': 0.14285714285714285, 'for': 0.14285714285714285, 'people': 0.0, 'out': 0.14285714285714285, 'picnic': 0.0, 'around': 0.0}


In [10]:
print(tfB)

{'running': 0.0, 'man': 0.0, 'park': 0.0, 'children': 0.16666666666666666, 'fire': 0.16666666666666666, 'in': 0.0, 'the': 0.3333333333333333, 'elder': 0.0, 'family': 0.0, 'went': 0.0, 'were': 0.0, 'sat': 0.16666666666666666, 'a': 0.0, 'walk': 0.0, 'for': 0.0, 'people': 0.0, 'out': 0.0, 'picnic': 0.0, 'around': 0.16666666666666666}


In [11]:
print(tfC)

{'running': 0.0, 'man': 0.0, 'park': 0.1111111111111111, 'children': 0.0, 'fire': 0.0, 'in': 0.1111111111111111, 'the': 0.2222222222222222, 'elder': 0.0, 'family': 0.1111111111111111, 'went': 0.1111111111111111, 'were': 0.0, 'sat': 0.0, 'a': 0.1111111111111111, 'walk': 0.0, 'for': 0.1111111111111111, 'people': 0.0, 'out': 0.0, 'picnic': 0.1111111111111111, 'around': 0.0}


In [12]:
print(tfD)

{'running': 0.14285714285714285, 'man': 0.0, 'park': 0.14285714285714285, 'children': 0.14285714285714285, 'fire': 0.0, 'in': 0.14285714285714285, 'the': 0.2857142857142857, 'elder': 0.0, 'family': 0.0, 'went': 0.0, 'were': 0.14285714285714285, 'sat': 0.0, 'a': 0.0, 'walk': 0.0, 'for': 0.0, 'people': 0.0, 'out': 0.0, 'picnic': 0.0, 'around': 0.0}


In [13]:
print(tfE)

{'running': 0.0, 'man': 0.0, 'park': 0.0, 'children': 0.0, 'fire': 0.14285714285714285, 'in': 0.0, 'the': 0.2857142857142857, 'elder': 0.14285714285714285, 'family': 0.0, 'went': 0.0, 'were': 0.0, 'sat': 0.14285714285714285, 'a': 0.0, 'walk': 0.0, 'for': 0.0, 'people': 0.14285714285714285, 'out': 0.0, 'picnic': 0.0, 'around': 0.14285714285714285}


## Inverse Data Frequency (IDF)
- The log of the number of documents divided by the number of documents that contain the word w.

In [14]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [15]:
idfs = computeIDF([numOfWordsA, numOfWordsB, numOfWordsC, numOfWordsD, numOfWordsE])

In [16]:
print(idfs)

{'running': 1.6094379124341003, 'man': 1.6094379124341003, 'park': 0.9162907318741551, 'children': 0.9162907318741551, 'fire': 0.9162907318741551, 'in': 0.9162907318741551, 'the': 0.0, 'elder': 1.6094379124341003, 'family': 1.6094379124341003, 'went': 0.9162907318741551, 'were': 1.6094379124341003, 'sat': 0.9162907318741551, 'a': 0.9162907318741551, 'walk': 1.6094379124341003, 'for': 0.9162907318741551, 'people': 1.6094379124341003, 'out': 1.6094379124341003, 'picnic': 1.6094379124341003, 'around': 0.9162907318741551}


## Lastly, Find out the top TF-IDF words for the above input

In [17]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [18]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
tfidfC = computeTFIDF(tfC, idfs)
tfidfD = computeTFIDF(tfD, idfs)
tfidfE = computeTFIDF(tfE, idfs)
df = pd.DataFrame([tfidfA, tfidfB, tfidfC, tfidfD, tfidfE])

In [19]:
print(df)

          a    around  children    elder    family      fire       for  \
0  0.130899  0.000000  0.000000  0.00000  0.000000  0.000000  0.130899   
1  0.000000  0.152715  0.152715  0.00000  0.000000  0.152715  0.000000   
2  0.101810  0.000000  0.000000  0.00000  0.178826  0.000000  0.101810   
3  0.000000  0.000000  0.130899  0.00000  0.000000  0.000000  0.000000   
4  0.000000  0.130899  0.000000  0.22992  0.000000  0.130899  0.000000   

         in      man      out      park   people    picnic  running       sat  \
0  0.000000  0.22992  0.22992  0.000000  0.00000  0.000000  0.00000  0.000000   
1  0.000000  0.00000  0.00000  0.000000  0.00000  0.000000  0.00000  0.152715   
2  0.101810  0.00000  0.00000  0.101810  0.00000  0.178826  0.00000  0.000000   
3  0.130899  0.00000  0.00000  0.130899  0.00000  0.000000  0.22992  0.000000   
4  0.000000  0.00000  0.00000  0.000000  0.22992  0.000000  0.00000  0.130899   

   the     walk      went     were  
0  0.0  0.22992  0.130899  0.00

## Find out the top TF-IDF words for the lemmatized input

In [20]:
import nltk
from nltk.stem import WordNetLemmatizer 

In [21]:
lemmatizer = WordNetLemmatizer()

In [22]:
word_listA = nltk.word_tokenize(documentA)
word_listB = nltk.word_tokenize(documentB)
word_listC = nltk.word_tokenize(documentC)
word_listD = nltk.word_tokenize(documentD)
word_listE = nltk.word_tokenize(documentE)

In [23]:
lemmatized_documentA = ' '.join([lemmatizer.lemmatize(w) for w in word_listA])
lemmatized_documentB = ' '.join([lemmatizer.lemmatize(w) for w in word_listB])
lemmatized_documentC = ' '.join([lemmatizer.lemmatize(w) for w in word_listC])
lemmatized_documentD = ' '.join([lemmatizer.lemmatize(w) for w in word_listD])
lemmatized_documentE = ' '.join([lemmatizer.lemmatize(w) for w in word_listE])

In [24]:
lbagOfWordsA = lemmatized_documentA.split(' ')
lbagOfWordsB = lemmatized_documentB.split(' ')
lbagOfWordsC = lemmatized_documentC.split(' ')
lbagOfWordsD = lemmatized_documentD.split(' ')
lbagOfWordsE = lemmatized_documentE.split(' ')

In [25]:
luniqueWords = set(lbagOfWordsA).union(set(lbagOfWordsB)).union(set(lbagOfWordsC)).union(set(lbagOfWordsD)).union(set(lbagOfWordsE))

In [26]:
lnumOfWordsA = dict.fromkeys(luniqueWords, 0)
for word in lbagOfWordsA:
    lnumOfWordsA[word] += 1
lnumOfWordsB = dict.fromkeys(luniqueWords, 0)
for word in lbagOfWordsB:
    lnumOfWordsB[word] += 1
lnumOfWordsC = dict.fromkeys(luniqueWords, 0)
for word in lbagOfWordsC:
    lnumOfWordsC[word] += 1
lnumOfWordsD = dict.fromkeys(luniqueWords, 0)
for word in lbagOfWordsD:
    lnumOfWordsD[word] += 1
lnumOfWordsE = dict.fromkeys(luniqueWords, 0)
for word in lbagOfWordsE:
    lnumOfWordsE[word] += 1

In [27]:
ltfA = computeTF(lnumOfWordsA, lbagOfWordsA)
ltfB = computeTF(lnumOfWordsB, lbagOfWordsB)
ltfC = computeTF(lnumOfWordsC, lbagOfWordsC)
ltfD = computeTF(lnumOfWordsD, lbagOfWordsD)
ltfE = computeTF(lnumOfWordsE, lbagOfWordsE)

In [28]:
lidfs = computeIDF([lnumOfWordsA, lnumOfWordsB, lnumOfWordsC, lnumOfWordsD, lnumOfWordsE])

In [29]:
ltfidfA = computeTFIDF(ltfA, lidfs)
ltfidfB = computeTFIDF(ltfB, lidfs)
ltfidfC = computeTFIDF(ltfC, lidfs)
ltfidfD = computeTFIDF(ltfD, lidfs)
ltfidfE = computeTFIDF(ltfE, lidfs)
ldf = pd.DataFrame([ltfidfA, ltfidfB, ltfidfC, ltfidfD, ltfidfE])

In [30]:
print(ldf)

          a    around     child    elder    family      fire       for  \
0  0.130899  0.000000  0.000000  0.00000  0.000000  0.000000  0.130899   
1  0.000000  0.152715  0.152715  0.00000  0.000000  0.152715  0.000000   
2  0.101810  0.000000  0.000000  0.00000  0.178826  0.000000  0.101810   
3  0.000000  0.000000  0.130899  0.00000  0.000000  0.000000  0.000000   
4  0.000000  0.130899  0.000000  0.22992  0.000000  0.130899  0.000000   

         in      man      out      park   people    picnic  running       sat  \
0  0.000000  0.22992  0.22992  0.000000  0.00000  0.000000  0.00000  0.000000   
1  0.000000  0.00000  0.00000  0.000000  0.00000  0.000000  0.00000  0.152715   
2  0.101810  0.00000  0.00000  0.101810  0.00000  0.178826  0.00000  0.000000   
3  0.130899  0.00000  0.00000  0.130899  0.00000  0.000000  0.22992  0.000000   
4  0.000000  0.00000  0.00000  0.000000  0.22992  0.000000  0.00000  0.130899   

   the     walk      went     were  
0  0.0  0.22992  0.130899  0.00

## Find out the top TF-IDF words for the n-gram based input.

In [31]:
listA = [documentA]
listB = [documentB]
listC = [documentC]
listD = [documentD]
listE = [documentE]

In [32]:
resA = [(x, i.split()[j + 1]) for i in listA  
       for j, x in enumerate(i.split()) if j < len(i.split()) - 1]
resB = [(x, i.split()[j + 1]) for i in listB  
       for j, x in enumerate(i.split()) if j < len(i.split()) - 1]
resC = [(x, i.split()[j + 1]) for i in listC  
       for j, x in enumerate(i.split()) if j < len(i.split()) - 1]
resD = [(x, i.split()[j + 1]) for i in listD  
       for j, x in enumerate(i.split()) if j < len(i.split()) - 1]
resE = [(x, i.split()[j + 1]) for i in listE  
       for j, x in enumerate(i.split()) if j < len(i.split()) - 1]

In [33]:
runiqueWords = set(resA).union(set(resB)).union(set(resC)).union(set(resD)).union(set(resE))

In [34]:
rnumOfWordsA = dict.fromkeys(runiqueWords, 0)
for word in resA:
    rnumOfWordsA[word] += 1
rnumOfWordsB = dict.fromkeys(runiqueWords, 0)
for word in resB:
    rnumOfWordsB[word] += 1
rnumOfWordsC = dict.fromkeys(runiqueWords, 0)
for word in resC:
    rnumOfWordsC[word] += 1
rnumOfWordsD = dict.fromkeys(runiqueWords, 0)
for word in resD:
    rnumOfWordsD[word] += 1
rnumOfWordsE = dict.fromkeys(runiqueWords, 0)
for word in resE:
    rnumOfWordsE[word] += 1

In [35]:
rtfA = computeTF(rnumOfWordsA, resA)
rtfB = computeTF(rnumOfWordsB, resB)
rtfC = computeTF(rnumOfWordsC, resC)
rtfD = computeTF(rnumOfWordsD, resD)
rtfE = computeTF(rnumOfWordsE, resE)

In [36]:
ridfs = computeIDF([rnumOfWordsA, rnumOfWordsB, rnumOfWordsC, rnumOfWordsD, rnumOfWordsE])

In [37]:
rtfidfA = computeTFIDF(rtfA, ridfs)
rtfidfB = computeTFIDF(rtfB, ridfs)
rtfidfC = computeTFIDF(rtfC, ridfs)
rtfidfD = computeTFIDF(rtfD, ridfs)
rtfidfE = computeTFIDF(rtfE, ridfs)
rdf = pd.DataFrame([rtfidfA, rtfidfB, rtfidfC, rtfidfD, rtfidfE])

In [38]:
print(rdf)

   (a, picnic)  (a, walk)  (around, the)  (children, sat)  (children, were)  \
0      0.00000    0.26824       0.000000         0.000000           0.00000   
1      0.00000    0.00000       0.183258         0.321888           0.00000   
2      0.20118    0.00000       0.000000         0.000000           0.00000   
3      0.00000    0.00000       0.000000         0.000000           0.26824   
4      0.00000    0.00000       0.152715         0.000000           0.00000   

   (elder, people)  (family, went)  (for, a)  (in, the)  (man, went)  ...  \
0          0.00000         0.00000  0.152715   0.000000      0.26824  ...   
1          0.00000         0.00000  0.000000   0.000000      0.00000  ...   
2          0.00000         0.20118  0.114536   0.114536      0.00000  ...   
3          0.00000         0.00000  0.000000   0.152715      0.00000  ...   
4          0.26824         0.00000  0.000000   0.000000      0.00000  ...   

   (sat, around)  (the, children)  (the, elder)  (the, family)

# Write a simple spark program to read a dataset and find the W2V Synonyms for the Top TF-IDF Words

## Try without NLP

In [39]:
from __future__ import print_function
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

In [40]:
sc = SparkContext.getOrCreate()
documents = sc.textFile("data.txt").map(lambda line: line.split(" "))

In [41]:
hashingTF = HashingTF(numFeatures=20)
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [42]:
print("TFIDF without NLP:")
for each in tfidf.collect():
    print(each)
sc.stop()

TFIDF without NLP:
(20,[7,9,12,17],[0.1823215567939546,2.0794415416798357,0.3646431135879092,0.0])
(20,[2,7,8,17,18],[0.6931471805599453,0.1823215567939546,0.6931471805599453,0.0,0.6931471805599453])
(20,[0,1,7,9,12,15,17],[1.0986122886681098,0.6931471805599453,0.1823215567939546,1.3862943611198906,0.1823215567939546,1.0986122886681098,0.0])
(20,[1,7,11,12,17],[0.6931471805599453,0.3646431135879092,1.0986122886681098,0.1823215567939546,0.0])
(20,[2,5,8,12,17,18],[0.6931471805599453,1.0986122886681098,0.6931471805599453,0.1823215567939546,0.0,0.6931471805599453])


## Try with Lemmatization

In [43]:
from __future__ import print_function

import nltk
nltk.download('punkt')
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF
from nltk.stem import WordNetLemmatizer

sc = SparkContext.getOrCreate()
documents = sc.textFile("data.txt").map(lambda line: line.split(" "))

lemmatizer = WordNetLemmatizer()

word_list = list(map(' '.join, documents.collect()))
word_list1 = ''
for i in word_list:
    word_list1 = word_list1 + ' ' + i
word_list2 = nltk.word_tokenize(word_list1)
lemmatized_document = ' '.join([lemmatizer.lemmatize(w) for w in word_list2])
print(lemmatized_document)

f = open("data1.txt", "w+")
f.write('' + lemmatized_document)
f.close()

document1 = sc.textFile("data.txt").map(lambda line: line.split(" "))

hashingTF = HashingTF(numFeatures=20)
tf = hashingTF.transform(document1)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

print("TFIDF with Lemmatization:")
for each in tfidf.collect():
    print(each)
sc.stop()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Michael\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


the man went out for a walk the child sat around the fire the family went for a picnic in the park the child were running in the park the elder people sat around the fire
TFIDF with Lemmatization:
(20,[7,9,12,17],[0.1823215567939546,2.0794415416798357,0.3646431135879092,0.0])
(20,[2,7,8,17,18],[0.6931471805599453,0.1823215567939546,0.6931471805599453,0.0,0.6931471805599453])
(20,[0,1,7,9,12,15,17],[1.0986122886681098,0.6931471805599453,0.1823215567939546,1.3862943611198906,0.1823215567939546,1.0986122886681098,0.0])
(20,[1,7,11,12,17],[0.6931471805599453,0.3646431135879092,1.0986122886681098,0.1823215567939546,0.0])
(20,[2,5,8,12,17,18],[0.6931471805599453,1.0986122886681098,0.6931471805599453,0.1823215567939546,0.0,0.6931471805599453])


# Try with N-Grams

In [44]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

txt1 = []
with open('data1.txt') as file:
    txt1 = file.readlines()

def remove_string_special_characters(s):
    stripped = re.sub('[^a-zA-z\s]', '', s)
    stripped = re.sub('_', '', stripped)
    stripped = re.sub('\s+', ' ', stripped)
    stripped = stripped.strip()
    if stripped != '':
        return stripped.lower()

stop_words = set(stopwords.words('english'))
your_list = ['skills', 'ability', 'job', 'description']
for i, line in enumerate(txt1):
    txt1[i] = ' '.join([x for
                        x in nltk.word_tokenize(line) if
                        (x not in stop_words) and (x not in your_list)])

vectorizer = CountVectorizer(ngram_range=(2, 2))
X1 = vectorizer.fit_transform(txt1)
features = (vectorizer.get_feature_names())
print("\n\nFeatures : \n", features)
print("\n\nX1 : \n", X1.toarray())

vectorizer = TfidfVectorizer(ngram_range=(3, 3))
X2 = vectorizer.fit_transform(txt1)
scores = (X2.toarray())
print("\n\nScores : \n", scores)

# Getting top ranking features 
sums = X2.sum(axis=0)
data1 = []
for col, term in enumerate(features):
    data1.append((term, sums[0, col]))
ranking = pd.DataFrame(data1, columns=['term', 'rank'])
words = (ranking.sort_values('rank', ascending=False))
print("\n\nWords head : \n", words.head(7))



Features : 
 ['around fire', 'child running', 'child sat', 'elder people', 'family went', 'fire family', 'man went', 'park child', 'park elder', 'people sat', 'picnic park', 'running park', 'sat around', 'walk child', 'went picnic', 'went walk']


X1 : 
 [[2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1]]


Scores : 
 [[0.22941573 0.22941573 0.22941573 0.22941573 0.22941573 0.22941573
  0.22941573 0.22941573 0.22941573 0.22941573 0.22941573 0.22941573
  0.45883147 0.22941573 0.22941573 0.22941573]]


Words head : 
              term      rank
12     sat around  0.458831
0     around fire  0.229416
1   child running  0.229416
2       child sat  0.229416
3    elder people  0.229416
4     family went  0.229416
5     fire family  0.229416
