# Tokenization
Tokenization is the process of breaking down the given text in natural language processing into the smallest unit in a sentence called a token. Punctuation marks, words, and numbers can be considered tokens.

In [1]:
text = 'Hi Everyone! This is Arafat Hossain. We are learning Natural Language Processing. We reached 1000 views.'


In [2]:
text.split(' ')

['Hi',
 'Everyone!',
 'This',
 'is',
 'Arafat',
 'Hossain.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing.',
 'We',
 'reached',
 '1000',
 'views.']

In [3]:
from nltk import sent_tokenize, word_tokenize

In [4]:
# split the text into sentences
sent_tokens = sent_tokenize(text)
sent_tokens

['Hi Everyone!',
 'This is Arafat Hossain.',
 'We are learning Natural Language Processing.',
 'We reached 1000 views.']

In [5]:
# split the text into word
word_tokens = word_tokenize(text)
word_tokens


['Hi',
 'Everyone',
 '!',
 'This',
 'is',
 'Arafat',
 'Hossain',
 '.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing',
 '.',
 'We',
 'reached',
 '1000',
 'views',
 '.']

# Stemming
Stemming is the process of finding the root of words. A word stem need not be the same root as a dictionary-based morphological root, it just is an equal to or smaller form of the word.

In [6]:
from nltk.stem import PorterStemmer, SnowballStemmer
ps = PorterStemmer()

In [7]:
word = ('eats')
ps.stem(word)

'eat'

In [8]:
word = ('eating')
ps.stem(word)

'eat'

In [9]:
word = ('eaten')
ps.stem(word)

'eaten'

In [10]:
text = 'Hi Everyone! This is Arafat Hossain. We are learning Natural Language Processing. We reached 1000 views.'


In [11]:
word_tokens = word_tokenize(text)
word_tokens

['Hi',
 'Everyone',
 '!',
 'This',
 'is',
 'Arafat',
 'Hossain',
 '.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing',
 '.',
 'We',
 'reached',
 '1000',
 'views',
 '.']

In [12]:
stemmed_sentence = " ".join(ps.stem(word) for word in word_tokens)
stemmed_sentence

'hi everyon ! thi is arafat hossain . we are learn natur languag process . we reach 1000 view .'

# Lemmatization
Lemmatization is the process of finding the form of the related word in the dictionary. It is different from Stemming. It involves longer processes to calculate than Stemming.

In [13]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
lemmatizer.lemmatize('workers')

'worker'

In [15]:
lemmatizer.lemmatize('words')

'word'

In [16]:
lemmatizer.lemmatize('this')

'this'

In [17]:
lemmatizer.lemmatize('stripes')

'stripe'

In [18]:
lemmatizer.lemmatize('stripes', 'v') # v for verb

'strip'

In [19]:
lemmatizer.lemmatize('stripes', 'n') # n for noun

'stripe'

In [20]:
text = 'Hi Everyone! This is Arafats Hossain. We are learning Natural Language Processing. We reached 1000 views.'


In [21]:
word_tokens = word_tokenize(text)

In [22]:
lemmatized_sentence = " ".join(lemmatizer.lemmatize(word) for word in word_tokens)
lemmatized_sentence

'Hi Everyone ! This is Arafats Hossain . We are learning Natural Language Processing . We reached 1000 view .'

In [23]:
lemmatized_sentence = " ".join(lemmatizer.lemmatize(word.lower()) for word in word_tokens)
lemmatized_sentence

'hi everyone ! this is arafat hossain . we are learning natural language processing . we reached 1000 view .'

# Part of Speech Tagging (POS)
Part of Speech Tagging is a process of converting a sentence to forms — list of words, list of tuples (where each tuple is having a form (word, tag)). The tag in case of is a part-of-speech tag, and signifies whether the word is a noun, adjective, verb, and so on.

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [24]:
from nltk import pos_tag


In [25]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/arafat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
pos_tag(['fighting'])

[('fighting', 'VBG')]

In [27]:
text = 'Hi Everyone! This is Arafat Hossain. We are learning Natural Language Processing. We reached 1000 views.'


In [1]:
word_tokensr = word_tokenize(text)

NameError: name 'word_tokenize' is not defined

In [29]:
pos_tag(word_tokens)

[('Hi', 'NNP'),
 ('Everyone', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('Arafats', 'NNP'),
 ('Hossain', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('learning', 'VBG'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('reached', 'VBD'),
 ('1000', 'CD'),
 ('views', 'NNS'),
 ('.', '.')]

# Text Preprocessing (Clean Data)


In [30]:
import pandas as pd
import string
df = pd.read_csv('Twitter Sentiments.csv')

pd.set_option("display.max_colwidth", None)
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)
df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,bihday your majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,factsguide: society now #motivation


# Converting to lowercase

In [31]:
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,factsguide: society now #motivation,factsguide: society now #motivation


In [32]:
df.sample(frac=1).head() # shaffaling the tweet

Unnamed: 0,tweet,clean_text
15238,#kuturns15 bihdayð #loveyouð,#kuturns15 bihdayð #loveyouð
1721,@user pre-ordered @user #trulyhappybaby book today ð¶ð¼ #newmummy,@user pre-ordered @user #trulyhappybaby book today ð¶ð¼ #newmummy
23283,another #macro of the #sagopalm #tree it's fascinating to me! #green #healthy #plantsâ¦,another #macro of the #sagopalm #tree it's fascinating to me! #green #healthy #plantsâ¦
8360,"choose to be happy, no matter what your circumstances are! it will free you! #smile","choose to be happy, no matter what your circumstances are! it will free you! #smile"
2667,i'm maia! #fun #daughter #princess #toddler #maryland #silverspring #model @user,i'm maia! #fun #daughter #princess #toddler #maryland #silverspring #model @user


# Removal of Punctuations


In [33]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [34]:
def remove_punctuation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

In [35]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,user user thanks for lyft credit i cant use cause they dont offer wheelchair vans in pdx disapointed getthanked
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model i love u take with u all the time in urð± ððððð¦ð¦ð¦
4,factsguide: society now #motivation,factsguide society now motivation


# Removal of Stopwords


In [36]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [37]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [38]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,user father dysfunctional selfish drags kids dysfunction run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,user user thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love u take u time urð± ðððð ð¦ð¦ð¦
4,factsguide: society now #motivation,factsguide society motivation


# Removal of Frequent Words


In [39]:
from collections import Counter
word_count = Counter()

for text in df['clean_text']:
    for word in text.split():
        word_count[word] +=1

word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [40]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
FREQUENT_WORDS

{'day', 'love', 'user'}

In [41]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(1))
FREQUENT_WORDS

{'user'}

In [42]:
def remove_freq_word(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [43]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_word(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,father dysfunctional selfish drags kids dysfunction run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love u take u time urð± ðððð ð¦ð¦ð¦
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Rare Words


In [44]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [45]:
RARE_WORDS = set((word, wc) for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{('airwaves', 1),
 ('carnt', 1),
 ('chisolm', 1),
 ('ibizabringitonmallorcaholidayssummer', 1),
 ('isz', 1),
 ('mantle', 1),
 ('shirley', 1),
 ('youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
  1),
 ('ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ', 1)}

In [46]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])


In [47]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head()


Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,father dysfunctional selfish drags kids dysfunction run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love u take u time urð± ðððð ð¦ð¦ð¦
4,factsguide: society now #motivation,factsguide society motivation


# Removal of Special characters


In [48]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [49]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,father dysfunctional selfish drags kids dysfunction run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


# Stemming



In [50]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [51]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,father dysfunctional selfish drags kids dysfunction run,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked,thank lyft credit cant use caus dont offer wheelchair van pdx disapoint getthank
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love u take u time ur,model love u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


# Lemmatization & POS Tagging


In [52]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lammatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    #find pos tags
    
    pos_text = pos_tag(text.split())
    return " ".join(lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text)

In [53]:
wordnet.NOUN

'n'

In [54]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run,father dysfunctional selfish drags kids dysfunction run,father dysfunct selfish drag kid dysfunct run,father dysfunctional selfish drag kid dysfunction run
1,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked,thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked,thank lyft credit cant use caus dont offer wheelchair van pdx disapoint getthank,thanks lyft credit cant use cause dont offer wheelchair van pdx disapointed getthanked
2,bihday your majesty,bihday majesty,bihday majesti,bihday majesty
3,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,model love u take u time ur,model love u take u time ur,model love u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv,factsguide society motivation


In [55]:
df.sample(frac=1).head(10)

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
23919,"@user fathers day. have a memorable day. from all 1950's waspi women who bore you, raised you, fed you, cared for you,â¦",fathers day memorable day 1950s waspi women bore raised fed cared you,father day memor day 1950 waspi women bore rais fed care you,father day memorable day 1950s waspi woman bore raised fed care you
6948,"@user @user #ineedtoconsult bro t, i have lot of un-answered questions about my life but the only person to be consultâ¦",ineedtoconsult bro lot unanswered questions life person consult,ineedtoconsult bro lot unansw question life person consult,ineedtoconsult bro lot unanswered question life person consult
2075,@user don't miss the chance of your life to be ! *happy divorce* e-book &amp; paperback,dont miss chance life happy divorce ebook amp paperback,dont miss chanc life happi divorc ebook amp paperback,dont miss chance life happy divorce ebook amp paperback
21867,i'm afraid imma need longer sessions than my normal sessions. loool #enjoyment #gymtime,im afraid imma need longer sessions normal sessions loool enjoyment gymtime,im afraid imma need longer session normal session loool enjoy gymtim,im afraid imma need long session normal session loool enjoyment gymtime
4685,"we have to make a decision to be on the inside now, to magnetize a life of happiness on the outside.",make decision inside magnetize life happiness outside,make decis insid magnet life happi outsid,make decision inside magnetize life happiness outside
29239,"well,these sewer rats don't take a cent from me,....i don't use to watch ""football"",......but now,i hate it!",wellthese sewer rats dont take cent mei dont use watch footballbut nowi hate,wellthes sewer rat dont take cent mei dont use watch footballbut nowi hate,wellthese sewer rat dont take cent mei dont use watch footballbut nowi hate
30269,yeah!!!!!!!!!!! #allblacks #allblackeverything #nzlvwal,yeah allblacks allblackeverything nzlvwal,yeah allblack allblackeveryth nzlvwal,yeah allblacks allblackeverything nzlvwal
11164,nonegativeselftalk: #money &amp; status can't cure low self-esteem or make you . learn why in this post. â¦,nonegativeselftalk money amp status cant cure low selfesteem make learn post,nonegativeselftalk money amp statu cant cure low selfesteem make learn post,nonegativeselftalk money amp status cant cure low selfesteem make learn post
9416,â¡ï¸ âcharles paladino's racist comments spark calls for resignationâ,charles paladinos racist comments spark calls resignation,charl paladino racist comment spark call resign,charles paladinos racist comment spark call resignation
5784,are you #black &amp; feel like the are stomping â¦ you? listen #retweet #tampa #miamiâ¦,black amp feel like stomping listen retweet tampa miami,black amp feel like stomp listen retweet tampa miami,black amp feel like stomp listen retweet tampa miami


In [56]:
 
word_count =0

for text in df['tweet']:
    for word in text.split():
        word_count +=1

word_count

420579

In [57]:
word_count1 =0 

for text in df['clean_text']:
    for word in text.split():
        word_count1 +=1
word_count1

254461

In [58]:

word_count2 = 0

for text in df['stemmed_text']:
    for word in text.split():
        word_count2 +=1

word_count2

254461

# Removal of URLs

In [59]:
text = "https://www.abcd.net is the URL of abcd"


In [60]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+','',text)

In [61]:
remove_url(text)

' is the URL of abcd'

# Removal of HTML Tags


In [62]:
text = "<html><body> <h1>Arafat Hossain</h1> <p>This is a sample text</p> </body></html>"


In [63]:

def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [64]:
remove_html_tags(text)


' Arafat Hossain This is a sample text '

# Spelling Correction


In [65]:
!pip install pyspellchecker



In [66]:
text = 'natur is a beuty'
text = 'helllo, m am Arafat. i\'m lerning englsh'

In [67]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    print(f'Misspelled word : {misspelled_text}')
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [68]:
correct_spellings(text)

Misspelled word : {'lerning', 'helllo,', 'englsh', 'arafat.', 'm'}


"hello i am Arafat. i'm learning english"

# Feature Extraction from Text Data
### Bag of Words

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things: A vocabulary of known words. A measure of the presence of known words.



In [69]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']


In [70]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(stop_words='english')

In [71]:
# fit the data
bow.fit(text_data)

CountVectorizer(stop_words='english')

In [72]:
# get the vocabulary list
bow.get_feature_names()



['extraction',
 'feature',
 'good',
 'important',
 'interested',
 'nlp',
 'topic',
 'tutorial']

In [73]:
bow_features = bow.transform(text_data)
bow_features

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [74]:
bow_feature_array = bow_features.toarray()
bow_feature_array

array([[0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 2, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 0]])

In [75]:
print(bow.get_feature_names())
for sentence, feature in zip(text_data, bow_feature_array):
    print(sentence)
    print(feature)

['extraction', 'feature', 'good', 'important', 'interested', 'nlp', 'topic', 'tutorial']
I am interested in NLP
[0 0 0 0 1 1 0 0]
This is a good tutorial with good topic
[0 0 2 0 0 0 1 1]
Feature extraction is very important topic
[1 1 0 1 0 0 1 0]


In [76]:
bow_feature_array.shape

(3, 8)

# TF-IDF (Term Frequency/Inverse Document Frequency)

TF-IDF stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc) in a document amongst a collection of documents

In [77]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']
text_data = ['good boy','good girl', 'boy girl good']


In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [79]:
# fit the data
tfidf.fit(text_data)

TfidfVectorizer(stop_words='english')

In [80]:
# get the vocabulary list
tfidf.vocabulary_

{'good': 2, 'boy': 0, 'girl': 1}

In [81]:
tfidf_features = tfidf.transform(text_data)
tfidf_features

<3x3 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [82]:
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array

array([[0.78980693, 0.        , 0.61335554],
       [0.        , 0.78980693, 0.61335554],
       [0.61980538, 0.61980538, 0.48133417]])

In [83]:
for sentence, feature in zip(text_data, tfidf_features):
    print(sentence)
    print(feature)

good boy
  (0, 2)	0.6133555370249717
  (0, 0)	0.7898069290660905
good girl
  (0, 2)	0.6133555370249717
  (0, 1)	0.7898069290660905
boy girl good
  (0, 2)	0.48133416873660545
  (0, 1)	0.6198053799406072
  (0, 0)	0.6198053799406072


# Word2vec
The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.

In [106]:
!pip install gensim



In [107]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [108]:
# text data
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [113]:
type(common_texts)

list

In [110]:
# initialize and fit the data
model = Word2Vec(common_texts, min_count=1)

In [111]:
model.wv['graph']

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419371e-03,
        7.4669169e-03, -6.1676763e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400517e-03, -6.1735227e-03, -4.1022300e-04, -8.3689503e-03,
       -5.6000138e-03,  7.1045374e-03,  3.3525396e-03,  7.2256685e-03,
        6.8002464e-03,  7.5307419e-03, -3.7891555e-03, -5.6180713e-04,
        2.3483753e-03, -4.5190332e-03,  8.3887316e-03, -9.8581649e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328329e-03,  4.3981862e-03,
       -1.7395759e-03,  6.7113829e-03,  9.9648498e-03, -4.3624449e-03,
       -5.9933902e-04, -5.6956387e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384959e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895051e-03, -9.1558648e-03, -3.5575390e-04,
       -3.0998420e-03,  7.8943158e-03,  5.9385728e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900396e-03,  7.8175711e-03, -9.5101884e-03,
       -2.0553112e-04,  3.4691954e-03, -9.3897345e-04,  8.3817719e-03,
      

In [112]:
model.wv.most_similar('graph')

[('user', 0.06793875992298126),
 ('survey', 0.03364057466387749),
 ('eps', 0.009391184896230698),
 ('human', 0.008315940387547016),
 ('minors', 0.004503006115555763),
 ('system', -0.010839183814823627),
 ('trees', -0.023671666160225868),
 ('computer', -0.09575347602367401),
 ('time', -0.11410722136497498),
 ('response', -0.11557211726903915)]