# TF-IDF

In [30]:
Doc1 = 'Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar'
Doc2 = 'The success of a song depends on the music'
Doc3 = 'There is a movie releasing this week. The movie is fun to watch'
corpus = [Doc1,Doc2,Doc3]
corpus

['Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar',
 'The success of a song depends on the music',
 'There is a movie releasing this week. The movie is fun to watch']

# Preprocessing

## 1. Case folding

In [31]:
updated_corpus = []
for i in corpus:
    updated_corpus.append(i.casefold())
print(updated_corpus)

['gangs of wasseypur is a great movie. wasseypur is a town in bihar', 'the success of a song depends on the music', 'there is a movie releasing this week. the movie is fun to watch']


## 2. Tokenize

In [32]:
import nltk
#nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [33]:
from nltk import word_tokenize
tokenized_list = []
for i in updated_corpus:
    tokenized_list.append(word_tokenize(i))
print(tokenized_list)

[['gangs', 'of', 'wasseypur', 'is', 'a', 'great', 'movie', '.', 'wasseypur', 'is', 'a', 'town', 'in', 'bihar'], ['the', 'success', 'of', 'a', 'song', 'depends', 'on', 'the', 'music'], ['there', 'is', 'a', 'movie', 'releasing', 'this', 'week', '.', 'the', 'movie', 'is', 'fun', 'to', 'watch']]


## 3. Stopwords Removal

In [34]:
from nltk.corpus import stopwords
#nltk.download('stopwords')

In [42]:
stop_words = set(stopwords.words('english'))
clear_words =[]
punct = '.!,'
for i in tokenized_list:
    clean_doc = []
    for j in i:
        if j not in stop_words and j not in punct:
            clean_doc.append(j)
    clear_words.append(clean_doc)
print(clear_words)

[['gangs', 'wasseypur', 'great', 'movie', 'wasseypur', 'town', 'bihar'], ['success', 'song', 'depends', 'music'], ['movie', 'releasing', 'week', 'movie', 'fun', 'watch']]


## 4. Stemming

In [36]:
from nltk.stem import PorterStemmer

In [43]:
stemmed_words = []
for i in clear_words:
    stem_doc = []
    for j in i:
        word = PorterStemmer().stem(j)
        stem_doc.append(word)
    stemmed_words.append(stem_doc)
print(stemmed_words)

[['gang', 'wasseypur', 'great', 'movi', 'wasseypur', 'town', 'bihar'], ['success', 'song', 'depend', 'music'], ['movi', 'releas', 'week', 'movi', 'fun', 'watch']]


## 5. Frequency Count

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [49]:
def duplicate(doc):
    return doc
count_words = CountVectorizer(tokenizer = duplicate,preprocessor=duplicate)
word_count_vector = count_words.fit_transform(stemmed_words)
count_words.get_feature_names()

['bihar',
 'depend',
 'fun',
 'gang',
 'great',
 'movi',
 'music',
 'releas',
 'song',
 'success',
 'town',
 'wasseypur',
 'watch',
 'week']

In [50]:
word_count_vector.toarray()

array([[1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 2, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 1, 1]], dtype=int64)

## 6. TF - IDF MATRIX

In [54]:
tfidf_conv = TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_conv_fit = tfidf_conv.fit(word_count_vector)
tfidf_conv.idf_

array([1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.28768207, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718])

In [55]:
tfidf_vector = tfidf_conv_fit.transform(word_count_vector)
tfidf_vector.toarray()

array([[0.34142622, 0.        , 0.        , 0.34142622, 0.34142622,
        0.25966344, 0.        , 0.        , 0.        , 0.        ,
        0.34142622, 0.68285244, 0.        , 0.        ],
       [0.        , 0.5       , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.        , 0.5       , 0.5       ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.39798027, 0.        , 0.        ,
        0.60534851, 0.        , 0.39798027, 0.        , 0.        ,
        0.        , 0.        , 0.39798027, 0.39798027]])

## 7. Dataframe using Pandas

In [56]:
import pandas 

In [64]:
names = count_words.get_feature_names()
dataframe = pandas.DataFrame

def ind(n):
    for i in range(1,n+1):
        return('doc',i,'tf-idf')
    
dataframe = pandas.DataFrame(tfidf_vector.toarray(),index=ind(len(corpus)),columns=names)
dataframe

Unnamed: 0,bihar,depend,fun,gang,great,movi,music,releas,song,success,town,wasseypur,watch,week
doc,0.341426,0.0,0.0,0.341426,0.341426,0.259663,0.0,0.0,0.0,0.0,0.341426,0.682852,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.5,0.0,0.0,0.0,0.0
tf-idf,0.0,0.0,0.39798,0.0,0.0,0.605349,0.0,0.39798,0.0,0.0,0.0,0.0,0.39798,0.39798
