<a href="https://colab.research.google.com/github/Fizza-Rubab/Keyword-Identifier/blob/main/keyword_identifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keyword Identifier using TF-IDF Scores

### Relevant Imports

In [None]:
import math
from textblob import TextBlob as tb
import nltk


In [None]:
def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    Args:
        word: The term to count.
        blob: An object exposing a ``words`` sequence (e.g. a TextBlob).

    Returns:
        Occurrences of ``word`` in ``blob.words`` divided by the number of words,
        or 0.0 when the blob has no words at all.
    """
    total_words = len(blob.words)
    if total_words == 0:
        # An empty document cannot contain the word; avoid dividing by zero.
        return 0.0
    return blob.words.count(word) / total_words

def n_containing(word, bloblist):
    """Count how many blobs in ``bloblist`` have ``word`` among their ``words``."""
    matching_documents = 0
    for document in bloblist:
        if word in document.words:
            matching_documents += 1
    return matching_documents

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` across ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of blobs and ``n``
    is the number of blobs containing ``word``.
    """
    corpus_size = len(bloblist)
    # The +1 keeps the denominator non-zero for words absent from every blob.
    smoothed_document_count = 1 + n_containing(word, bloblist)
    return math.log(corpus_size / smoothed_document_count)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` relative to ``bloblist``."""
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency

In [None]:
import pandas as pd
# Read HS codes as strings so that leading zeros are preserved. Converter keys are
# matched against the column headers case-sensitively, so the key must be 'Hscode'.
df = pd.read_excel('hscode_description (1).xlsx', converters={'Hscode': str}, usecols=['Hscode', "Description to English"])

In [None]:
# Discard every row in which at least one value is missing.
df = df.dropna(axis=0, how='any')

In [None]:
import re
def pre_process(text):
    """Normalise a description for tokenisation.

    The text is lower-cased, anything that looks like an HTML/XML tag is blanked
    out, and every run of digits and non-word characters is collapsed into a
    single space.

    Args:
        text: The raw description. Non-string values (e.g. a numeric spreadsheet
            cell) are converted with ``str()`` first.

    Returns:
        The cleaned, lower-case string. Leading/trailing spaces are not stripped.
    """
    # Spreadsheet cells that hold only a number arrive as int/float, which have no .lower().
    text = str(text).lower()
    # Replace tags with a placeholder made only of non-word characters; the next
    # substitution then removes it together with the surrounding separators.
    text = re.sub(r"</?.*?>", " <> ", text)
    text = re.sub(r"(\d|\W)+", " ", text)
    return text

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Hscode,Description to English
0,19053100000,19053100000-SWEET BISCUITS WHETHER OR NOT CONT...
1,42022220,"40 Carton (10,000 Pcs) COTTON BAGS(DETAI"
2,30049099,607077709237 PERSONAL MEDECINE
3,84433990,TM M30II 122 SMALL PRINTER
4,92011000,"Vertical Piano- PIANO TYPE: UPRIGHT, MAKER: YA..."


In [None]:
# Replace each description with its normalised form produced by pre_process.
df['Description to English'] = df['Description to English'].apply(pre_process)

In [None]:
# One TextBlob per description; this list is the corpus used for the IDF computation.
bloblist = list(map(tb, df['Description to English'].to_list()))

In [None]:
# Collects one row per scored document: the blob followed by its formatted top-word lines.
lst = list()

In [None]:
# Number of documents to score before stopping; set to None to score every document.
# Scoring is expensive because each word's IDF scans the whole of bloblist.
MAX_DOCUMENTS = 1

scored_documents = 0
for i, blob in enumerate(bloblist):
    row = [blob]
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    try:
        for word, score in sorted_words[:5]:
            line = "\tWord: {}, TF-IDF: {}".format(word, round(score, 5))
            row.append(line)
            print(line)
        lst.append(row)
    except Exception as exc:
        # Skip a document whose scores cannot be formatted, but report it. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt stop the cell.
        print("\tSkipping document {}: {!r}".format(i + 1, exc))
        continue
    scored_documents += 1
    if MAX_DOCUMENTS is not None and scored_documents >= MAX_DOCUMENTS:
        break

Top words in document 1
	Word: biscuits, TF-IDF: 1.52881
	Word: sweet, TF-IDF: 1.02219
	Word: containg, TF-IDF: 0.91997
	Word: cocoa, TF-IDF: 0.69609
	Word: whether, TF-IDF: 0.67179


## Writing to a CSV File

In [None]:
import csv
# newline='' lets the csv module control line endings (otherwise blank rows can appear
# on Windows); an explicit encoding keeps the file independent of the platform default.
with open('result.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(lst)

In [None]:
def get_keywords(txt, top_n=5):
    """Return the highest-scoring noun/adjective keywords of ``txt``.

    The text is normalised with ``pre_process``, part-of-speech tagged, and every
    token tagged NN, NNS, NNP, NNPS or JJ is scored with ``tfidf`` against the
    module-level ``bloblist`` corpus.

    Args:
        txt: Raw description text.
        top_n: Maximum number of keywords to return (default 5).

    Returns:
        A list of ``(word, score)`` tuples sorted by descending TF-IDF score.
    """
    blob = tb(pre_process(txt))
    keep_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'JJ')
    # dict.fromkeys removes repeated tokens while keeping first-seen order, so ties
    # in the ranking below stay in order of first appearance.
    candidates = dict.fromkeys(word for word, tag in blob.tags if tag in keep_tags)
    # Score only the candidates: words that are filtered out anyway are never scored,
    # and the result does not depend on the tagger and `words` tokenising identically.
    scores = {word: tfidf(word, blob, bloblist) for word in candidates}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]



In [None]:
# Sample description used to exercise get_keywords against the corpus.
txt = " other woven fabrics containing polyester fibres cotton weight g m other woven fabrics containing polyester fibres cotton weight g m textile material"
top_keywords = get_keywords(txt)
top_keywords

[('other', 'JJ'), ('woven', 'JJ'), ('fabrics', 'NNS'), ('containing', 'VBG'), ('polyester', 'NN'), ('fibres', 'NNS'), ('cotton', 'NN'), ('weight', 'VBD'), ('g', 'NNS'), ('m', 'FW'), ('other', 'JJ'), ('woven', 'JJ'), ('fabrics', 'NNS'), ('containing', 'VBG'), ('polyester', 'NN'), ('fibres', 'NNS'), ('cotton', 'NN'), ('weight', 'VBD'), ('g', 'JJ'), ('m', 'NN'), ('textile', 'NN'), ('material', 'NN')]


[('fibres', 0.6074055182707181),
 ('fabrics', 0.39486834974122226),
 ('polyester', 0.39224438837273734),
 ('woven', 0.3678567532277665),
 ('m', 0.35573585123799695)]