In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [None]:
# Load only the first 5000 papers. nrows stops parsing after that many rows
# instead of reading the whole CSV into memory and slicing it afterwards.
df = pd.read_csv('papers.csv', nrows=5000)

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
df.isnull().sum()

# Preprocessing Data
# Working With "paper text"

# Steps to do

1 Lower case

2 remove HTML tags

3 remove special characters and digits

4 Convert to list from string

5 remove stopwords

6 remove words of three letters or fewer

7 stem (Porter stemmer)

In [None]:
df['paper_text'][0]

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
# Fetch every NLTK resource the notebook relies on so it runs on a fresh
# environment: the stop-word list used here, and the punkt tokenizer data
# that nltk.word_tokenize needs inside preprocessing_text.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')  # name of the punkt data in newer NLTK releases
stop_words = set(stopwords.words('english'))

In [None]:
stop_words

In [None]:
len(stop_words)

In [None]:
# Extra, corpus-specific stop words that are merged into the NLTK English
# list in a later cell. Note that "six" is absent from the number words;
# preprocessing_text also drops every token of three characters or fewer,
# so the short entries here are redundant with that length filter.
new_stop_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]

In [None]:
# Merge the custom stop words into the NLTK ones. Wrapping the current value
# in set() keeps this cell re-runnable (a list has no .union) and keeps the
# result a set, so the `word not in stop_words` checks are constant-time.
stop_words = set(stop_words).union(new_stop_words)

In [None]:
len(stop_words)

In [None]:
def preprocessing_text(txt):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip HTML/XML-like tags, replace every
    non-letter character with a space, tokenize with NLTK, drop stop words
    (module-level ``stop_words``), drop tokens of three characters or fewer,
    and Porter-stem what remains.

    Parameters
    ----------
    txt : str
        Raw document text.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    txt = txt.lower()

    # Non-greedy match so each <tag> is removed on its own. The previous
    # greedy r'<.*>' swallowed everything between the first '<' and the
    # last '>' on a line, including the ordinary text between two tags.
    txt = re.sub(r'<.*?>', ' ', txt)
    # Anything that is not an ASCII letter (digits, punctuation, newlines)
    # becomes a space.
    txt = re.sub(r'[^a-zA-Z]', ' ', txt)

    # Tokenization
    tokens = nltk.word_tokenize(txt)

    # A local set makes the membership test constant-time even when the
    # module-level stop_words is a list.
    stop_set = set(stop_words)
    stemmer = PorterStemmer()

    # Keep tokens that are not stop words and are longer than three
    # characters, then reduce each one to its stem.
    stems = [
        stemmer.stem(word)
        for word in tokens
        if word not in stop_set and len(word) > 3
    ]

    return ' '.join(stems)

In [None]:
preprocessing_text('This is python <hcsjngb> %^&*$ ')

In [None]:
# Clean every paper's full text; the function is passed directly to apply.
doc = df['paper_text'].apply(preprocessing_text)

# CountVectorizer
Explanation of the parameters used in CountVectorizer:

max_df: This parameter represents the maximum document frequency.
It ignores terms that have a document frequency strictly higher
than the given threshold (here, 0.95 means terms appearing in 
more than 95% of the documents will be ignored).

max_features: It indicates the maximum number of features
(vocabulary terms) to be extracted. In this case, it's set to 5000,
meaning only the 5000 most frequent terms across the corpus will be used as features.

ngram_range: This parameter specifies the lower and upper boundary 
of the range for n-grams to be extracted. In the code below,
ngram_range=(1, 2) means it will extract unigrams and bigrams
from the text data.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
# Bag-of-words model over the cleaned documents.
cv = CountVectorizer(
    max_df=0.95,          # ignore terms found in more than 95% of documents
    max_features=5000,    # cap the vocabulary at 5000 terms
    ngram_range=(1, 2),   # unigrams and bigrams
)
# Learn the vocabulary and build the document-term count matrix in one pass.
word_count_vectors = cv.fit_transform(doc)

# Using TF-IDF

TF-IDF stands for Term Frequency–Inverse Document Frequency.

The importance of each word increases in proportion to the number of times the word appears in the document (Term Frequency – TF) but is offset by the frequency of the word in the corpus (Inverse Document Frequency – IDF).

Using the TF-IDF weighting scheme, the keywords of a document are the terms with the highest TF-IDF scores.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# smooth_idf=True: adds one to every document frequency (as if one extra
# document containing every term had been seen), which prevents zero divisions.

# use_idf=True: enables inverse-document-frequency reweighting, so terms that
# occur in fewer documents receive a higher weight.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vectors)

In [None]:
# NOTE(review): scratch cell. `a` is not referenced by any other cell shown
# in this notebook; it looks safe to delete. Confirm before removing.
a = 10
a

# Extracting Keywords
Getting feature names

Computing the TF-IDF vector for the selected document

Sorting the sparse-matrix coordinates by score

Extracting the top 10 keywords

In [None]:
Functions:
sort_coo(coo_matrix):

This function takes a sparse COO (Coordinate List) matrix (coo_matrix) as input.
It uses zip() to pair the column indices (coo_matrix.col) with their corresponding 
data values (coo_matrix.data).
It then sorts these pairs based on the values (x[1]) in descending order (reverse=True)
and returns the sorted list of tuples.
extract_topn_from_vector(feature_names, sorted_items, topn=10):

    
This function extracts the top 'n' items from a sorted list of tuples (sorted_items) 
containing index-value pairs.
It rounds the scores to three decimal places and creates two lists: score_vals
    for scores and feature_vals for feature names.
It generates a dictionary results mapping feature names to their corresponding 
scores, limiting the entries to the top 'n' items.


Main Code:
Fetching Feature Names: feature_names = cv.get_feature_names_out() retrieves the
    feature names (vocabulary) from a CountVectorizer (cv).

    
get_keywords(idx, doc):

This function:
Generates the TF-IDF vector for the document at index idx from the provided collection of documents (doc).
Sorts the TF-IDF entries in descending order of score.
Extracts the top 10 keywords for the given document using the previously defined functions.
print_results(idx, keywords, df):

This function prints the title, abstract, and extracted keywords for a specified index (idx) in the provided DataFrame (df).
Execution:

idx = 995: Sets the index for which keywords are to be extracted.
keywords = get_keywords(idx, doc): Retrieves the top keywords for the document 
    at index 995.
print_results(idx, keywords, df): Prints the title, abstract, and extracted keywords
    for the document at index 995.
This code overall defines functions to extract keywords from text documents using
TF-IDF representation and then prints out these extracted keywords along with 
information about a specific document from the provided dataset. Adjust the idx
variable to test it with different documents in your dataset.

In [None]:
def sort_coo(coo_matrix):
    """Return the (column index, value) pairs of a COO matrix, largest value first.

    ``coo_matrix.col`` and ``coo_matrix.data`` are paired element-wise and
    the pairs are ordered by their value in descending order.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Only the value (second element of each pair) takes part in the ordering.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` scored items to ``{feature name: rounded score}``.

    Parameters
    ----------
    feature_names : sequence
        Indexable collection; ``feature_names[idx]`` is the name for ``idx``.
    sorted_items : sequence of (idx, score)
        Pairs already in the desired order; only the first ``topn`` are used.
    topn : int, default 10
        Number of leading items to keep.

    Returns
    -------
    dict
        Feature name -> score rounded to three decimals, in the order of
        ``sorted_items``.
    """
    # One pass replaces the earlier parallel score/feature lists and the
    # unused `fname` local; the resulting mapping is the same.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }


# Vocabulary terms of the fitted CountVectorizer. get_keywords passes this to
# extract_topn_from_vector, which looks terms up by matrix column index.
feature_names=cv.get_feature_names_out()

def get_keywords(idx, doc, topn=10):
    """Return the top TF-IDF keywords for one preprocessed document.

    Parameters
    ----------
    idx : label
        Key used to look the document up as ``doc[idx]``.
    doc : indexable collection of str
        Preprocessed documents.
    topn : int, default 10
        Number of keywords to return.

    Returns
    -------
    dict
        Keyword -> TF-IDF score rounded to three decimals, highest first.

    Relies on the module-level fitted ``cv`` and ``tfidf_transformer`` and on
    ``feature_names``.
    """
    # Count the document's terms with the fitted vocabulary, then reweight
    # the counts with the fitted TF-IDF transformer.
    tf_idf_vector = tfidf_transformer.transform(cv.transform([doc[idx]]))

    # COO form exposes the column indices and values that sort_coo reads.
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    return extract_topn_from_vector(feature_names, sorted_items, topn)


def print_results(idx, keywords, df):
    """Print the title, the abstract and the keyword scores for entry ``idx``.

    ``df['title'][idx]`` and ``df['abstract'][idx]`` are printed under their
    own banners, followed by one ``keyword score`` line per entry of
    ``keywords``.
    """
    sections = (
        ("\n=====Title=====", 'title'),
        ("\n=====Abstract=====", 'abstract'),
    )
    for banner, column in sections:
        print(banner)
        print(df[column][idx])

    print("\n===Keywords===")
    for term, score in keywords.items():
        print(term, score)
# Demo run: extract and print the keywords for the document at index 995.
idx=995
keywords=get_keywords(idx, doc)
print_results(idx,keywords, df)

In [None]:
# ----------- just for info -----------
# Illustration only: the unsorted (column index, tf-idf value) pairs for the
# document at index 1, i.e. what sort_coo receives before ordering.
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc[1]]))
docs_words_count = tf_idf_vector.tocoo()
tuples = zip(docs_words_count.col, docs_words_count.data)
c = list(tuples)
c
# NOTE: `tuples` is a zip iterator and is exhausted by list() above, so the
# line below would sort an empty sequence if re-enabled as written.
# sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)



In [None]:
# ----------- just for info -----------
# Illustration only: the same pairs for the document at index 1, now ordered
# by tf-idf value in descending order (the same ordering sort_coo applies).
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc[1]]))
docs_words_count = tf_idf_vector.tocoo()
tuples = zip(docs_words_count.col, docs_words_count.data)
sorted_items = sorted(tuples, key=lambda x: (x[1]), reverse=True)
sorted_items


# Using pickle to save model and vectorizer.

In [None]:
import pickle 
# Persist the fitted objects themselves. The previous version pickled the
# string literals 'cv', 'tfidf_transformer' and 'feature_names', so the saved
# files held only those names. The `with` blocks also close each file, which
# guarantees the data is flushed to disk.
with open('count_vector.pkl', 'wb') as pkl_file:
    pickle.dump(cv, pkl_file)
with open('tfidf_transformer.pkl', 'wb') as pkl_file:
    pickle.dump(tfidf_transformer, pkl_file)
with open('feature_names.pkl', 'wb') as pkl_file:
    pickle.dump(feature_names, pkl_file)