### Converting csv to txt

In [1]:
import csv

with open('BooksDataSet.csv', 'r',encoding="utf-8") as csv_file:
    csv_reader = csv.reader(csv_file)
    
    with open('BookSummaryDataset.txt', 'w',encoding="utf-8") as txt_file:
        for row in csv_reader:
            txt_file.write('\t'.join(row) + '\n')
with open('BookSummaryDataset.txt', 'r',encoding="utf-8") as txt_file:
        print(txt_file.read())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Converting txt to csv

In [2]:
import pandas as pd
df=pd.read_csv("BookSummaryDataset.txt",delimiter='\t')
df

Unnamed: 0,Doc_ID,Book_Name,Summary
0,0,Drowned Wednesday,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...
2995,2995,White Death,"A Novel from the NUMA files, A Kurt Austin Ad..."
2996,2996,Venus with Pistol,Gilbert Kemp is dealer specializing in antiqu...
2997,2997,Blackwater,"""How do you know when you're in too deep? Dav..."
2998,2998,The Rainbow and the Rose,The story concerns the life of Johnnie Pascoe...


## Preprocessing of raw data

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Abhinav
[nltk_data]     Gunti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Lowercasing

In [5]:
def lowercasing(df):
    df_lowercasing=df.copy()
    df_lowercasing['Book_Name'] = df['Book_Name'].apply(str.lower)
    df_lowercasing['Summary'] = df['Summary'].apply(str.lower)
    return df_lowercasing
df_lowercasing=lowercasing(df)
df_lowercasing

Unnamed: 0,Doc_ID,Book_Name,Summary
0,0,drowned wednesday,drowned wednesday is the first trustee among ...
1,1,the lost hero,"as the book opens, jason awakens on a school ..."
2,2,the eyes of the overworld,cugel is easily persuaded by the merchant fia...
3,3,magic's promise,the book opens with herald-mage vanyel return...
4,4,taran wanderer,taran and gurgi have returned to caer dallben...
...,...,...,...
2995,2995,white death,"a novel from the numa files, a kurt austin ad..."
2996,2996,venus with pistol,gilbert kemp is dealer specializing in antiqu...
2997,2997,blackwater,"""how do you know when you're in too deep? dav..."
2998,2998,the rainbow and the rose,the story concerns the life of johnnie pascoe...


In [None]:
print("After Loweringcasing : \n",df_lowercasing["Summary"][3],"\n\n")

### Tokenization

In [None]:
from nltk.tokenize import WordPunctTokenizer
# pd.set_option('mode.chained_assignment', None)

def tokenization(df_lowercasing):
    df_tokenized=df_lowercasing.copy()
    corpus=df_tokenized["Summary"].values
    tokenized_summaries = []
    for i in range(len(corpus)):
        # use loc accessor to modify original DataFrame directly
        tokens = WordPunctTokenizer().tokenize(corpus[i])
        tokenized_summaries.append(tokens)
    df_tokenized["Summary"] = tokenized_summaries
    return df_tokenized

df_tokenized=tokenization(df_lowercasing)
df_tokenized


In [None]:
print(" ".join(df_tokenized["Summary"][2997]),"\n",len(df_tokenized["Summary"][2997]))

In [None]:
print("After tokenization : \n"," ".join(df_tokenized["Summary"][3]),"\n",len(df_tokenized["Summary"][3]))

### Removing punctuation

In [None]:
import string
def remove_punctuation(df_tokenized):
    df_punc=df_tokenized.copy()
    translator =str.maketrans('', '', string.punctuation+" ")
    for i in range(len(df_tokenized['Summary'])):
        df_punc['Summary'][i] = [token.translate(translator) for token in df_tokenized['Summary'][i]]
        df_punc['Summary'][i] = list(filter(None, df_punc['Summary'][i])) #to remove space that got generated while removing punctuation

    return df_punc
df_punc=remove_punctuation(df_tokenized)
df_punc

In [None]:
print(" ".join(df_punc["Summary"][2997]),"\n",len(df_punc["Summary"][2997]))

In [None]:
print("After removing punctuation : \n"," ".join(df_punc["Summary"][3]),"\n",len(df_punc["Summary"][3]))

### Stopwords removal

In [None]:
from nltk.corpus import stopwords
print("List of all stopwords in english : \n",stopwords.words('english'))
print("Number of stopwords : ",len(stopwords.words('english')))

In [None]:
# print(df_punc['Summary'])
def avg_words_per_document(corpus,length):
    avg_words_per_doc=0
    for i in corpus:
        avg_words_per_doc+=len(i)
    avg_words_per_doc=avg_words_per_doc/length
    return avg_words_per_doc

print("Average number of words per document before stopword removal : ",avg_words_per_document(df_punc['Summary'],3000))

In [None]:
import threading
from functools import partial
from nltk.corpus import stopwords

def remove_stopwords(document):
    stop_words = set(stopwords.words('english'))
    return [word for word in document if word not in stop_words]

def process_documents(documents):
    threads = []
    for i, document in enumerate(documents):
        # create a partial function to call remove_stopwords with both i and document
        target_func = partial(remove_stopwords, document) #(func,list)
        thread = threading.Thread(target=lambda idx, func: documents.__setitem__(idx, func()), args=(i, target_func))#documents.__setitem__(idx, func()) will modify the list at specified index idx, setting it to thte result of calling function
        threads.append(thread)
        thread.start()
        
    for thread in threads:
        thread.join()

    return documents

df_sw=df_punc.copy()
corpus=df_sw['Summary']
df_sw['Summary'] = process_documents(corpus)
print(df_sw['Summary'])

In [None]:
df_sw['Summary']
print("Average number of words per document after stopword removal : ",avg_words_per_document(df_tokenized_sw['Summary'],3000))

In [None]:
print("After removing punctuation : \n"," ".join(df_punc["Summary"][3]),"\n",len(df_punc["Summary"][3]))

print("After removing stopwords : \n"," ".join(df_sw['Summary'][3]),"\n",len(df_sw["Summary"][3]))


### Stemming/Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
def lemmatization_df(df_sw):
    lemmatizer = WordNetLemmatizer()
    df_lemmaztized=df_sw.copy()
    for i in range(len(df_lemmaztized["Summary"])):
        df_lemmaztized['Summary'][i] = [lemmatizer.lemmatize(w) for w in df_sw["Summary"][i]]
    return df_lemmaztized
df_lemmaztized=lemmatization_df(df_sw)
df_lemmaztized

In [None]:
print("After Lemmatization : \n"," ".join(df_lemmaztized["Summary"][3]),"\n",len(df_lemmaztized["Summary"][3]))

## Inverted Index