In [None]:
import nltk
# Download every NLTK data package. Later cells use the stopwords, words and
# WordNet resources as well as nltk.word_tokenize.
nltk.download('all')

## Reading File and creating DataFrame

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import pickle
import nltk

In [None]:
import pandas as pd
# Load the Wikipedia sample; later cells read its 'title' and 'content' columns.
wiki_files = pd.read_csv('wiki_sample.csv')
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrap looks
# redundant — confirm nothing relies on wiki_files being a separate object.
wiki_dataframe = pd.DataFrame(wiki_files)
# Bare last expression: rendered as the cell output.
wiki_dataframe

## Preprocessing and Creating Vocab

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import pycountry
# Shared lemmatizer used by both preprocessing functions in this notebook.
lemmatizer = WordNetLemmatizer()
from names_dataset import NameDatasetV1 # v1
# Name lookup; apply_function queries it for first and last names.
names = NameDatasetV1()

# English stop words, kept as a set because membership is tested once per token.
stop_words = set(stopwords.words('english'))
# Disabled: would extend stop_words with domain_stop_words, a name that is only
# assigned much further down the notebook.
# for domain_stop_word in domain_stop_words:
#     stop_words.add(domain_stop_word)

In [None]:
import pandas as pd
# Load the company list; only its 'name' column is read by later cells.
companies_file = pd.read_csv('companies_sorted.csv')
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrap looks
# redundant — confirm nothing relies on companies_file being a separate object.
companies_dataframe = pd.DataFrame(companies_file)
# Bare last expression: rendered as the cell output.
companies_dataframe

In [None]:
# Distinct values of the 'name' column; a later cell adds their lowercased
# string form to the accepted vocabulary.
companies = set(companies_dataframe['name'])

In [None]:
# Rebuilds the same English stop-word set that the preprocessing setup cell
# already assigned to stop_words.
stop_words = set(stopwords.words('english'))

In [None]:
# Raw NLTK word list as a set (original casing).
words = set(nltk.corpus.words.words())
# Accepted vocabulary, all lowercase: NLTK words, country names, company names.
lowerCasedWords = {entry.lower() for entry in nltk.corpus.words.words()}
lowerCasedWords.update(country.name.lower() for country in pycountry.countries)
# str() guards against non-string values in the company name column.
lowerCasedWords.update(str(company).lower() for company in companies)

In [None]:
# Token pipeline: lemmatize, lowercase, then keep person names or alphabetic,
# non-stop-word entries of the accepted vocabulary.
rejected_content = []
def apply_function(row, lenTitle):
    """Tokenize and filter one row's 'content', skipping its first lenTitle characters.

    Args:
        row: Mapping-like row exposing a 'content' string.
        lenTitle: Number of leading characters of 'content' to skip.

    Returns:
        List of kept tokens, lemmatized and lowercased, in document order.

    Side effects:
        Every discarded token is appended to the module-level ``rejected_content``.
    """
    kept_tokens = []
    body_text = row['content'][lenTitle:]
    for raw_token in nltk.word_tokenize(body_text):
        # NOTE(review): the lemmatizer sees the token in its original case and
        # lowercasing happens afterwards — confirm this order is intended.
        normalized = lemmatizer.lemmatize(raw_token).lower()

        # Person names are kept unconditionally (no stop-word or isalpha check).
        if names.search_first_name(normalized) or names.search_last_name(normalized):
            kept_tokens.append(normalized)
            continue

        if normalized in lowerCasedWords and normalized not in stop_words and normalized.isalpha():
            kept_tokens.append(normalized)
        else:
            rejected_content.append(normalized)

    return kept_tokens

In [None]:
# Tokenize every article row by row with a tqdm progress bar. The title length is
# passed as the number of leading characters to skip — presumably 'content'
# starts with the title text; TODO confirm.
wiki_dataframe['tokenized_content'] = wiki_dataframe.progress_apply(lambda row: apply_function(row, len(row['title'])), axis=1)

In [None]:
# Persist the tokenized frame so the preprocessing pass does not have to be repeated.
wiki_dataframe.to_pickle('./wiki_dataframe_augmented_nltk_corpus_to_remove_non-english.pkl')

In [None]:
# NOTE: from here on the name `pickle` refers to the third-party pickle5 module,
# not the standard-library pickle imported at the top of the notebook.
import pickle5 as pickle
# Reload the cached frame. Unpickling can execute arbitrary code — only load
# files this notebook produced.
with open("./wiki_dataframe_augmented_nltk_corpus_to_remove_non-english.pkl", "rb") as pickle_file:
    wiki_dataframe = pickle.load(pickle_file)
# Alternative loader, left disabled:
# wiki_dataframe = pd.read_pickle('./wiki_dataframe_augmented_nltk_corpus_to_remove_non-english.pkl')

In [None]:
# Display the reloaded frame as the cell output.
wiki_dataframe

## Creating Part of Inverted Index

In [None]:
from collections import defaultdict
from collections import Counter

# term -> list of document ids, one entry PER OCCURRENCE, so the number of
# repeats of an id is that term's frequency in the document.
inv_idx = defaultdict(list)
vocab = set()
# Heaps' law samples: (tokens seen so far, distinct terms seen so far).
# Starts at the empty corpus and gains one sample after each document, so the
# last sample covers the whole corpus (it used to stop one document short).
heaps_law_dataset = [(0, 0)]

# One Counter.most_common(1) result per document, i.e. [(term, count)] or []
# for an empty document; position i belongs to document id i + 1.
most_freq = []
total_words = 0
# Document ids are 1-based. `doc_id` replaces the former counter named `id`,
# which shadowed the builtin id() for the rest of the notebook.
for doc_id, document in enumerate(tqdm(wiki_dataframe['tokenized_content']), start=1):
    counter = Counter(document)
    most_occur = counter.most_common(1)
    most_freq.append(most_occur)
    for word in document:
        inv_idx[word].append(doc_id)
    total_words += len(document)
    vocab.update(document)
    heaps_law_dataset.append((total_words, len(vocab)))

In [None]:
# Each stored value is the raw Counter.most_common(1) result for that document
# (a list), not a bare term.
wiki_dataframe['most_frequent_term'] = most_freq

In [None]:
# Display the frame with the new 'most_frequent_term' column.
wiki_dataframe

In [None]:
from collections import OrderedDict
# Terms ordered by posting-list length, longest first. The values are the same
# list objects that inv_idx holds at this point.
inv_idx_ordered = OrderedDict(sorted(inv_idx.items(), key=lambda item: len(item[1]), reverse=True))

In [None]:
# NOTE: rebinds `words` (earlier: the set of NLTK corpus words) to the index
# terms in descending posting-list length.
words = list(inv_idx_ordered.keys())
# The 10000 terms with the longest posting lists.
# NOTE(review): 10000 is an unexplained cutoff — confirm how it was chosen.
domain_stop_words = words[0:10000]

## Heaps' Law

In [None]:
# Show the final (total tokens, vocabulary size) sample.
heaps_law_dataset[-1]

In [None]:
import matplotlib.pyplot as plt
import math

def heaps_law(list_to_graph):
    """Plot the second element of each sample against the first and save the figure.

    Args:
        list_to_graph: Iterable of indexable samples; ``sample[0]`` is used as
            the x value and ``sample[1]`` as the y value.

    Side effects:
        Draws on the current pyplot figure and writes
        ``heaps_law_words_from_nltk_english_corpus.png``.
    """
    samples = list(list_to_graph)
    token_totals = [sample[0] for sample in samples]
    vocab_sizes = [sample[1] for sample in samples]

    plt.plot(token_totals, vocab_sizes)
    # Both axes run from 1 up to the last sample's value.
    plt.xlim(1, token_totals[-1])
    plt.ylim(1, vocab_sizes[-1])
    plt.savefig("heaps_law_words_from_nltk_english_corpus.png")

In [None]:
# Plot and save the Heaps' law curve from the samples gathered while indexing.
heaps_law(heaps_law_dataset)

## Zipf's Law

In [None]:
import matplotlib.pyplot as plt
import math

def zipfs_law(list_to_graph):
    """Plot posting-list length against rank on log-log axes and save the figure.

    Args:
        list_to_graph: Terms in rank order (rank 1 first). When this is a
            mapping of term -> posting list, the lengths are taken from it.
            Otherwise (e.g. a plain list of terms) they are looked up in the
            module-level ``inv_idx_ordered``, as before.

    Side effects:
        Draws on the current pyplot figure and writes ``zipfs_law.png``.
    """
    from collections.abc import Mapping

    # Previously the counts always came from the global inv_idx_ordered, so
    # passing any other mapping plotted the wrong data or raised KeyError.
    postings = list_to_graph if isinstance(list_to_graph, Mapping) else inv_idx_ordered

    ranks = []
    frequencies = []
    for rank, word in enumerate(list_to_graph, start=1):
        ranks.append(rank)
        frequencies.append(len(postings[word]))

    plt.loglog(ranks, frequencies)
    plt.xlabel("rank")
    plt.ylabel("posting-list length")
    plt.savefig("zipfs_law.png")

In [None]:
# Plot and save the Zipf's law curve for the frequency-ordered index.
zipfs_law(inv_idx_ordered)

## Finishing Inverted Index

In [None]:
from collections import Counter

# most_freq already holds one most_common(1) entry per document from the
# index-building loop. This cell used to append the last document's entry a
# second time through the leaked loop variable `counter`, which left most_freq
# one element longer than the DataFrame and grew it on every re-run.
# It now only verifies that the two are aligned.
assert len(most_freq) == len(wiki_dataframe['tokenized_content']), (
    "most_freq is out of step with the documents; re-run the index-building cell"
)

In [None]:
# Replace each term's flat posting list in inv_idx with a Counter of
# document id -> number of occurrences. inv_idx_ordered keeps the original lists.
for term, postings in tqdm(inv_idx_ordered.items()):
    inv_idx[term] = Counter(postings)

In [None]:
# Persist the finished inverted index. The context manager closes (and thereby
# flushes) the file; the bare open() used before never closed the handle.
with open("inv_idx_augmented_nltk_corpus_to_remove_non-english.pkl", "wb") as file_to_write:
    pickle.dump(inv_idx, file_to_write)

## TF-IDF
#### $TF(w, d) = \dfrac{freq(w, d)}{max_d}$
#### $IDF(w) = \log_2\left(\dfrac{N}{n_w}\right)$

In [None]:
# Number of rows in the frame; tf_idf uses this value as the document count.
len(wiki_dataframe)

In [None]:
import math
def tf_idf(term, doc, index=None, max_freqs=None, num_docs=None):
    """Return the TF-IDF weight of ``term`` in document ``doc``.

    TF  = freq(term, doc) / (count of the most frequent term in doc)
    IDF = log2(num_docs / number of documents containing term)

    Args:
        term: Index term.
        doc: 1-based document id, as assigned while building the index.
        index: Mapping term -> {doc id: occurrences}. Defaults to the
            module-level ``inv_idx`` (after its lists were turned into Counters).
        max_freqs: Sequence whose element ``doc - 1`` is that document's
            ``Counter.most_common(1)`` result, i.e. ``[(term, count)]``.
            Defaults to the module-level ``most_freq``.
        num_docs: Corpus size. Defaults to ``len(wiki_dataframe)``.

    Returns:
        The weight as a float; 0.0 when the term is unknown or absent from doc.
    """
    index = inv_idx if index is None else index
    max_freqs = most_freq if max_freqs is None else max_freqs
    num_docs = len(wiki_dataframe) if num_docs is None else num_docs

    # .get() avoids inserting an empty posting list into a defaultdict index.
    postings = index.get(term)
    if not postings:
        return 0.0
    term_count = postings.get(doc, 0)
    if term_count == 0:
        return 0.0

    # max_freqs is 0-based while doc ids start at 1, and each entry is a list
    # [(term, count)] — dividing by the entry itself raised TypeError.
    max_count = max_freqs[doc - 1][0][1]
    return (term_count / max_count) * math.log2(num_docs / len(postings))

## Suggesting Queries

In [None]:
from os import listdir
from os.path import isfile, join

# Read every tab-separated file in the AOL log directory, in sorted file-name
# order, and concatenate once. This replaces three fragile points of the
# original cell:
#   * '...\Clean-Data-01.txt' relied on the invalid escape '\C' and on Windows
#     path separators; join() builds portable paths.
#   * listdir()[1:] assumed the first listed entry was Clean-Data-01.txt, but
#     listdir() returns entries in arbitrary order.
#   * DataFrame.append copied the whole frame per file and no longer exists in
#     pandas 2.x.
aol_log_dir = 'project_1_AOL_query_log'
aol_log_paths = sorted(join(aol_log_dir, file_name) for file_name in listdir(aol_log_dir))
aol_query_log = pd.concat(
    [pd.read_csv(path, sep="\t") for path in aol_log_paths if isfile(path)],
    ignore_index=True,
)

In [None]:
# Display the combined query log as the cell output.
aol_query_log

In [None]:
# Lemmatize, lowercase, then drop stop words and non-alphabetic tokens.
def query_logs_preprocessing(row):
    """Tokenize and filter the 'Query' field of one query-log row.

    Args:
        row: Mapping-like row exposing a 'Query' value; it is passed through
            str() first, so non-string values are tokenized as text.

    Returns:
        List of lemmatized, lowercased, alphabetic tokens that are not stop words.
    """
    filtered_content = []
    for token in nltk.word_tokenize(str(row['Query'])):
        token = lemmatizer.lemmatize(token).lower()
        # The original condition had an unbalanced ')' and did not parse.
        if token not in stop_words and token.isalpha():
            filtered_content.append(token)

    return filtered_content

In [None]:
# Tokenize every logged query row by row, with a tqdm progress bar.
aol_query_log['Tokenized Query'] = aol_query_log.progress_apply(query_logs_preprocessing, axis=1)

In [None]:
# Token lists of every logged query, as an array.
# NOTE: later cells rebind aol_queries to the raw 'Query' strings.
aol_queries = aol_query_log['Tokenized Query'].values
# Module-level list of candidate (AnonID, tokens) pairs.
candidate_queries = []
def identify_candidate_queries(query):
    """Find logged queries that are longer than ``query`` and share a term with it.

    Args:
        query: Whitespace-separated query string. It is split as-is and is not
            lemmatized or lowercased here.

    Returns:
        A new list of ``(AnonID, tokens)`` tuples, one per matching log row,
        where ``AnonID`` is the one-row selection
        ``aol_query_log.iloc[[index]]['AnonID']`` and ``tokens`` is that row's
        tokenized query.
    """
    query_terms = query.split()
    # Read the tokenized column directly: the global `aol_queries` is rebound
    # to raw query strings by later cells, which would turn the checks below
    # into character counts and substring tests.
    tokenized_queries = aol_query_log['Tokenized Query'].values

    # Fresh list per call. Appending to the module-level candidate_queries made
    # results from successive calls pile up in one shared list.
    matches = []
    for index, aol_query in enumerate(tokenized_queries):
        if len(aol_query) <= len(query_terms):
            continue
        # any() records a row once, even when several query terms match it.
        if any(term in aol_query for term in query_terms):
            matches.append((aol_query_log.iloc[[index]]['AnonID'], aol_query))
    return matches

In [None]:
# Example run: collect candidate queries for "gall bladder".
candidate_queries = identify_candidate_queries("gall bladder")

#### Ranking Candidates
##### $Score(CQ, q') = \dfrac{\#\text{ of sessions in which } q \text{ is modified to } CQ}{\#\text{ of sessions in which } q \text{ appears}}$

In [None]:
# Rebinds aol_queries from the token lists to the raw query strings.
aol_queries = aol_query_log['Query'].values

def rank_candidate_queries(original_query, candidates):
    """Score candidates for ``original_query`` (the scoring itself is still a stub).

    Args:
        original_query: The query the candidates were collected for; not used yet.
        candidates: Sized iterable of candidate queries.

    Returns:
        ``num_sessions_q_modified / len(candidates)``, or 0.0 when
        ``candidates`` is empty. The counter is never incremented yet, so the
        result is currently always 0.0.
    """
    # An empty candidate list used to raise ZeroDivisionError.
    if len(candidates) == 0:
        return 0.0

    num_sessions_q_modified = 0
    for query in candidates:
        # TODO: find session in query log from candidates list
        # TODO: check if it was changed to original_query
        #       num_sessions_q_modified += 1 if it was
        print(query)
    return num_sessions_q_modified / len(candidates)

In [None]:
# Display the set of company names.
# NOTE(review): this renders the entire set, which may be very large —
# consider showing len(companies) or a small sample instead.
companies

In [None]:
# Same assignment as in the ranking cell: raw query strings from the log.
aol_queries = aol_query_log['Query'].values

In [None]:
# Count the log entries whose raw query text is exactly 'cbc companies'.
numSessionsQAppears = sum(1 for aol_query in aol_queries if aol_query == 'cbc companies')

In [None]:
# Show the count computed in the previous cell.
numSessionsQAppears

In [None]:
# Scratch check: exact string equality between a query and a longer query
# that starts with it; evaluates to False.
'gall bladder' == 'gall bladder surgery'