<a href="https://colab.research.google.com/github/5ury4pr454th/Semantic-Detection-with-GloVe/blob/main/Corpus_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting semantically similar words using Stanford's GloVe

**NOTE!**: This Notebook is only for Corpus Extraction. Refer to the other notebook for training and execution.

In [None]:
import requests
import warnings

import wikipedia
from bs4 import BeautifulSoup
from wikipedia import DisambiguationError, PageError

import regex as re
import nltk
from nltk.corpus import stopwords

In [None]:
# after execution, restart runtime 
!pip install --upgrade wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11696 sha256=c7cda48a82567e0a034a99ff6dfa0e5f9cc7d32cdd63c95f1e884a1a3f226e56
  Stored in directory: /root/.cache/pip/wheels/15/93/6d/5b2c68b8a64c7a7a04947b4ed6d89fb557dcc6bc27d1d7f3ba
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [None]:
# Query Wikipedia for candidate titles of the root topic; the list of matching
# page titles is shown as the cell output.
main_category_hits = wikipedia.search("Computer Science")
main_category_hits

['Computer science',
 'Computer graphics (computer science)',
 'Semantics (computer science)',
 'Glossary of computer science',
 'Computer science and engineering',
 'Heuristic (computer science)',
 'State (computer science)',
 'Scope (computer science)',
 'Record (computer science)',
 'Integer (computer science)']

In [None]:
# using BeautifulSoup, extract and clean content to get list of subcategories
#
# Every <a> that is a direct child of an <li> and carries a `title` attribute is
# treated as a topic. Collection stops at the first title containing ':'; all
# titles before it are kept and everything from it onwards is dropped.
# NOTE(review): presumably the first ':' title marks the start of namespaced
# navigation links (e.g. "Category:...") - confirm against the live page.
r = requests.get(
  "https://en.wikipedia.org/wiki/Outline_of_computer_science",
  timeout=30,  # do not hang the notebook forever on a stalled connection
)
r.raise_for_status()  # fail loudly instead of parsing an HTTP error page

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(r.content, "html.parser")

topic_list = []
for link in soup.find_all('a'):
  if link.parent.name != 'li':
    continue
  title = link.get('title')
  if title is None:
    continue
  if ':' in title:
    break
  topic_list.append(title)

topic_list

In [None]:
# from each subcategory, get main text
#
# For every topic, take the first Wikipedia search hit and store that page's
# plain-text content under the topic name. Topics that cannot be resolved are
# skipped and recorded in `skipped_topics` so they can be inspected afterwards.

# Silences all warnings for the rest of the session.
# NOTE(review): presumably meant to hide warnings raised inside the `wikipedia`
# package - confirm, and consider narrowing the filter.
warnings.filterwarnings("ignore")

document_collection = dict()
skipped_topics = []

# A named counter is used instead of `_`, which IPython reserves for the last output.
for page_index, topic in enumerate(topic_list):
  try:
    search_hits = wikipedia.search(topic)
    if search_hits:
      document_collection[topic] = wikipedia.page(search_hits[0]).content
    else:
      # An empty search result used to raise IndexError and abort the whole crawl.
      skipped_topics.append(topic)
  except (DisambiguationError, PageError):
    # Best-effort crawl: ambiguous or missing pages are skipped, not fatal.
    skipped_topics.append(topic)
  if page_index % 10 == 0:
    print(f"{page_index} pages explored...")

print("Process Completed!")

0 pages explored...
10 pages explored...
20 pages explored...
30 pages explored...
40 pages explored...
50 pages explored...
60 pages explored...
70 pages explored...
80 pages explored...
90 pages explored...
100 pages explored...
110 pages explored...
120 pages explored...
130 pages explored...
Process Completed!


In [None]:
# corpus size: number of characters summed over every fetched page
total_number = sum(len(page_text) for page_text in document_collection.values())
total_number

3104054

In [None]:
# Stuff to do:
# Remove newline tags
# keep only alphabets
# convert all to lowercase
# remove all stopwords

# check for sample text
# Only the beginning of one page is shown: the full collection is roughly three
# million characters, too much to render usefully as a cell output.
sample_document = next(iter(document_collection.values()), "")
sample_document[:1000]



In [None]:
# remove numbers, remove \n, remove \' , remove ",.-: (exclusive signs)
#
# Exploratory preview only: `document_collection` is NOT modified here (the
# original loop discarded the result of re.sub, so it never modified anything
# either). The substitution is applied to a single page and the start of the
# result is displayed.
#
# Inside the character class, `+-:` is a RANGE from '+' to ':' in ASCII order,
# i.e. it matches + , - . / the digits 0-9 and ':'. Newlines are not matched
# by this pattern.
symbol_pattern = "[+-:;']"
first_document = next(iter(document_collection.values()), "")
re.sub(symbol_pattern, '', first_document)[:1000]

In [None]:
# check if it works (don't remove '.' or '-', important for showing discontinuity and including words connected by - as a single word)
my_string = "the q.123ui\'ck br-own fox\n\n jumped o+ver th-e l:az;y dog"

# Step 1: delete newline characters.
new_string = re.sub('\n','', my_string)

# Step 2: delete everything except ASCII letters, spaces, '.' and '-'.
# The pattern is a raw string so that `\.` and `\-` reach the regex engine
# unchanged instead of being (invalid) Python string escapes.
cleaned_string = re.sub(r'[^A-Za-z \.\-]','', new_string)
cleaned_string

'the q.uick br-own fox jumped over th-e lazy dog'

In [None]:
# creating a copy and removing symbols
# document_collections is the copy of original, while document_collection is the edited one

document_collections = document_collection.copy()

document_collection = dict()

for title, raw_text in document_collections.items():
  # Newlines become spaces rather than being deleted: deleting them glued the
  # last word of one line onto the first word of the next.
  single_line_text = re.sub('\n', ' ', raw_text)
  # Every character other than ASCII letters, space, '.' and '-' becomes a space.
  # Raw string: `\.` and `\-` are regex escapes, not Python string escapes.
  document_collection[title] = re.sub(r'[^A-Za-z \.\-]', ' ', single_line_text)

In [None]:
# removes stopwords

nltk.download('stopwords')
stopword_list = stopwords.words('english')

# One compiled pattern replaces the former one-regex-pass-per-stopword loop.
#  - IGNORECASE: the text is still mixed-case at this point, so "The"/"In"
#    must match their lowercase stopword entries.
#  - The lookarounds require that the word is not glued to a letter or a hyphen
#    on either side. Stopwords next to a space, a '.', or the start/end of the
#    text are removed; parts of hyphenated compounds are left intact.
#  - re.escape guards against any regex metacharacters in the stopword entries.
stopword_pattern = re.compile(
  r"(?<![A-Za-z\-])(?:"
  + "|".join(re.escape(word) for word in stopword_list)
  + r")(?![A-Za-z\-])",
  flags=re.IGNORECASE,
)

for title in list(document_collection):
  # The surrounding spaces/periods are kept, so neighbouring words stay separated.
  document_collection[title] = stopword_pattern.sub('', document_collection[title])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# normalise documents: lowercase everything and turn every '.' into a space
# (each value stays a single string; nothing is merged or split into a list here)
for title in list(document_collection):
  document_collection[title] = document_collection[title].lower().replace('.', ' ')

# Show a short excerpt of the first few documents instead of dumping the whole corpus.
{title: text[:200] for title, text in list(document_collection.items())[:3]}

In [None]:
# if you want to see each of the entries.

# from collections import Counter

# counter_dict = dict()
# for i in document_collection.keys():
#   counter_dict[i] = Counter(document_collection[i])
# counter_dict

In [None]:
# # Build context windows (at least three per word). Not required: windowing is already implemented by GloVe.

# def string_co(cont_words, window_size):
#   co_list = []
#   for i in range(len(cont_words)):
#     child_list = []
#     for j in range(-window_size,window_size+1):
#       if (i+j) >= 0 and (i+j) < len(cont_words):
#         child_list.append(cont_words[i+j])
#       else:
#         child_list.append('#PAD#')

#     co_list.append(child_list)
#   return co_list

# my_string = "the quick brown fox jumped over the lazy dog"
# string_co(list(my_string.split(' ')), 2)
# # Not required

# ################################### TUNABLE PARAMETER WINDOW SIZE!!! ##########################################

# window_word_collection = dict()

# for i in document_collection.keys():
#   window_word_collection[i] = string_co(document_collection[i], 2)
  
# window_word_collection
# window_word_collection

In [None]:
# create corpus and save as txt
# One document per line. The context manager closes the file even if a write
# fails; the file is opened write-only ("w") because it is never read back
# here, with an explicit encoding so the output does not depend on the
# platform default.
with open("corpus.txt", "w", encoding="utf-8") as corpus_file:
  for document_text in document_collection.values():
    corpus_file.write(document_text)
    corpus_file.write("\n")