In [None]:
import requests as r
from bs4 import BeautifulSoup
import re
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import sys
import pickle

In [None]:
# Substrings that mark a Wikipedia link as one the crawler should skip
ignore = [
    "%",
    "File",
    "User",
    "Special",
    "Category",
    "identifier",
    "Identifier",
    "Commons,",
    "Help",
    "Template",
    "Protection",
    "Wikipedia:",
    "Portal",
    "disambiguation",
]

# Seed articles for the JEE Advanced mathematics topics
_WIKI_ARTICLE_PREFIX = 'https://en.wikipedia.org/wiki/'
topics = [
    _WIKI_ARTICLE_PREFIX + title
    for title in ('Statistics', 'Geometry', 'Trigonometry', 'Algebra', 'Calculus')
]


In [None]:
# Used to extract the links from the url with some filters

def extract_links(url):
    """
    Extract the Wikipedia article links found on the page at ``url``.

    Only anchors whose ``href`` starts with ``/wiki/`` are considered. A link
    is dropped when its path contains any of the filter substrings listed
    below, or when the path itself occurs inside ``url`` (the page linking to
    itself).

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    list of str
        ``https://en.wikipedia.org/...`` URLs in document order, with any
        ``#fragment`` removed. Duplicates are kept.
    """
    # Kept local on purpose: the previous version declared ``global ignore`` and
    # rebound the module-level list on every call, silently changing shared state.
    # NOTE(review): "Commons," (with the trailing comma) is reproduced verbatim;
    # it looks like a typo for "Commons" - confirm before changing the filter.
    ignored_fragments = (
        "%", "File", "User", "Special", "Category", "identifier", "Identifier",
        "Commons,", "Help", "Template", "Protection", "Wikipedia:", "Portal",
        "disambiguation", "Glossary", "List",
    )

    page = r.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    hrefs = [str(anchor.get('href')) for anchor in soup.find_all('a', href=re.compile("^/wiki/"))]

    res = []
    for href in hrefs:
        # Self-reference: the link path is already part of the page's own URL.
        if href in url:
            continue
        if any(fragment in href for fragment in ignored_fragments):
            continue
        # Drop the '#section' part; it points into the same article.
        res.append(('https://en.wikipedia.org' + href).split('#')[0])

    return res

In [None]:
# This is used for extracting keywords for a particular page

from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import requests
from bs4 import BeautifulSoup

def keywords(url):
    """
    Return the most frequent non-stopword terms on the page at ``url``.

    The text of every ``<p>`` element is split on whitespace; tokens that
    contain ``(``, ``[`` or ``{`` are discarded. English stopwords are removed
    (case-insensitively), the remaining tokens are lemmatized with WordNet and
    the 12 most common lemmas are returned.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    list of str
        At most 12 distinct lemmas, most frequent first.
    """
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")

    # extend() keeps collection linear; the previous ``list + [word]`` copied the
    # whole list for every word. The bare ``except: pass`` around it was dropped.
    final_text = []
    for tag in soup.find_all("p"):
        final_text.extend(
            word
            for word in tag.get_text().strip().split()
            if "(" not in word and "[" not in word and "{" not in word
        )

    # removing stop words and finding root words
    stop = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stop_text = [lemmatizer.lemmatize(word) for word in final_text if word.lower() not in stop]

    # Counter keys are already unique, so the former ``list(set(...))`` only
    # scrambled the ranking into an arbitrary order; keep the frequency order.
    return [word for word, _count in Counter(stop_text).most_common(12)]

In [None]:
# Example: fetch the Wikipedia "Statistics" article (a live HTTP request) and
# print the keyword list that keywords() returns for it.
print(keywords("https://en.wikipedia.org/wiki/Statistics"))

['study', 'sample', 'statistic', 'probability', 'method', 'use', 'hypothesis', 'population', 'value', 'used', 'data', 'statistical']


In [None]:
# This is the main algorithm for scraping pages, it takes in the starting topics and the number of pages to be scraped
def bfs(url,limit,d,debug = False,frontier_fraction = 0.2):
    """
    Breadth-first crawl of Wikipedia starting from ``url``.

    Each newly visited page has its cleaned text stored in ``d`` and its
    outgoing links (from ``extract_links``) added as edges of an undirected
    graph. Only the first ``frontier_fraction`` of a page's links are queued
    for later visits, but every link becomes an edge. When the crawl ends, ``d``
    and the graph are pickled to ``<last URL segment>_dict.pkl`` and
    ``<last URL segment>_graph.pkl`` in the working directory.

    Parameters
    ----------
    url : str
        Starting page.
    limit : int
        Number of distinct pages to visit. The crawl also stops early if the
        queue of pending pages runs empty.
    d : dict
        Mutated in place: maps each visited URL to the text returned by
        ``get_content``. Existing entries are kept and end up in the pickle.
    debug : bool, optional
        When true, print every visited URL and its extracted text.
    frontier_fraction : float, optional
        Share of each page's links (taken from the front of the list) that
        are queued for crawling. Defaults to the original 0.2.

    Returns
    -------
    networkx.Graph
        The link graph built during the crawl.
    """
    from collections import deque

    to_visit = deque([url])   # popleft() is O(1); list.pop(0) shifted the whole list
    visited = set()           # only membership and size are needed
    G = nx.Graph()
    # The ``to_visit`` guard avoids an IndexError when the frontier is exhausted
    # before ``limit`` pages have been visited.
    while len(visited) < limit and to_visit:
        # Progress Percentage
        sys.stdout.write(f"\rNodes: {G.number_of_nodes()}, Visited: {len(visited) + 1} |")
        sys.stdout.flush()

        current = to_visit.popleft()
        if current in visited:
            continue

        # Note: the page is downloaded twice, once here and once in extract_links.
        content = get_content(current)
        d[current] = content
        links = extract_links(current)
        to_visit.extend(links[0:int(len(links)*frontier_fraction)])
        visited.add(current)
        G.add_node(current)
        G.add_edges_from((current, link) for link in links)

        # Printed only for pages fetched in this iteration, so ``content`` is
        # always the text of ``current`` rather than a stale value.
        if (debug):
            print(current)
            print('===========================================================================')
            print(content)
            print("\n************************************************************************\n")
    print()
    file_name = url.split('/')[-1]
    with open(f'{file_name}_dict.pkl','wb') as f:
        pickle.dump(d,f)
    with open(f'{file_name}_graph.pkl','wb') as f:
        pickle.dump(G,f)
    return G


In [None]:
# Import packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import nltk
# English vocabulary used by get_content() to drop tokens that are not English
# words. Needs the NLTK "words" corpus to be available locally.
words = set(nltk.corpus.words.words())


# Fetches a page and reduces it to plain lower-case English text: HTML tags,
# bracketed/parenthesised asides, digits, LaTeX commands and non-English
# characters are removed.
def get_content(url):
    """
    Download ``url`` and return the cleaned text of its paragraphs.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    str
        Space-separated tokens from the page's ``<p>`` elements, lower-cased.
        Alphabetic tokens are kept only when they occur in the module-level
        ``words`` vocabulary; non-alphabetic tokens (e.g. apostrophes) are kept.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(url) as response:
        source = response.read()

    # Make a soup
    soup = BeautifulSoup(source,'lxml')

    # Plain text of all paragraphs, concatenated without a separator.
    text = ''.join(str(paragraph.text).lower() for paragraph in soup.find_all('p'))

    # Asides in [...], {...} and (...) on a single line, e.g. citation markers.
    text = re.sub(r"\[.*?\]+", ' ', text)
    text = re.sub(r"\{.*?\}+", ' ', text)
    text = re.sub(r"\(.*?\)+", ' ', text)
    # Digits and Greek letters.
    text = re.sub(r"[0-9]+", ' ', text)
    text = re.sub(r"[Α-Ωα-ω]",' ',text)
    # The original also ran re.sub(r"\[A-Za-z0-9]+", ...) at this point. Its
    # escaped bracket made it match only the literal text "[A-Za-z0-9]", which
    # cannot occur in lower-cased text, so it never did anything. The backslash
    # command removal it was evidently meant to perform happens two steps below.
    text = re.sub(r"[^\u0000-\u05C0\u2100-\u214F]+", ' ', text)
    # Backslash commands such as \displaystyle, removed together with their name.
    text = re.sub(r"\\[a-zA-Z]+", ' ', text)
    # Anything that is not a Latin letter, apostrophe, newline or space.
    text = re.sub("[^a-zA-Z'\n ]+", ' ', text)
    text = re.sub(r"\n", ' ', text)
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())

    return text

In [None]:
# Crawl outward from the Statistics article until 1000 pages have been visited.
# `d` starts empty and is filled by bfs() with {url: cleaned page text}; bfs() also
# pickles it to Statistics_dict.pkl and the link graph to Statistics_graph.pkl.
# `f` receives the returned link graph.
d = {}
f = bfs('https://en.wikipedia.org/wiki/Statistics',1000,d)


Nodes: 86056, Visited: 1000


In [None]:
# NOTE(review): this call reuses whatever `d` currently holds (the Statistics
# crawl when cells run top to bottom), so the Trigonometry_dict.pkl written here
# would also contain those entries. A later cell repeats this crawl with a fresh
# dict - this cell looks like a leftover; confirm before running it.
f = bfs('https://en.wikipedia.org/wiki/Trigonometry',1000,d)


In [None]:
# Fresh dict so Algebra_dict.pkl holds only the pages reached from Algebra.
d = {}
f = bfs('https://en.wikipedia.org/wiki/Algebra',1000,d)


Nodes: 147137, Visited: 1000


In [None]:
# Fresh dict so Trigonometry_dict.pkl holds only the pages reached from Trigonometry.
d = {}
f = bfs('https://en.wikipedia.org/wiki/Trigonometry',1000,d)


Nodes: 109783, Visited: 1000


In [None]:
# Fresh dict so Geometry_dict.pkl holds only the pages reached from Geometry.
d = {}
f = bfs('https://en.wikipedia.org/wiki/Geometry',1000,d)


Nodes: 72497, Visited: 1000


# NLP Feature Extraction on the scraped pages

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import nltk
# Make sure the NLTK stopword corpus is available locally.
# NOTE(review): the notebook also uses WordNetLemmatizer and nltk.corpus.words,
# but no visible cell downloads those corpora - confirm they are installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chaitanya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Accumulates the {url: page content} entries of every per-topic pickle
all_dict = {}

In [None]:
# Merge the {url: content} dictionaries written by bfs() for each topic into
# all_dict. Pages reached from several topics keep the entry loaded last.
# One loop replaces five copy-pasted stanzas that reused the name `stats` for the
# Calculus, Geometry and Trigonometry data; the per-topic globals `stats` and
# `algebra` are no longer created.
# NOTE(review): no visible cell crawls Calculus, so Calculus_dict.pkl must come
# from an earlier run - confirm it exists.
# pickle.load can execute arbitrary code; only load files produced by this notebook.
for topic in ('Statistics', 'Algebra', 'Calculus', 'Geometry', 'Trigonometry'):
    with open(f'{topic}_dict.pkl','rb') as f:
        all_dict.update(pickle.load(f))

In [None]:
# One row per crawled page: 'Link' is the dictionary key (URL), 'Content' its value.
topics_dataset = pd.DataFrame(all_dict.items(),columns=['Link','Content'])

In [None]:
# Applying stemming operation on all the content in the dictionary

port_stem = PorterStemmer()

def stemming(Content):
    """
    Remove English stopwords from ``Content`` and Porter-stem what remains.

    Parameters
    ----------
    Content : str
        Whitespace-separated text. Stopword matching is case-sensitive, so the
        text is expected to be lower-cased already.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Build the stopword set once per call. The previous version re-read the
    # stopword list for every single word and searched it linearly.
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(
        port_stem.stem(word)
        for word in Content.split()
        if word not in english_stopwords
    )

In [None]:
# Running the stem function on all content in the dictionary.
# Iterates over the frame's own index instead of a hard-coded 3984, and writes
# through .at instead of chained indexing (topics_dataset['Content'][i] = ...),
# which pandas warns about and which does not update the frame under copy-on-write.
# Note: the column is overwritten in place, so re-running this cell stems the
# already-stemmed text a second time.
for done, idx in enumerate(topics_dataset.index, start=1):
    topics_dataset.at[idx, 'Content'] = stemming(topics_dataset.at[idx, 'Content'])
    sys.stdout.write(f"\rNumber of Articles completed: {done}")
    sys.stdout.flush()


Number of Articles completed: 3984

In [None]:
# Persist the stemmed DataFrame to dataframe.pkl so the stemming pass does not
# have to be repeated in a later session.
with open('dataframe.pkl','wb') as f:
  pickle.dump(topics_dataset,f)

In [None]:
# Reload the stemmed DataFrame from dataframe.pkl, replacing the in-memory copy.
# pickle.load can execute arbitrary code; only load files produced by this notebook.
with open('dataframe.pkl','rb') as f:
  topics_dataset = pickle.load(f)

In [None]:
# Preview the first rows (link and stemmed content) of the dataset.
topics_dataset.head()

Unnamed: 0,Link,Content
0,https://en.wikipedia.org/wiki/Statistics,statist disciplin collect organ analysi interp...
1,https://en.wikipedia.org/wiki/Outline_of_stati...,statist field inquiri collect analysi interpre...
2,https://en.wikipedia.org/wiki/Notation_in_prob...,probabl theori statist commonli use addit stan...
3,https://en.wikipedia.org/wiki/Normal_distribution,normal distribut probabl distribut use model p...
4,https://en.wikipedia.org/wiki/Probability_dens...,probabl theori probabl densiti function densit...


In [None]:
# The 'Content' column as an array of document strings: the corpus for TF-IDF.
# Note: a later cell rebinds `X` to the TF-IDF matrix.
X = topics_dataset['Content'].values

In [None]:
# Initializing the TFIDF vectorizer with scikit-learn's default settings

vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the page text and build the TF-IDF matrix in one step.
# The text is read from topics_dataset rather than from `X`, so this cell can be
# re-run safely: previously it both consumed and overwrote `X`, and a second run
# tried to fit on the sparse matrix. `X` still ends up as the TF-IDF matrix.
X = vectorizer.fit_transform(topics_dataset['Content'].values)

In [None]:
# Shape of the TF-IDF matrix: (number of documents, vocabulary size).
print(X.shape)

(3984, 33526)


# Reducing the dimension of the feature vector

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate a second vectorizer that tokenizes on words and drops
# scikit-learn's built-in English stop words while building the vocabulary.
tfidfvectorizer = TfidfVectorizer(analyzer='word' , stop_words='english',)
# Fit on the document text itself. The previous version fitted on `X`, which by
# this point is the sparse TF-IDF matrix from the earlier cell, not text, so the
# cell only worked when the notebook was executed out of order.
tfidf_term_vectors = tfidfvectorizer.fit_transform(topics_dataset['Content'].values)
print("Sparse Matrix form of test data : \n")
# toarray() returns a plain ndarray. todense() returned a numpy.matrix, whose rows
# stay 2-D and which recent scikit-learn versions refuse as estimator input.
# The dense copy holds one float per (document, term) pair, so it is large.
H = tfidf_term_vectors.toarray()
print(H)

Sparse Matrix form of test data : 

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
from scipy import spatial

# Sanity check: cosine similarity (1 - cosine distance) between the TF-IDF
# vectors of the first two documents.
result = 1 - spatial.distance.cosine(H[0], H[1])
print(result)

0.528824942805929


In [None]:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer


import sys
from time import time

import numpy as np

In [None]:
# Number of latent dimensions kept by the SVD.
true_k= 1000

# The previous version stored this value in a variable named `r`, which
# overwrote the `import requests as r` alias from the first cell and broke
# extract_links() for the rest of the session. Use true_k directly instead.
t0 = time()
# TruncatedSVD uses a randomized solver by default; fixing the seed makes the
# reduced vectors reproducible between runs.
svd = TruncatedSVD(true_k, random_state=42)
# Rescale every reduced vector to unit length, in place.
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
SVD_Y = lsa.fit_transform(H)
print("done in %fs" % (time() - t0))



done in 124.872466s


In [None]:
# Show the reduced, length-normalized document vectors (one row per document).
print(SVD_Y)

[[ 3.79153645e-01 -1.54965102e-01  6.40043413e-01 ... -2.17995942e-03
  -2.69175213e-03  2.22546940e-03]
 [ 2.07989094e-01 -4.10318776e-02  3.45999297e-01 ... -2.09904371e-02
  -5.86431279e-03 -1.16527462e-02]
 [ 2.40369735e-01 -1.74622356e-01  3.00602041e-01 ... -1.88630309e-02
  -2.90126714e-03  1.06300153e-04]
 ...
 [ 5.81045198e-01  4.65712212e-01 -9.05462094e-02 ...  2.24885241e-03
   1.91776991e-02  7.43007054e-03]
 [ 1.83982341e-01  1.48879790e-01 -5.33076716e-02 ... -5.14645717e-03
   1.16747696e-02  9.37588025e-03]
 [ 1.83982341e-01  1.48879790e-01 -5.33076716e-02 ... -5.14645717e-03
   1.16747696e-02  9.37588025e-03]]


In [None]:
# Building a new graph on the basis of how similar two feature vectors are.
# Nodes are document row numbers; two different documents are connected when
# the cosine similarity of their reduced vectors is strictly greater than the
# threshold.

import networkx as nx
import numpy as np

SIMILARITY_THRESHOLD = 0.5

n_docs = SVD_Y.shape[0]          # replaces the hard-coded 3984
g = nx.Graph()
g.add_nodes_from(range(n_docs))

# Cosine similarity of all pairs at once: scale each row to unit length, then
# take dot products. All-zero rows stay zero and therefore get no edges.
# This replaces roughly n_docs**2 Python-level calls to scipy's cosine().
row_norms = np.linalg.norm(SVD_Y, axis=1, keepdims=True)
unit_rows = np.divide(SVD_Y, row_norms, out=np.zeros_like(SVD_Y, dtype=float), where=row_norms > 0)
similarity = unit_rows @ unit_rows.T

# The graph is undirected, so the strict upper triangle (k=1) is enough. It also
# leaves out the diagonal, i.e. the self-loops the original "put i != j logic"
# note asked to remove (every document is trivially similar to itself).
rows, cols = np.nonzero(np.triu(similarity > SIMILARITY_THRESHOLD, k=1))
# tolist() yields plain Python ints so node labels match those from range().
g.add_edges_from(zip(rows.tolist(), cols.tolist()))

 




In [None]:
import pickle
import networkx as nx
# Load a previously pickled graph into `g`, replacing any graph built earlier in
# this session.
# NOTE(review): no visible cell writes graph.pkl - confirm where it comes from.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open('graph.pkl','rb') as f:
  g = pickle.load(f)

# Centrality Metrics

In [None]:
# Total number of edges in the graph `g`.
g.number_of_edges()

24895

In [None]:
# Clustering coefficient of every node, printed as a {node: coefficient} dict.
print(nx.clustering(g))

{0: 0.3023255813953488, 1: 0.8666666666666667, 2: 1.0, 3: 0.5380952380952381, 4: 0.6267806267806267, 5: 0.6041666666666666, 6: 1.0, 7: 1.0, 8: 0.6856330014224751, 9: 0.7777777777777778, 10: 0.3306878306878307, 11: 0.25, 12: 0.5555555555555556, 13: 0.5256410256410257, 14: 1.0, 15: 0.8181818181818182, 16: 0.45454545454545453, 17: 0.26666666666666666, 18: 0.6572199730094467, 19: 0.6029411764705882, 20: 0, 21: 0.5606060606060606, 22: 0.2, 23: 0.675, 24: 0, 25: 0, 26: 0.40942028985507245, 27: 0.4672268907563025, 28: 0.45789473684210524, 29: 0.6666666666666666, 30: 0.5934065934065934, 31: 0.9285714285714286, 32: 0.9285714285714286, 33: 0.43636363636363634, 34: 0.9565217391304348, 35: 0, 36: 0.44126984126984126, 37: 0.5, 38: 0, 39: 0.7492063492063492, 40: 0.45454545454545453, 41: 0.675, 42: 0.45789473684210524, 43: 0.43333333333333335, 44: 0.8214285714285714, 45: 0.8214285714285714, 46: 0.4065934065934066, 47: 0.3333333333333333, 48: 0.3333333333333333, 49: 0, 50: 0, 51: 1.0, 52: 1.0, 53: 0.5

In [None]:
# Average clustering coefficient of the graph, as a single number.
print(nx.average_clustering(g))

0.4842323536989113


In [None]:
# Modularity matrix of `g` as computed by networkx (one row/column per node).
print(nx.modularity_matrix(g))

[[ 9.57734795e-01  9.89433699e-01 -7.68458281e-03 ... -7.68458281e-02
  -1.92114570e-03 -1.92114570e-03]
 [ 9.89433699e-01  9.97358425e-01 -1.92114570e-03 ... -1.92114570e-02
  -4.80286425e-04 -4.80286425e-04]
 [-7.68458281e-03 -1.92114570e-03  9.98602803e-01 ... -1.39719687e-02
  -3.49299218e-04 -3.49299218e-04]
 ...
 [-7.68458281e-02 -1.92114570e-02 -1.39719687e-02 ...  8.60280313e-01
  -3.49299218e-03 -3.49299218e-03]
 [-1.92114570e-03 -4.80286425e-04 -3.49299218e-04 ... -3.49299218e-03
   9.99912675e-01  9.99912675e-01]
 [-1.92114570e-03 -4.80286425e-04 -3.49299218e-04 ... -3.49299218e-03
   9.99912675e-01  9.99912675e-01]]


In [None]:
# Closeness centrality of every node, printed as a {node: centrality} dict.
print(nx.closeness_centrality(g))

{0: 0.12544982843752614, 1: 0.11773316188940473, 2: 0.10903515239342, 3: 0.12205049023388072, 4: 0.11849979068078667, 5: 0.11513809449126079, 6: 0.0007532011046949536, 7: 0.0010042681395932714, 8: 0.11841895014348343, 9: 0.10918815086535154, 10: 0.12279433606868478, 11: 0.10261432012797148, 12: 0.10659383627464432, 13: 0.10691706215930163, 14: 0.10845347786486376, 15: 0.11178992935372141, 16: 0.10835453346462214, 17: 0.10153849504743612, 18: 0.1183940983176401, 19: 0.11939005946427281, 20: 0.0005706068974961769, 21: 0.09856693954460646, 22: 0.11367073916654344, 23: 0.12150503521075923, 24: 0.09491846193043746, 25: 0.0, 26: 0.1241181185817183, 27: 0.12149849318549617, 28: 0.11944061366295561, 29: 0.1090509599349645, 30: 0.10905623013407212, 31: 0.10495286330562394, 32: 0.10495286330562394, 33: 0.0973382010065229, 34: 0.11365928850278133, 35: 0.0, 36: 0.13926138693743645, 37: 0.12390684789886999, 38: 0.0, 39: 0.11583550710611469, 40: 0.10835453346462214, 41: 0.12150503521075923, 42: 0.11