In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

from itertools import combinations   # for combinations

import pandas as pd                   # to import
import numpy as np                    # for unique()
from nltk.stem import PorterStemmer   # for stemming   

##### Creating a cleaning function that strips all non-alphabetical characters from a word

In [2]:
def cleaning(word):
    """Return ``word`` with every non-alphabetical character removed.

    Punctuation, digits, hyphens and whitespace are dropped, so tokens such as
    ``"car?"``, ``"Holidays!"``, ``"planets:"`` or ``"'yummy',"`` come back as
    ``"car"``, ``"Holidays"``, ``"planets"`` and ``"yummy"``. Letter case is
    left untouched.

    Parameters
    ----------
    word : str
        Raw token, e.g. one whitespace-separated chunk of a sentence.

    Returns
    -------
    str
        The alphabetical characters of ``word`` in their original order, or an
        empty string when ``word`` contains no letters at all.
    """
    # A single join avoids rebuilding the string one character at a time and
    # needs no (no-op) else branch for the characters that are skipped.
    return ''.join(letter for letter in word if letter.isalpha())

##### Importing English stop words. We could also use the stop words built into an NLP package, but those lists are not as extensive as the imported files

In [3]:
# Collecting all stopwords at hand together

# keep_default_na=False stops pandas from turning stop words that happen to
# look like missing-value markers (e.g. "null", "nan", "NA") into NaN.
# NOTE(review): each file is assumed to hold one stop word per line -- confirm.
read_options = dict(header=None, keep_default_na=False)

stopwords1 = pd.read_csv('stop-words_english_1_en.txt', **read_options)
stopwords2 = pd.read_csv('stop-words_english_2_en.txt', **read_options)
stopwords3 = pd.read_csv('stop-words_english_3_en.txt', **read_options)
stopwords4 = pd.read_csv('stop-words_english_4_google_en.txt', **read_options)
stopwords5 = pd.read_csv('stop-words_english_5_en.txt', **read_options)
stopwords6 = pd.read_csv('stop-words_english_6_en.txt', **read_options)

# Stack the six single-column frames on top of each other
stopwords = pd.concat([stopwords1, stopwords2, stopwords3, stopwords4, stopwords5, stopwords6], axis=0)

# Number the rows 1..N with a plain index. Wrapping the list in another list
# (``[list(range(...))]``) would make pandas build a one-level MultiIndex.
stopwords.index = range(1, len(stopwords) + 1)
stopwords.columns = ['word']

# Web Scraping part

### **Set the URL you want to webscrape from**

In [4]:
# arXiv abstract pages to scrape, one paper per entry
urls = [
    'https://arxiv.org/abs/2007.08978',
    'https://arxiv.org/abs/2006.03389',
    'https://arxiv.org/abs/2006.16964',
    'https://arxiv.org/abs/2007.04095',
    'https://arxiv.org/abs/2008.00315',
    'https://arxiv.org/abs/1901.08151',
    'https://arxiv.org/abs/1901.10555',
    'https://arxiv.org/abs/1511.03085',
    'https://arxiv.org/abs/1905.03061',
    'https://arxiv.org/abs/2002.01759',
]

##### Extracting the titles and the abstracts

In [5]:
abstracts = []
titles = []

for each_link in urls:

    # Connect to the URL; the timeout keeps a stalled connection from hanging
    # the notebook forever
    response = requests.get(each_link, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page,
    # which would leave `abstracts` and `titles` out of step with `urls`
    response.raise_for_status()
    # Parse HTML and save to BeautifulSoup object
    soup = BeautifulSoup(response.text, "html.parser")

    # the abstract is located inside a <blockquote>; collect every match
    # found on the page into the running list of abstracts
    abstracts += soup.find_all('blockquote')

    # the page <title> carries the paper title
    title = soup.find('title')
    if title is None:
        raise ValueError('No <title> element found at {}'.format(each_link))
    # store the title text itself rather than relying on iterating the tag
    titles.append(title.get_text())

##### Let's look at the extracted titles to see whether they need further cleaning

In [6]:
# Show the titles as scraped; each one still starts with its "[arXiv-id]" chunk
print(titles)

['[2007.08978] A large-scale comparative analysis of Coding Standard conformance in Open-Source Data Science projects', '[2006.03389] Computability and Non-monotone induction', '[2006.16964] Data Science: Nature and Pitfalls', '[2007.04095] Data science and the art of modelling', '[2008.00315] A fresh look at introductory data science', '[1901.08151] Cloud BI: Future of Business Intelligence in the Cloud', '[1901.10555] The Effects of Using Business Intelligence Systems on an Excellence Management and Decision-Making Process by Start-Up Companies: A Case Study', '[1511.03085] Big Data and Business Intelligence: Debunking the Myths', '[1905.03061] Generalized formal model of big data', '[2002.01759] Quality Assurance Technologies of Big Data Applications: A Systematic Literature Review']


In [7]:
for position, raw_title in enumerate(titles):
    # splitting the title into whitespace-separated chunks
    chunks = raw_title.split()
    # drop the leading "[arXiv-id]" chunk only when it is actually present, so
    # that re-running this cell does not eat the first real word of each title
    if chunks and chunks[0].startswith('[') and chunks[0].endswith(']'):
        chunks = chunks[1:]
    # rejoining the words of the title with single spaces
    titles[position] = ' '.join(chunks)

In [8]:
# Show the titles again to check that the leading "[arXiv-id]" chunk is gone
print(titles)

['A large-scale comparative analysis of Coding Standard conformance in Open-Source Data Science projects', 'Computability and Non-monotone induction', 'Data Science: Nature and Pitfalls', 'Data science and the art of modelling', 'A fresh look at introductory data science', 'Cloud BI: Future of Business Intelligence in the Cloud', 'The Effects of Using Business Intelligence Systems on an Excellence Management and Decision-Making Process by Start-Up Companies: A Case Study', 'Big Data and Business Intelligence: Debunking the Myths', 'Generalized formal model of big data', 'Quality Assurance Technologies of Big Data Applications: A Systematic Literature Review']


##### Now let's go on to the same procedures but with the extracted abstracts

In [9]:
# Show the scraped <blockquote> elements as they are, markup included
print(abstracts)

[<blockquote class="abstract mathjax">
<span class="descriptor">Abstract:</span>  Background: Meeting the growing industry demand for Data Science requires
cross-disciplinary teams that can translate machine learning research into
production-ready code. Software engineering teams value adherence to coding
standards as an indication of code readability, maintainability, and developer
expertise. However, there are no large-scale empirical studies of coding
standards focused specifically on Data Science projects. Aims: This study
investigates the extent to which Data Science projects follow code standards.
In particular, which standards are followed, which are ignored, and how does
this differ to traditional software projects? Method: We compare a corpus of
1048 Open-Source Data Science projects to a reference group of 1099 non-Data
Science projects with a similar level of quality and maturity. Results: Data
Science projects suffer from a significantly higher rate of functions that use
an

In [10]:
for position, block in enumerate(abstracts):
    # entries that are already plain strings were extracted by an earlier run
    # of this cell; skipping them keeps the cell safe to re-run
    if isinstance(block, str):
        continue
    # specifically extracting the Abstract: child 2 of the <blockquote> is the
    # text that follows the "Abstract:" descriptor <span>;
    # strip() removes the surrounding spaces, tabs and empty lines
    abstracts[position] = block.contents[2].strip()

In [11]:
# Inspect the first abstract after extraction (plain text, line breaks kept as "\n")
abstracts[0]

'Background: Meeting the growing industry demand for Data Science requires\ncross-disciplinary teams that can translate machine learning research into\nproduction-ready code. Software engineering teams value adherence to coding\nstandards as an indication of code readability, maintainability, and developer\nexpertise. However, there are no large-scale empirical studies of coding\nstandards focused specifically on Data Science projects. Aims: This study\ninvestigates the extent to which Data Science projects follow code standards.\nIn particular, which standards are followed, which are ignored, and how does\nthis differ to traditional software projects? Method: We compare a corpus of\n1048 Open-Source Data Science projects to a reference group of 1099 non-Data\nScience projects with a similar level of quality and maturity. Results: Data\nScience projects suffer from a significantly higher rate of functions that use\nan excessive numbers of parameters and local variables. Data Science pr

##### Now we extract each word in abstracts that are not stopwords

In [12]:
new_abstracts = []

# Build the stop-word lookup once. A set gives constant-time membership tests,
# whereas list(stopwords['word']) inside the loop was rebuilt and scanned from
# the start for every single word.
stopword_set = set(stopwords['word'])

for abstract_text in abstracts:

    kept_words = []

    for each_word in abstract_text.split():
        # cleaning words from any non-alphabetical signs and lower-casing them
        clean_word = cleaning(each_word).lower()

        # keep only non-empty words that are not stop words
        # (note: no stemming is applied here)
        if clean_word and clean_word not in stopword_set:
            kept_words.append(clean_word)

    # np.unique drops duplicate words and returns the remainder sorted;
    # the result is stored as one word list per abstract
    new_abstracts.append(list(np.unique(kept_words)))

In [13]:
# Display the filtered, de-duplicated words of the first abstract as one string
' '.join(new_abstracts[0])

'adherence aims background code codebases coding compare conclusions conjecture context conventions corpus crossdisciplinary data demand developer differ differences distinct empirical engineering excessive expertise extent focused follow functions group growing higher inappropriate indication industry investigates largescale learning level local machine maintainability maturity meeting method naming nondata numbers opensource parameters productionready projects quality rate readability reference requires science software standards studies study suffer teams traditional translate variable variables'

##### Now we can compute Jaccard and Cosine similarities on the cleaned words of each abstract

# Finding Similarities part & Recommendation system

## Jaccard Similarities

In [14]:
from scipy.spatial.distance import jaccard
from sklearn.metrics import jaccard_score

In [15]:
def Jaccard_Similarity(doc1, doc2):
    """Return the Jaccard similarity between two documents.

    Each document is treated as a set of words; the score is the number of
    words the two sets share divided by the number of distinct words across
    both of them.

    Parameters
    ----------
    doc1, doc2 : iterable of hashable
        The words of each document. Duplicates are ignored.

    Returns
    -------
    float
        A value between 0.0 (no shared words) and 1.0 (identical word sets).
        When both documents are empty there is nothing to compare and 0.0 is
        returned instead of raising ZeroDivisionError.
    """
    # List the unique words in each document
    words_doc1 = set(doc1)
    words_doc2 = set(doc2)

    # Find the union of the word sets of doc1 & doc2
    union = words_doc1 | words_doc2
    if not union:
        # Both documents are empty: the ratio is undefined, report "no similarity"
        return 0.0

    # Find the intersection of the word sets of doc1 & doc2
    intersection = words_doc1 & words_doc2

    # Jaccard score: size of the intersection divided by size of the union
    return float(len(intersection)) / len(union)

In [16]:
# Enumerate every unordered pair of paper positions (i, j) with i < j,
# so that each pair of abstracts is compared exactly once

possible_pairs = list(combinations(range(len(new_abstracts)), 2))

In [17]:
# Score every pair of papers with the Jaccard measure;
# each row is [first paper position, second paper position, score]

similarities = [
    [first, second, Jaccard_Similarity(new_abstracts[first], new_abstracts[second])]
    for first, second in possible_pairs
]

In [18]:
# Tabulate the pair scores, most similar pairs first
similarity_table = (
    pd.DataFrame(similarities, columns=['Item1', 'Item2', 'Similarities'])
    .sort_values('Similarities', ascending=False)
)
# Label the sorted rows 1..N so the index reads as a rank. A plain range is
# used because a list wrapped in another list makes pandas build a one-level
# MultiIndex instead of an ordinary index.
similarity_table.index = range(1, len(similarity_table) + 1)

similarity_table.head()

Unnamed: 0,Item1,Item2,Similarities
1,4,9,0.059603
2,6,8,0.059406
3,2,3,0.058824
4,2,7,0.057971
5,2,4,0.052083


##### So, according to the Jaccard similarity algorithm the above table shows similar papers based on the words in their abstracts

## Cosine Similarities

In [19]:
def cosine_similarity(x, y):
    """Return the cosine similarity between two documents' word sets.

    Each document is represented as a 0/1 vector over the combined vocabulary
    (1 when the word occurs in the document). For such vectors the dot product
    equals the number of shared words and each squared norm equals the number
    of distinct words in that document, so the score is
    ``|X & Y| / sqrt(|X| * |Y|)``.

    Parameters
    ----------
    x, y : iterable of hashable
        The words of each document. Duplicates are ignored.

    Returns
    -------
    float
        A value between 0.0 (no shared words) and 1.0 (identical word sets).
        When either document is empty its vector has zero length and the
        cosine is undefined; 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    X_set = set(x)
    Y_set = set(y)

    # An empty document has a zero vector: report "no similarity"
    if not X_set or not Y_set:
        return 0.0

    # dot product of the two 0/1 vectors = number of words present in both
    shared = len(X_set & Y_set)

    # cosine formula: dot product divided by the product of the vector norms
    return shared / float((len(X_set) * len(Y_set)) ** 0.5)

In [20]:
# Score every pair of papers with the cosine measure;
# each row is [first paper position, second paper position, score]

similarities = [
    [first, second, cosine_similarity(new_abstracts[first], new_abstracts[second])]
    for first, second in possible_pairs
]

In [21]:
# Tabulate the pair scores, most similar pairs first
similarity_table = (
    pd.DataFrame(similarities, columns=['Item1', 'Item2', 'Similarities'])
    .sort_values('Similarities', ascending=False)
)
# Label the sorted rows 1..N so the index reads as a rank. A plain range is
# used because a list wrapped in another list makes pandas build a one-level
# MultiIndex instead of an ordinary index.
similarity_table.index = range(1, len(similarity_table) + 1)

similarity_table.head()

Unnamed: 0,Item1,Item2,Similarities
1,2,4,0.121988
2,2,7,0.121046
3,2,3,0.113961
4,4,9,0.1125
5,6,8,0.112154


##### The above code finds all similar papers based on the abstracts of each paper using Cosine similarity

##### Comparing the two tables above: although the similarity values differ from one algorithm to the other, the overall result is much the same. The two algorithms rank the pairs in a different order, but the top 5 pairs are the same in both

One can choose the algorithm based on the given task and preferences.

**'Jaccard similarity is good for cases where duplication does not matter, cosine similarity is good for cases where duplication matters while analyzing text similarity. For two product descriptions, it will be better to use Jaccard similarity as repetition of a word does not reduce their similarity'.** Source: https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50