Task 1 : Web Crawling

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Build the URLs of the 6 publication-listing pages to crawl.
# The original loop used range(1, 6), which produced only 5 URLs (pages 1-5)
# although 6 pages were intended.
# NOTE(review): assumes the portal's `page` parameter is zero-based, i.e. that
# page=0 is the first results page -- confirm against the live site.
url1 = [
    f'https://pureportal.coventry.ac.uk/en/organisations/research-centre-for-computational-science-and-mathematical-modell/publications/?page={i}'
    for i in range(6)
]


# Crawl every listing page in url1 and collect one record per publication.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests can block forever on a stalled connection

publications = []
queue = list(url1)  # work on a copy so that draining the queue leaves url1 intact
while queue:
    url = queue.pop(0)
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if not response.ok:
        # Do not parse an error page as if it were a results page.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    for publication in soup.find_all('div', class_='result-container'):
        title_tag = publication.find('h3', class_='title')
        link_tag = publication.find('a')
        if title_tag is None or link_tag is None:
            # Container without a title or link: nothing usable to record.
            continue

        author_tags = publication.find_all('a', class_='link person')
        if not author_tags:
            # Entries without any linked author are skipped, as before.
            continue

        year_tag = publication.find('span', class_='date')
        publications.append({
            'title': title_tag.text.strip(),
            'link': link_tag.get('href'),
            'authors': [author.text.strip() for author in author_tags],
            'year': year_tag.text.strip() if year_tag is not None else '',
            # Profile link of the first linked author only.
            'author_profile_link': author_tags[0].get('href'),
        })

# Save the crawled records to disk, one CSV row per publication.
csv_columns = ['title', 'link', 'authors', 'year', 'author_profile_link']
with open('publication_information.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(publications)

In [None]:
import pandas as pd
# Reload the crawl results from disk and preview the first five rows.
df = pd.read_csv('publication_information.csv')
df.head()

Unnamed: 0,title,link,authors,year,author_profile_link
0,Levelwise construction of a single cylindrical...,https://pureportal.coventry.ac.uk/en/publicati...,"['England, M.']",25 Nov 2022,https://pureportal.coventry.ac.uk/en/persons/m...
1,Leveraging Arabic sentiment classification usi...,https://pureportal.coventry.ac.uk/en/publicati...,"['Palade, V.']",Nov 2022,https://pureportal.coventry.ac.uk/en/persons/v...
2,LIFT: lncRNA identification and function-predi...,https://pureportal.coventry.ac.uk/en/publicati...,"['Shuttleworth, J.', 'England, M.']",17 Jan 2022,https://pureportal.coventry.ac.uk/en/persons/j...
3,Machine Learning for Computer Algebra,https://pureportal.coventry.ac.uk/en/publicati...,"['Barket, R.', 'del Río, T.', 'England, M.']",2022,https://pureportal.coventry.ac.uk/en/persons/r...
4,Markov Chain Monte Carlo-Based Estimation of S...,https://pureportal.coventry.ac.uk/en/publicati...,"['Daneshkhah, A.']",2022,https://pureportal.coventry.ac.uk/en/persons/a...


In [None]:
#INVERTED INDEX

import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
# Inverted index: stemmed title term -> list of publication records whose
# title contains that term.
inverted_index = {}

stop_words = set(stopwords.words('english'))

# Porter stemmer, so that inflected forms of a word share one index entry.
ps = PorterStemmer()

for i, row in df.iterrows():
    title = row['title']
    if not isinstance(title, str):
        # An empty title cell is read back from the CSV as NaN and cannot be tokenized.
        continue

    # Tokenize the title, drop stop words, and stem what remains.
    title_tokens = [
        ps.stem(word.lower())
        for word in word_tokenize(title)
        if word.lower() not in stop_words
    ]

    # dict.fromkeys keeps first-seen order while dropping repeats, so a term that
    # occurs several times in one title adds this publication to its posting
    # list only once (posting-list length == number of titles with the term).
    for word in dict.fromkeys(title_tokens):
        inverted_index.setdefault(word, []).append({
            'title': title,
            'link': row['link'],
            'authors': row['authors'],
            'year': row['year'],
            'author_profile_link': row['author_profile_link'],
        })

# Save the index as CSV with one row per term: the term goes in 'word' and its
# posting list in 'publications'.
with open('inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['word', 'publications'])
    writer.writeheader()
    writer.writerows(
        {'word': term, 'publications': postings}
        for term, postings in inverted_index.items()
    )

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Reload the saved index from 'inverted_index.csv' and display it for inspection.
df1 = pd.read_csv('inverted_index.csv')
df1

Unnamed: 0,word,publications
0,levelwis,[{'title': 'Levelwise construction of a single...
1,construct,[{'title': 'Levelwise construction of a single...
2,singl,[{'title': 'Levelwise construction of a single...
3,cylindr,[{'title': 'Levelwise construction of a single...
4,algebra,[{'title': 'Levelwise construction of a single...
...,...,...
780,logic,[{'title': 'Adding Logical Operators to Tree P...
781,tree,[{'title': 'Adding Logical Operators to Tree P...
782,pattern,[{'title': 'Adding Logical Operators to Tree P...
783,queri,[{'title': 'Adding Logical Operators to Tree P...


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#QUERY PROCESSING

import pandas as pd
import math
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


import ast

# Read the inverted index from the CSV file.
# keep_default_na=False stops pandas from turning index terms such as "null",
# "nan" or "na" into missing values; dtype=str keeps every term a string.
inverted_index_df = pd.read_csv(
    'inverted_index.csv',
    dtype={'word': str},
    keep_default_na=False,
)

# Each 'publications' cell holds the text form of a list of dictionaries.
# ast.literal_eval only accepts Python literals, whereas eval() would execute
# any expression found in the file.
inverted_index_df['publications'] = inverted_index_df['publications'].apply(ast.literal_eval)

# term -> list of publication dictionaries
inverted_index = dict(zip(inverted_index_df['word'], inverted_index_df['publications']))

# Define the stemmer to be used
ps = PorterStemmer()

# Define a function to tokenize a string and remove stop words
def tokenize_and_remove_stopwords(text):
    """Turn a string into a list of stemmed search terms.

    The text is lower-cased and split with NLTK's ``word_tokenize``. English
    stop words are removed, and so are tokens that contain no letter or digit
    (for example "?" or ","). Previously such punctuation tokens were kept, so
    a query ending in "?" matched every title containing a question mark.
    The remaining tokens are stemmed with the module-level Porter stemmer ``ps``.

    Args:
        text: The string to tokenize (a query or a publication title).

    Returns:
        A list of stemmed tokens in their original order; repeats are kept.
    """
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text.lower())
    return [
        ps.stem(token)
        for token in tokens
        if token not in stop_words and any(char.isalnum() for char in token)
    ]

def search_index(query, verbose=False):
    """Rank publication titles against a free-text query using TF-IDF.

    The query is tokenized with ``tokenize_and_remove_stopwords``. Every title
    listed in the module-level ``inverted_index`` under at least one query term
    becomes a candidate. Each candidate's score is the sum over query terms of
    ``tf * idf * 100``, where ``tf`` is the term's share of the title's tokens.

    Args:
        query: The user's search string.
        verbose: When True, print each candidate title with its token count
            and score. Off by default, because the per-document diagnostics
            previously flooded the cell output.

    Returns:
        A list of ``(title, score)`` tuples sorted by descending score; empty
        when no query term occurs in the index.
    """
    query_tokens = tokenize_and_remove_stopwords(query)
    print("query_tokens:", query_tokens)

    # A query term that is not in the index matches no document. Skipping it
    # avoids the KeyError the idf lookup used to raise for unknown words.
    # Repeated query terms are kept, so each occurrence still contributes.
    known_tokens = [
        token for token in query_tokens
        if token in inverted_index and inverted_index[token]
    ]

    relevant_docs = set()
    for token in known_tokens:
        for doc in inverted_index[token]:
            relevant_docs.add(doc['title'])

    # idf depends only on the term, so compute it once per term, not per document.
    # NOTE(review): the numerator is the number of distinct index terms, not the
    # number of documents that a textbook idf uses. Left unchanged because the
    # score scale (and any threshold applied to it) depends on it.
    index_size = len(inverted_index)
    idf_by_token = {
        token: math.log(index_size / len(inverted_index[token]))
        for token in known_tokens
    }

    scores = {}
    for doc_title in relevant_docs:
        doc_tokens = tokenize_and_remove_stopwords(doc_title)
        doc_length = len(doc_tokens)
        if doc_length == 0:
            # Nothing to score; also avoids dividing by zero below.
            continue
        score = 0
        for token in known_tokens:
            tf = doc_tokens.count(token) / doc_length
            score += tf * idf_by_token[token] * 100
        scores[doc_title] = score
        if verbose:
            print(doc_title)
            print(doc_length)
            print(score)

    sorted_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_docs



In [None]:
'''
User inputed queries :
  Machine Learning algorithm
  household choice of fuel in states
  Centralised and decentralised sensor
  what is SSF?
'''
requested_query = "what is SSF?"
results = search_index(requested_query)

# Report every hit whose score is above 70; lower-scoring hits are not shown.
for title, score in results:
    if not score > 70:
        continue
    print("\nThe most relevant document related to query is :")
    print("\nTitle:", title, "\nSCORE:", score)


query_tokens: ['ssf', '?']
Among the last ones to leave? Understanding the Journeys of Muslim Children in the Care System in England
12
{'Among the last ones to leave? Understanding the Journeys of Muslim Children in the Care System in England': 43.99491130552098}
‘Come, Follow Me’, The Sacralising of the Home, and The Guardian of the Family: How Do European Women Negotiate the Domestic Space in the Church of Jesus Christ of Latter-Day Saints?
23
{'Among the last ones to leave? Understanding the Journeys of Muslim Children in the Care System in England': 43.99491130552098, '‘Come, Follow Me’, The Sacralising of the Home, and The Guardian of the Family: How Do European Women Negotiate the Domestic Space in the Church of Jesus Christ of Latter-Day Saints?': 22.953866768097903}
Evaluating assumptions of scales for subjective assessment of thermal environments – Do laypersons perceive them the way, we researchers believe?
15
{'Among the last ones to leave? Understanding the Journeys of Mus