TASK 1

In [1]:
pip install robotexclusionrulesparser

Collecting robotexclusionrulesparser
  Downloading robotexclusionrulesparser-1.7.1.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: robotexclusionrulesparser
  Building wheel for robotexclusionrulesparser (setup.py) ... [?25l[?25hdone
  Created wheel for robotexclusionrulesparser: filename=robotexclusionrulesparser-1.7.1-py3-none-any.whl size=12056 sha256=04feca7ef1f8137f604c670eb99beceb9941cc4332400d5bcfa5cb913c9405c3
  Stored in directory: /root/.cache/pip/wheels/2a/d6/38/051f91ac3af7f533633f694e6fe3c0de6cb0d493c3fb1d605a
Successfully built robotexclusionrulesparser
Installing collected packages: robotexclusionrulesparser
Successfully installed robotexclusionrulesparser-1.7.1


In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from robotexclusionrulesparser import RobotExclusionRulesParser
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
import pickle
import time
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def fetch_page(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a single
            unresponsive host would block the crawl indefinitely.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status). The failure is
        printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [4]:
# Beautiful Soup initialization helper
def create_soup(html):
    """Parse an HTML string into a BeautifulSoup tree.

    The markup is parsed with the ``'html.parser'`` backend and the resulting
    soup object is returned.
    """
    soup = BeautifulSoup(html, 'html.parser')
    return soup

In [5]:
# Polite parsing with robots.txt
def is_allowed(url, user_agent="*", timeout=10):
    """Check the site's robots.txt to see whether ``url`` may be crawled.

    Args:
        url: The URL we intend to fetch. Its ``/robots.txt`` is looked up on
            the same host.
        user_agent: User-agent string to evaluate the rules for.
        timeout: Seconds to wait for the robots.txt response. ``requests``
            has no default timeout, so this prevents an indefinite hang.

    Returns:
        The parser's verdict for ``(user_agent, url)``. If robots.txt cannot
        be fetched or parsed for any reason, the error is printed and
        ``False`` is returned, i.e. the crawler fails closed.
    """
    try:
        robots_url = urljoin(url, '/robots.txt')
        response = requests.get(robots_url, timeout=timeout)
        response.raise_for_status()
        rules = response.text

        rp = RobotExclusionRulesParser()  # parser object
        rp.parse(rules)

        return rp.is_allowed(user_agent, url)
    except Exception as e:
        print(f"Error checking for robots.txt: {e}")
        return False

In [6]:
# Crawling and extracting relevant info
def crawl_extract(soup):
    """Extract publication records from one parsed listing page.

    Looks for ``<ul class="list-results">`` and, inside it, every ``<li>``
    whose class attribute starts with ``'list-result-item list-result-item'``.

    Args:
        soup: Parsed page, as returned by ``create_soup``.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``authors``,
        ``profiles`` and ``publication_date``. The list is empty when the
        page has no results list. Entries without a title element are
        skipped; a missing link or date is stored as ``None`` instead of
        aborting the whole page.
    """
    list_results = soup.find('ul', class_='list-results')
    if list_results is None:
        return []

    def _is_result_item(css_class):
        # Beautiful Soup also calls a class_ filter with None for tags that
        # have no class attribute, so guard before using str methods.
        return css_class is not None and css_class.startswith('list-result-item list-result-item')

    publications = []
    # Every matching <li> is one publication on the page.
    for list_element in list_results.find_all('li', class_=_is_result_item):
        title_element = list_element.find(class_='title')
        if title_element is None:
            # Without a title the record cannot be indexed; skip it.
            continue

        link_element = list_element.find('a')  # first anchor in the entry
        date_element = list_element.find('span', class_='date')
        person_links = list_element.find_all('a', class_='link person')

        publications.append({
            'title': title_element.text.strip(),
            'link': link_element.get('href') if link_element is not None else None,
            # authors[i] and profiles[i] describe the same person link.
            'authors': [person.string for person in person_links],
            'profiles': [person.get('href') for person in person_links],
            'publication_date': date_element.text.strip() if date_element is not None else None,
        })

    return publications

In [7]:
# Crawl the paginated listing pages and return the collected publications
def crawl_pages(start_url, max_pages=10):
    """Crawl a paginated publication listing and collect every record.

    Pages are requested as ``{start_url}?page=N`` with ``N`` starting at 0.

    Args:
        start_url: Base URL of the listing (without the ``page`` parameter).
        max_pages: Upper bound on the number of pages fetched. At most
            ``max_pages`` pages are requested (pages ``0 .. max_pages - 1``).

    Returns:
        A list of publication dicts as produced by ``crawl_extract``, or
        ``None`` when ``is_allowed`` rejects ``start_url``.
    """
    if not is_allowed(start_url):
        print(f"Crawling is not allowed for: {start_url}")
        return None

    list_publications = []
    for page in range(max_pages):
        url = f"{start_url}?page={page}"
        html = fetch_page(url)
        if not html:
            # Fetch failed or returned an empty body: stop, keep what we have.
            break
        soup = create_soup(html)
        list_publications.extend(crawl_extract(soup))

        # Stop when the site offers no "next" link or the page budget is used
        # up; in both cases there is nothing left to wait for.
        has_next = soup.find('a', class_='next')
        if not has_next or page + 1 >= max_pages:
            break
        time.sleep(5)  # politeness delay between consecutive requests

    return list_publications

if __name__ == "__main__":
    # Url to start the crawling for CGL (centre-global-learning publications listing)
    start_url = "https://pureportal.coventry.ac.uk/en/organisations/centre-global-learning/publications/"

    # Crawl up to the configured number of pages, then tabulate the records.
    list_publications = crawl_pages(start_url, max_pages=10)
    df = pd.DataFrame(data=list_publications)

In [8]:
# Preview the first five crawled publications.
df.head(n=5)

Unnamed: 0,title,link,authors,profiles,publication_date
0,A revisit to the role of gender in moderating ...,https://pureportal.coventry.ac.uk/en/publicati...,"[Ayoubi, R., Crawford, M.]",[https://pureportal.coventry.ac.uk/en/persons/...,21 May 2023
1,Becoming Nigerian,https://pureportal.coventry.ac.uk/en/publicati...,"[Johnson, E., Ezeonyeka, G.]",[https://pureportal.coventry.ac.uk/en/persons/...,25 Mar 2023
2,Between sameness and difference: challenges fo...,https://pureportal.coventry.ac.uk/en/publicati...,"[Dang, Q., Morini, L.]",[https://pureportal.coventry.ac.uk/en/persons/...,11 Mar 2023
3,Coming out of the shadows: Investing in Englis...,https://pureportal.coventry.ac.uk/en/publicati...,"[Karakus, M.]",[https://pureportal.coventry.ac.uk/en/persons/...,29 May 2023
4,Foreword,https://pureportal.coventry.ac.uk/en/publicati...,"[Orsini-Jones, M.]",[https://pureportal.coventry.ac.uk/en/persons/...,1 May 2023


In [9]:
def _lowercase_all(names):
    """Return a new list with every name in ``names`` lower-cased."""
    return [name.lower() for name in names]


# Lower-case the titles and each author name in the frame.
df['title'] = df['title'].str.lower()
df['authors'] = df['authors'].apply(_lowercase_all)

In [12]:
# Preprocess text
def clean_text(text):
    """Normalise free text into a space-separated string of stemmed terms.

    Steps: lower-case, tokenize with ``word_tokenize``, strip ASCII
    punctuation from each token, keep alphanumeric tokens, drop English stop
    words, and stem with the Porter stemmer.

    Args:
        text: Raw text (a title, an author name, or a search query).

    Returns:
        The cleaned terms joined by single spaces; an empty string when no
        token survives.
    """
    text = text.lower()
    tokens = word_tokenize(text)

    # Strip punctuation *before* the alphanumeric filter. Filtering first
    # discards every token that contains punctuation (e.g. "well-being"),
    # which leaves the translation with nothing to do.
    table = str.maketrans('', '', string.punctuation)
    stripped = (token.translate(table) for token in tokens)
    # isalnum() is False for '', so pure-punctuation tokens disappear here.
    words = [token for token in stripped if token.isalnum()]

    stop_words = set(stopwords.words('english'))  # stop-word removal
    words = [word for word in words if word not in stop_words]

    stemmer = PorterStemmer()  # stemming to get the root word
    words = [stemmer.stem(word) for word in words]

    return ' '.join(words)

In [13]:
# term -> list of document indices containing that term (no duplicates,
# in first-seen order)
inverted_index = {}


def add_doc_to_inverted_index(term, doc_index):
    """Register ``doc_index`` under ``term`` in the module-level index.

    A new posting list is created the first time a term is seen, and a
    document index is appended only if it is not already in that list.
    """
    postings = inverted_index.setdefault(term, [])
    if doc_index in postings:
        return
    postings.append(doc_index)

# Pass 1: index every term of each cleaned title under the row's index label.
for idx, title_text in df['title'].items():
    for word in re.findall(r'\w+', clean_text(title_text)):
        add_doc_to_inverted_index(word, idx)

# Pass 2: index every term of each cleaned author name under the same label.
# This runs after the title pass so the posting lists grow in the same order.
for idx, authors_list in df['authors'].items():
    cleaned_authors = [clean_text(author) for author in authors_list]
    for author_text in cleaned_authors:
        for word in re.findall(r'\w+', author_text):
            add_doc_to_inverted_index(word, idx)


In [14]:
# Process user Query
def user_query(query):
    """Look up publications matching a free-text query.

    The query is normalised with ``clean_text`` and split into terms. The
    posting lists of all terms found in ``inverted_index`` are intersected.
    Terms that are absent from the index are ignored; they do not empty the
    result.

    Args:
        query: Raw search string typed by the user.

    Returns:
        The matching rows of ``df`` in ascending index order, or an empty
        ``pd.DataFrame()`` when nothing matches.
    """
    query_words = re.findall(r'\w+', clean_text(query))

    match_documents = None  # None until the first indexed term is seen
    for word in query_words:
        postings = inverted_index.get(word)
        if postings is None:
            continue
        if match_documents is None:
            match_documents = set(postings)
        else:
            match_documents.intersection_update(postings)

    if not match_documents:
        return pd.DataFrame()

    # The index stores row *labels* (from iterrows), so select with .loc;
    # positional .iloc is only equivalent for a default RangeIndex. Sorting
    # makes the row order independent of set iteration order.
    return df.loc[sorted(match_documents)]

In [15]:
# Search using the user's input query
query = input("Enter search query: ")

# Run the query against the inverted index and fetch the matching rows.
matching_publications = user_query(query)

if matching_publications.empty:
    print("No publications found.")
else:
    print("Matched Publications:")
    print(matching_publications)

Enter search query: Karakus
Matched Publications:
                                                title  \
3   coming out of the shadows: investing in englis...   
9   measuring student well-being in adolescence: p...   
11  obituary for zoltán dörnyei (1960–2022): a bib...   
16  the role of teacher selection criteria and pre...   
17  throwing light on fee-charging tutoring during...   
18  transformational school leadership: a systemat...   
19  understanding the academic achievement of the ...   
22  a bibliometric mapping of shadow education res...   

                                                 link        authors  \
3   https://pureportal.coventry.ac.uk/en/publicati...  [karakus, m.]   
9   https://pureportal.coventry.ac.uk/en/publicati...  [karakus, m.]   
11  https://pureportal.coventry.ac.uk/en/publicati...  [karakus, m.]   
16  https://pureportal.coventry.ac.uk/en/publicati...  [karakus, m.]   
17  https://pureportal.coventry.ac.uk/en/publicati...  [karakus, m.]   
18  