# Content Relevance Analysis

In [1]:
import os
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportion_confint

## How many more documents are gathered by the second scraper version?

The second version of the web scraper extends the range of the scraper with an increased maximum depth of 3 recursively followed links (versus 2 in the first version) and with an additional start request (starting from "https://www.uni-mannheim.de/dws/"). This version gathers a lot more documents (HTMLs and PDFs).

In [2]:
def _names_with_extension(names, extension):
    """Keep the entries of ``names`` whose text after the last "." equals ``extension``."""
    return [name for name in names if name.split(".")[-1] == extension]


# Scraper version 1: every directory entry, then the HTML and PDF subsets.
v1_docs_list = os.listdir("data/scraped_data/")
v1_htmls = _names_with_extension(v1_docs_list, "html")
v1_pdfs = _names_with_extension(v1_docs_list, "pdf")
n_v1_docs_total, n_v1_htmls, n_v1_pdfs = len(v1_docs_list), len(v1_htmls), len(v1_pdfs)

# Scraper version 2: same breakdown.
v2_docs_list = os.listdir("data/scraped_data_v2/")
v2_htmls = _names_with_extension(v2_docs_list, "html")
v2_pdfs = _names_with_extension(v2_docs_list, "pdf")
n_v2_docs_total, n_v2_htmls, n_v2_pdfs = len(v2_docs_list), len(v2_htmls), len(v2_pdfs)

# One row per scraper version, one column per document category.
doc_counts = pd.DataFrame(
    [
        [n_v1_docs_total, n_v1_htmls, n_v1_pdfs],
        [n_v2_docs_total, n_v2_htmls, n_v2_pdfs],
    ],
    index=["version 1", "version 2"],
    columns=["n_docs_total", "n_HTMLs", "n_PDFs"],
)

display(doc_counts)

# Column-wise ratio "version 2" / "version 1".
increase_factors_n = doc_counts.loc["version 2"].div(doc_counts.loc["version 1"])

print("increase factors:")
display(increase_factors_n)

Unnamed: 0,n_docs_total,n_HTMLs,n_PDFs
version 1,272,111,161
version 2,1652,668,983


increase factors:


n_docs_total    6.073529
n_HTMLs         6.018018
n_PDFs          6.105590
dtype: float64

There are roughly 6 times more documents gathered from the second scraper version.

In [3]:
# calculate summed up file sizes for documents scraped by both scraper versions

def get_size_mb(docs_list, path_base):
    """
    Sum the on-disk sizes of the given files and return the total in megabytes.

    :param docs_list: iterable of file names located inside ``path_base``
    :param path_base: directory containing the files; a trailing path
        separator is optional because the paths are built with ``os.path.join``
    :return: total size in MB, where 1 MB = 1000**2 bytes (0.0 for an empty list)
    :raises OSError: if one of the files does not exist or is inaccessible
    """
    # os.path.join instead of string concatenation, so "dir" and "dir/" both work.
    total_size_bytes = sum(
        os.path.getsize(os.path.join(path_base, doc)) for doc in docs_list
    )
    return total_size_bytes / 1000**2

# Directory prefixes (with trailing slash); later cells prepend them to file names.
path_base_v1 = "data/scraped_data/"
path_base_v2 = "data/scraped_data_v2/"

# Summed sizes in MB per scraper version: all entries, HTMLs only, PDFs only.
s_v1_docs_total, s_v1_htmls, s_v1_pdfs = (
    get_size_mb(names, path_base=path_base_v1)
    for names in (v1_docs_list, v1_htmls, v1_pdfs)
)
s_v2_docs_total, s_v2_htmls, s_v2_pdfs = (
    get_size_mb(names, path_base=path_base_v2)
    for names in (v2_docs_list, v2_htmls, v2_pdfs)
)

# One row per scraper version, one column per document category.
doc_sizes = pd.DataFrame(
    [
        [s_v1_docs_total, s_v1_htmls, s_v1_pdfs],
        [s_v2_docs_total, s_v2_htmls, s_v2_pdfs],
    ],
    index=["version 1", "version 2"],
    columns=["mb_docs_total", "mb_HTMLs", "mb_PDFs"],
)

display(doc_sizes)

# Column-wise ratio "version 2" / "version 1".
increase_factors_s = doc_sizes.loc["version 2"].div(doc_sizes.loc["version 1"])

print("increase factors:")
display(increase_factors_s)

Unnamed: 0,mb_docs_total,mb_HTMLs,mb_PDFs
version 1,71.063682,9.025052,62.03863
version 2,1276.537296,42.757508,1233.778471


increase factors:


mb_docs_total    17.963287
mb_HTMLs          4.737647
mb_PDFs          19.887262
dtype: float64

The summed-up file sizes in the second version are substantially larger than in the first version. Interestingly, while the increase factor for the number of HTMLs is roughly equal to the increase factor for the number of PDFs (about 6.0 vs. 6.1), the increase factor for the summed-up file sizes is much higher for PDFs than for HTMLs (about 19.9 vs. 4.7).

## How many irrelevant documents are collected?

The scraper does not explicitly filter irrelevant documents. Irrelevant documents might lower the overall performance of the chatbot. This can happen by wasting time on processing irrelevant documents (e.g. generation of metadata and embeddings) and by confusing the LLM in the Q&A process with irrelevant documents.

The following code aims at estimating the proportion of relevant / irrelevant documents collected by both versions of the scraper.

### Sample documents

In [4]:
# Draw the documents that will be labeled by hand: one sample per scraper
# version and document type.
sample_size_v1_htmls = sample_size_v1_pdfs = 50
sample_size_v2_htmls = sample_size_v2_pdfs = 50


def _draw_labeling_sample(doc_names, sample_size):
    """Build an unlabeled frame over the sorted names and draw a seeded random subset, kept in sorted order."""
    candidates = pd.DataFrame({"doc": sorted(doc_names), "relevance": None})
    drawn = candidates.sample(n=sample_size, random_state=0)
    return drawn.sort_index()


v1_htmls_sample = _draw_labeling_sample(v1_htmls, sample_size_v1_htmls)
v1_pdfs_sample = _draw_labeling_sample(v1_pdfs, sample_size_v1_pdfs)
v2_htmls_sample = _draw_labeling_sample(v2_htmls, sample_size_v2_htmls)
v2_pdfs_sample = _draw_labeling_sample(v2_pdfs, sample_size_v2_pdfs)

In [5]:
# Preview the first rows of each sample; "relevance" is still empty at this point.
display(
    v1_htmls_sample.head(),
    v1_pdfs_sample.head(),
    v2_htmls_sample.head(),
    v2_pdfs_sample.head(),
)

Unnamed: 0,doc,relevance
1,www.uni-mannheim.de_en_academics_advice-and-se...,
2,www.uni-mannheim.de_en_academics_advice-and-se...,
3,www.uni-mannheim.de_en_academics_advice-and-se...,
4,www.uni-mannheim.de_en_academics_advice-and-se...,
6,www.uni-mannheim.de_en_academics_applying_the-...,


Unnamed: 0,doc,relevance
7,Aktualisiert_Stundenplan_MMDS_HWS2023.pdf,
8,Antrag_Anerkennung_B.Sc.Wifo_Stand_2023.pdf,
16,Appendix_M.Sc._Wifo_2023_24_09012024.pdf,
19,Bescheinigung_MMM_Englisch_de.pdf,
22,Flyer_Lehramt_Gymnasium_Uni_MA.pdf,


Unnamed: 0,doc,relevance
14,www.uni-mannheim.de_datascience_details_mcds-j...,
48,www.uni-mannheim.de_dws_news-archiv_dws-area-d...,
64,www.uni-mannheim.de_dws_news-archiv_dws-area-w...,
68,www.uni-mannheim.de_dws_news-archiv_dws-area-w...,
75,www.uni-mannheim.de_dws_news-archiv_dws-resear...,


Unnamed: 0,doc,relevance
14,08_31_Ringvorlesung_Data_Science_in_Action_HWS...,
27,160407_Praesentation_Wima_Master.pdf,
31,18-PerCom.pdf,
45,201800608_NO_EKiZi_.pdf,
55,2020.emnlp-main.4.pdf,


### Manually determine relevance of documents

#### HTMLs gathered by scraper version 1

In [6]:
# Batch 1/10 of the version-1 HTML sample: print the paths of rows 0-4 so the files can be opened and judged by hand.
i = 0
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_en_academics_advice-and-services_advice-for-students-with-disabilities-or-chronic-illnesses.html
data/scraped_data/www.uni-mannheim.de_en_academics_advice-and-services_advice-services-directory.html
data/scraped_data/www.uni-mannheim.de_en_academics_advice-and-services_changing-lanes.html
data/scraped_data/www.uni-mannheim.de_en_academics_advice-and-services_program-ambassadors.html
data/scraped_data/www.uni-mannheim.de_en_academics_applying_the-a-to-z-of-applying_confirming-you-are-still-eligible-to-take-exams-at-a-german-university.html


In [7]:
# Manual relevance labels for rows 0-4, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [8]:
# Batch 2/10 of the version-1 HTML sample: print the paths of rows 5-9 for manual review.
i = 5
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_en_academics_applying_the-a-to-z-of-applying_masters-programs-foreign-language-requirements.html
data/scraped_data/www.uni-mannheim.de_en_academics_applying_the-a-to-z-of-applying_selection-statutes.html
data/scraped_data/www.uni-mannheim.de_en_academics_dates_academic-calendar.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_examinations.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_examinations_examination-regulations.html


In [9]:
# Manual relevance labels for rows 5-9, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [10]:
# Batch 3/10 of the version-1 HTML sample: print the paths of rows 10-14 for manual review.
i = 10
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_examinations_extension-of-examination-deadlines.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_examinations_taking-an-examination.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_organizing-your-studies_disenrollment.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_organizing-your-studies_fees.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_organizing-your-studies_formalities.html


In [11]:
# Manual relevance labels for rows 10-14, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [12]:
# Batch 4/10 of the version-1 HTML sample: print the paths of rows 15-19 for manual review.
i = 15
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_organizing-your-studies_parallel-studies.html
data/scraped_data/www.uni-mannheim.de_en_academics_during-your-studies_student-services_express-service.html
data/scraped_data/www.uni-mannheim.de_en_gender-equality-and-equal-opportunity_information-for-students.html
data/scraped_data/www.uni-mannheim.de_infos-fuer_studieninteressierte.html
data/scraped_data/www.uni-mannheim.de_infos-fuer_studieninteressierte_studienwahl_orientierungsberatung.html


In [13]:
# Manual relevance labels for rows 15-19, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [14]:
# Batch 5/10 of the version-1 HTML sample: print the paths of rows 20-24 for manual review.
i = 20
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_infos-fuer_studieninteressierte_studienwahl_terminvereinbarung-alexandra-theobalt.html
data/scraped_data/www.uni-mannheim.de_studium_beratung-und-service.html
data/scraped_data/www.uni-mannheim.de_studium_beratung-und-service_spurwechsel.html
data/scraped_data/www.uni-mannheim.de_studium_bewerbung_bewerbung-von-a-bis-z_orientierungstest-lehrerorientierungstest.html
data/scraped_data/www.uni-mannheim.de_studium_bewerbung_bewerbung-von-a-bis-z_sprachkenntnisse-bachelorstudiengaenge.html


In [15]:
# Manual relevance labels for rows 20-24, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [16]:
# Batch 6/10 of the version-1 HTML sample: print the paths of rows 25-29 for manual review.
i = 25
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_studium_bewerbung_bewerbung-von-a-bis-z_unbedenklichkeitsbescheinigung.html
data/scraped_data/www.uni-mannheim.de_studium_im-studium_pruefungen_pruefungsordnungen.html
data/scraped_data/www.uni-mannheim.de_studium_im-studium_studienorganisation.html
data/scraped_data/www.uni-mannheim.de_studium_im-studium_studienorganisation_beitraege-und-gebuehren.html
data/scraped_data/www.uni-mannheim.de_studium_im-studium_studienorganisation_beitraege-und-gebuehren_studien-gebuehren-fuer-internationale-studierende.html


In [17]:
# Manual relevance labels for rows 25-29, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [18]:
# Batch 7/10 of the version-1 HTML sample: print the paths of rows 30-34 for manual review.
i = 30
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.uni-mannheim.de_studium_im-studium_studienorganisation_beurlaubung.html
data/scraped_data/www.uni-mannheim.de_studium_im-studium_studienorganisation_immatrikulation.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics_extension-of-deadlines.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics_learning-agreements.html


In [19]:
# Manual relevance labels for rows 30-34, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [20]:
# Batch 8/10 of the version-1 HTML sample: print the paths of rows 35-39 for manual review.
i = 35
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-mathematics-in-business-and-economics.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_learning-agreements.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_extension-of-deadlines.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-business-informatics_general-questions.html
data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_msc-mathematics-in-business-and-economics.html


In [21]:
# Manual relevance labels for rows 35-39, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [22]:
# Batch 9/10 of the version-1 HTML sample: print the paths of rows 40-44 for manual review.
i = 40
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_teacher-education-programs.html
data/scraped_data/www.wim.uni-mannheim.de_schlather_teaching_aktuelle-semester.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_b-sc-wirtschaftsinformatik_anerkennung-von-pruefungsleistungen.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_b-sc-wirtschaftsinformatik_fristverlaengerung.html


In [23]:
# Manual relevance labels for rows 40-44, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [24]:
# Batch 10/10 of the version-1 HTML sample: print the paths of rows 45-49 for manual review.
i = 45
for doc in v1_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_lehramtsstudium.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-business-informatics_general-questions.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_m-sc-wirtschaftsmathematik.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science.html
data/scraped_data/www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science_recognition-of-coursework-and-examinations.html


In [25]:
# Manual relevance labels for rows 45-49, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_htmls_sample.iloc[i: i+5, 1] = relevance

In [26]:
# Tally of assigned labels for the version-1 HTML sample (value_counts skips rows still unlabeled by default).
v1_htmls_sample["relevance"].value_counts()

relevance
1    50
Name: count, dtype: int64

#### PDFs gathered by scraper version 1

In [27]:
# Batch 1/10 of the version-1 PDF sample: print the paths of rows 0-4 so the files can be opened and judged by hand.
i = 0
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/Aktualisiert_Stundenplan_MMDS_HWS2023.pdf
data/scraped_data/Antrag_Anerkennung_B.Sc.Wifo_Stand_2023.pdf
data/scraped_data/Appendix_M.Sc._Wifo_2023_24_09012024.pdf
data/scraped_data/Bescheinigung_MMM_Englisch_de.pdf
data/scraped_data/Flyer_Lehramt_Gymnasium_Uni_MA.pdf


In [28]:
# Manual relevance labels for rows 0-4, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    0,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [29]:
# Batch 2/10 of the version-1 PDF sample: print the paths of rows 5-9 for manual review.
i = 5
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/Formular_Berufstaetigkeit_Employment_Verification_Form.pdf
data/scraped_data/Formular_Berufstaetigkeit_der_Eltern_Employment_Verification_Form_for_parents.pdf
data/scraped_data/Infografik_Aufbau_BEd_Lehramt_Gymnasium.pdf
data/scraped_data/MK_BEd_Informatik_2023_24.pdf
data/scraped_data/MK_BSc_Wifo_2023_24.pdf


In [30]:
# Manual relevance labels for rows 5-9, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [31]:
# Batch 3/10 of the version-1 PDF sample: print the paths of rows 10-14 for manual review.
i = 10
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/MK_LAG_Mathe_2019_20.pdf
data/scraped_data/MK_MEd_Erweiterungsfach_Mathematik_2023_24.pdf
data/scraped_data/MK_MEd_Informatik_2019_20.pdf
data/scraped_data/Merkblatt_Sprachnachweise_Lehramt_Gymnasium.pdf
data/scraped_data/PO_82_276_H_2008.pdf


In [32]:
# Manual relevance labels for rows 10-14, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [33]:
# Batch 4/10 of the version-1 PDF sample: print the paths of rows 15-19 for manual review.
i = 15
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/PO_88_105_H_2021.pdf
data/scraped_data/PO_88_278_H_2017.pdf
data/scraped_data/PO_88_278_H_2020.pdf
data/scraped_data/PO_MA_PolSci_Soc_2023_en.pdf
data/scraped_data/PO_MA_Sowi_PolSci_Soc_2017_en.pdf


In [34]:
# Manual relevance labels for rows 15-19, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [35]:
# Batch 5/10 of the version-1 PDF sample: print the paths of rows 20-24 for manual review.
i = 20
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/PO_MA_Sowi_PolSci_Soc_2017_neu_2019_en.pdf
data/scraped_data/PO_MSc_MMDS_2017_2Satzung_EN.pdf
data/scraped_data/Studium_Studienangebot_B.Ed._Deutsch_Stundenplan_EN.pdf
data/scraped_data/Stundenplan_LAG_HWS22_Sem05_27092022.pdf
data/scraped_data/Stundenplan_Wima_FSS23_Wahlpflichtveranstaltungen.pdf


In [36]:
# Manual relevance labels for rows 20-24, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    1,
    0,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [37]:
# Batch 6/10 of the version-1 PDF sample: print the paths of rows 25-29 for manual review.
i = 25
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/Stundenplan_Wima_HWS23_Wahlpflichtveranstaltungen.pdf
data/scraped_data/ZulImmaO_2019_en.pdf
data/scraped_data/fertig_Stundenplan_Wima_HWS23_Sem01.pdf
data/scraped_data/info_bsc.pdf
data/scraped_data/satzung_ba_cells.pdf


In [38]:
# Manual relevance labels for rows 25-29, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [39]:
# Batch 7/10 of the version-1 PDF sample: print the paths of rows 30-34 for manual review.
i = 30
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/satzung_ba_germanistik.pdf
data/scraped_data/satzung_ba_kuwi_anglistik.pdf
data/scraped_data/satzung_ba_kuwi_philosophie.pdf
data/scraped_data/satzung_ba_vwl.pdf
data/scraped_data/satzung_ba_wifo.pdf


In [40]:
# Manual relevance labels for rows 30-34, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [41]:
# Batch 8/10 of the version-1 PDF sample: print the paths of rows 35-39 for manual review.
i = 35
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/satzung_ba_wima.pdf
data/scraped_data/satzung_bed_lagpolitik.pdf
data/scraped_data/satzung_bed_lagspanisch.pdf
data/scraped_data/satzung_ma_accounting.pdf
data/scraped_data/satzung_ma_geschichte.pdf


In [42]:
# Manual relevance labels for rows 35-39, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [43]:
# Batch 9/10 of the version-1 PDF sample: print the paths of rows 40-44 for manual review.
i = 40
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/satzung_ma_kuwi_mkw.pdf
data/scraped_data/satzung_ma_llm.pdf
data/scraped_data/satzung_ma_mcbl_en.pdf
data/scraped_data/satzung_ma_mkw.pdf
data/scraped_data/satzung_ma_psychologie.pdf


In [44]:
# Manual relevance labels for rows 40-44, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [45]:
# Batch 10/10 of the version-1 PDF sample: print the paths of rows 45-49 for manual review.
i = 45
for doc in v1_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v1 + doc)

data/scraped_data/satzung_ma_soziologie_en.pdf
data/scraped_data/satzung_ma_sprachekommunikation.pdf
data/scraped_data/satzung_ma_taxation.pdf
data/scraped_data/satzung_ma_wifo_en.pdf
data/scraped_data/unbedenklichkeitsbescheinigung_en.pdf


In [46]:
# Manual relevance labels for rows 45-49, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v1_pdfs_sample.iloc[i: i+5, 1] = relevance

In [47]:
# Tally of assigned labels for the version-1 PDF sample (value_counts skips rows still unlabeled by default).
v1_pdfs_sample["relevance"].value_counts()

relevance
1    28
0    22
Name: count, dtype: int64

#### HTMLs gathered by scraper version 2

In [48]:
# Batch 1/10 of the version-2 HTML sample: print the paths of rows 0-4 so the files can be opened and judged by hand.
i = 0
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_datascience_details_mcds-jahresrueckblick-2022.html
data/scraped_data_v2/www.uni-mannheim.de_dws_news-archiv_dws-area-data-analytics_?tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5BcurrentPage%5D=1&cHash=c4035cdb3949ace654999a0ffa7604b2.html
data/scraped_data_v2/www.uni-mannheim.de_dws_news-archiv_dws-area-web-data-mining-research.html
data/scraped_data_v2/www.uni-mannheim.de_dws_news-archiv_dws-area-web-data-mining-research_?tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5BcurrentPage%5D=4&cHash=2d6383bf34d1b5d92d802650e9145edc.html
data/scraped_data_v2/www.uni-mannheim.de_dws_news-archiv_dws-research_?tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5BcurrentPage%5D=3&cHash=04f7ec72201e448fdc61bcd0194086cc.html


In [49]:
# Manual relevance labels for rows 0-4, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [50]:
# Batch 2/10 of the version-2 HTML sample: print the paths of rows 5-9 for manual review.
i = 5
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_dws_news-archiv_dws-research_?tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5BcurrentPage%5D=4&cHash=297eeacc5ecd1ce315c65b20e6d76a9f.html
data/scraped_data_v2/www.uni-mannheim.de_dws_news-archiv_dws-research_?tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5BcurrentPage%5D=7&cHash=a4721ce21426b56ad94be65a5fa5baff.html
data/scraped_data_v2/www.uni-mannheim.de_dws_news_three-papers-accepted-for-caise-2021.html
data/scraped_data_v2/www.uni-mannheim.de_dws_people_administration_bianca-lermer.html
data/scraped_data_v2/www.uni-mannheim.de_dws_people_alumni_dr-timo-sztyler.html


In [51]:
# Manual relevance labels for rows 5-9, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
# A trailing "# ?" presumably marks a label the annotator was unsure about — TODO confirm.
relevance = [
    0,
    0,
    0,
    1, # ?
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [52]:
# Batch 3/10 of the version-2 HTML sample: print the paths of rows 10-14 for manual review.
i = 10
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_dws_people_alumni_dr-yaser-oulabi.html
data/scraped_data_v2/www.uni-mannheim.de_dws_people_professors_prof-dr-ing-margret-keuper.html
data/scraped_data_v2/www.uni-mannheim.de_dws_people_researchers_phd-students_alexander-brinkmann.html
data/scraped_data_v2/www.uni-mannheim.de_dws_people_researchers_phd-students_robert-litschko.html
data/scraped_data_v2/www.uni-mannheim.de_dws_research_projects_rapidminer-linked-open-data-extension_example-predicting-the-fuel-consumption-of-cars.html


In [53]:
# Manual relevance labels for rows 10-14, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
# A trailing "# ?" presumably marks a label the annotator was unsure about — TODO confirm.
relevance = [
    0,
    1, # ?
    1, # ?
    1, # ?
    1  # ?
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [54]:
# Batch 4/10 of the version-2 HTML sample: print the paths of rows 15-19 for manual review.
i = 15
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-master-candidates_course-archive_fss-2022_ie-678-deep-learning.html
data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-master-candidates_cs-560-large-scale-data-management.html
data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-master-candidates_cs-647-image-processing.html
data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-master-candidates_ie-670-web-data-integration.html
data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-master-candidates_ie-678-deep-learning.html


In [55]:
# Manual relevance labels for rows 15-19, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [56]:
# Batch 5/10 of the version-2 HTML sample: print the paths of rows 20-24 for manual review.
i = 20
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-master-candidates_ie-694-industrial-applications-of-artificial-intelligence.html
data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_course-details_courses-for-phd-candidates_computational-text-analysis.html
data/scraped_data_v2/www.uni-mannheim.de_dws_teaching_thesis-guidelines.html
data/scraped_data_v2/www.uni-mannheim.de_en_academics_during-your-studies_examinations_examination-regulations_examination-regulations-for-bachelors-pograms.html
data/scraped_data_v2/www.uni-mannheim.de_en_academics_during-your-studies_student-services_express-service_order-form.html


In [57]:
# Manual relevance labels for rows 20-24, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [58]:
# Batch 6/10 of the version-2 HTML sample: print the paths of rows 25-29 for manual review.
i = 25
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_en_research_doctorate_doctoral-studies-in-mannheim_regulations-and-procedures-governing-the-doctoral-dissertation.html
data/scraped_data_v2/www.uni-mannheim.de_engageeu.html
data/scraped_data_v2/www.uni-mannheim.de_finanzierung.html
data/scraped_data_v2/www.uni-mannheim.de_ines_forschung_publikationen.html
data/scraped_data_v2/www.uni-mannheim.de_ines_lehre_bachelor-und-masterarbeiten.html


In [59]:
# Manual relevance labels for rows 25-29, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
# A trailing "# ?" presumably marks a label the annotator was unsure about — TODO confirm.
relevance = [
    1,
    1,
    1,
    0, # ?
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [60]:
# Batch 7/10 of the version-2 HTML sample: print the paths of rows 30-34 for manual review.
i = 30
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_ines_news_?tx_news_pi1%5Baction%5D=detail&tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5Bnews%5D=13702&cHash=233d401c69cdc185727c317bd2e38a59.html
data/scraped_data_v2/www.uni-mannheim.de_ines_news_?tx_news_pi1%5Baction%5D=detail&tx_news_pi1%5Bcontroller%5D=News&tx_news_pi1%5Bnews%5D=18162&cHash=ae02051ea704d4ad058e28f07d0836fe.html
data/scraped_data_v2/www.uni-mannheim.de_lwda-2018_program_keynotes.html
data/scraped_data_v2/www.uni-mannheim.de_nachhaltigkeit_aktivitaeten_campusbetrieb_energiemanagement.html
data/scraped_data_v2/www.uni-mannheim.de_news_new-project-on-early-stage-diabetes-detection.html


In [61]:
# Manual relevance labels for rows 30-34, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [62]:
# Batch 8/10 of the version-2 HTML sample: print the paths of rows 35-39 for manual review.
i = 35
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_news_paper-accepted-for-aamas-2024.html
data/scraped_data_v2/www.uni-mannheim.de_newsroom_newsletter.html
data/scraped_data_v2/www.uni-mannheim.de_studium_beratung-und-service_beratungswegweiser_?tx_umaguidance_umaguidance%5Bcontroller%5D=UmaGuidance&tx_umaguidance_umaguidance%5BstudentStatus%5D=interested&cHash=61067b8c514ddc311aaaa038f3a6588a.html
data/scraped_data_v2/www.uni-mannheim.de_studium_beratung-und-service_uni-scouts_mannheim-master-in-management-mmm.html
data/scraped_data_v2/www.uni-mannheim.de_studium_beratung-und-service_uni-scouts_unternehmensjurist-in-kombinationsstudiengang-llb-staatsexamen.html


In [63]:
# Manual relevance labels for rows 35-39, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    1,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [64]:
# Batch 9/10 of the version-2 HTML sample: print the paths of rows 40-44 for manual review.
i = 40
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.uni-mannheim.de_studium_beratung-und-service_uni-scouts_wirtschaftsinformatik-bachelor.html
data/scraped_data_v2/www.uni-mannheim.de_studium_im-studium_studienbueros_express-service_bestellformular.html
data/scraped_data_v2/www.uni-mannheim.de_studium_im-studium_studienorganisation_formalitaeten.html
data/scraped_data_v2/www.uni-mannheim.de_studium_termine_semesterzeiten.html
data/scraped_data_v2/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_bsc-business-informatics.html


In [65]:
# Manual relevance labels for rows 40-44, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    1,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [66]:
# Batch 10/10 of the version-2 HTML sample: print the paths of rows 45-49 for manual review.
i = 45
for doc in v2_htmls_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/www.wim.uni-mannheim.de_en_academics_organizing-your-studies_mannheim-master-in-data-science_learning-agreements.html
data/scraped_data_v2/www.wim.uni-mannheim.de_en_international_information-for-outgoing-students_course-selection-and-recognition.html
data/scraped_data_v2/www.wim.uni-mannheim.de_en_news_career-fair-mint-marketplace-2023.html
data/scraped_data_v2/www.wim.uni-mannheim.de_internationales.html
data/scraped_data_v2/www.wim.uni-mannheim.de_studium_studienorganisation_mannheim-master-in-data-science.html


In [67]:
# Manual relevance labels for rows 45-49, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    1,
    1,
    0,
    1,
    1
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_htmls_sample.iloc[i: i+5, 1] = relevance

In [68]:
# Tally of assigned labels for the version-2 HTML sample (value_counts skips rows still unlabeled by default).
v2_htmls_sample["relevance"].value_counts()

relevance
1    29
0    21
Name: count, dtype: int64

#### PDFs gathered by scraper version 2

In [69]:
# Batch 1/10 of the version-2 PDF sample: print the paths of rows 0-4 so the files can be opened and judged by hand.
i = 0
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/08_31_Ringvorlesung_Data_Science_in_Action_HWS23.pdf
data/scraped_data_v2/160407_Praesentation_Wima_Master.pdf
data/scraped_data_v2/18-PerCom.pdf
data/scraped_data_v2/201800608_NO_EKiZi_.pdf
data/scraped_data_v2/2020.emnlp-main.4.pdf


In [70]:
# Manual relevance labels for rows 0-4, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
# A trailing "# ?" presumably marks a label the annotator was unsure about — TODO confirm.
relevance = [
    1,
    1, # ?
    0,
    1, # ?
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [71]:
# Batch 2/10 of the version-2 PDF sample: print the paths of rows 5-9 for manual review.
i = 5
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/2024_02_mzes_hiwi_gutfleisch.pdf
data/scraped_data_v2/3586163.pdf
data/scraped_data_v2/6_Laakso_Researching_Research.pdf
data/scraped_data_v2/DB_Exercise_12_Application_Development.pdf
data/scraped_data_v2/DM07-Parameter-Tuning-V1.pdf


In [72]:
# Manual relevance labels for rows 5-9, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [73]:
# Batch 3/10 of the version-2 PDF sample: print the paths of rows 10-14 for manual review.
i = 10
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/DM07-StudentProjects-FSS2021-Presentations.pdf
data/scraped_data_v2/ENGAGE_Formblatt_Hilfskraftmittel.pdf
data/scraped_data_v2/Evaluation_Web_Data_Integration_Bizer_HWS2017.pdf
data/scraped_data_v2/Flyer_Erstanlaufstelle_englisch_23_03_2021.pdf
data/scraped_data_v2/KG02-RDF-v1.pdf


In [74]:
# Manual relevance labels for rows 10-14, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
# A trailing "# ?" presumably marks a label the annotator was unsure about — TODO confirm.
relevance = [
    0, # ?
    0,
    0,
    1, # ?
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [75]:
# Batch 4/10 of the version-2 PDF sample: print the paths of rows 15-19 for manual review.
i = 15
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/KG03-RDFS-v1.pdf
data/scraped_data_v2/KG07-Labeled-Property-Graphs-v2.pdf
data/scraped_data_v2/Learning_Agreement_Antragsteller_Bsc_Msc_Wifo___MMDS_Deckblatt.pdf
data/scraped_data_v2/LectureEvaluation_WebDataIntegration_BizerLehmberg_HWS2015.pdf
data/scraped_data_v2/LectureEvaluation_WebDataIntegration_BizerPaulheimBryl_HWS2013.pdf


In [76]:
# Manual relevance labels for rows 15-19, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    1,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [77]:
# Batch 5/10 of the version-2 PDF sample: print the paths of rows 20-24 for manual review.
i = 20
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/Matching_patterns.pdf
data/scraped_data_v2/MinIE%20Minimizing%20Facts%20in%20Open%20Information%20Extraction.pdf
data/scraped_data_v2/Mobilitaetsangebote_fuer_Studierende_AStA.pdf
data/scraped_data_v2/Nachhaltigkeit_Lehre_PM-23137.pdf
data/scraped_data_v2/OA_Exploring%20Semi-Supervised_sensors-18-02639.pdf


In [78]:
# Manual relevance labels for rows 20-24, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
# A trailing "# ?" presumably marks a label the annotator was unsure about — TODO confirm.
relevance = [
    0,
    0,
    0, # ?
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [79]:
# Batch 6/10 of the version-2 PDF sample: print the paths of rows 25-29 for manual review.
i = 25
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/PO_Kombinationsstudiengang_Rechtswissenschaft_JuSPO_2011_4Satzung.pdf
data/scraped_data_v2/PO_MA_MKW_2014_neu.pdf
data/scraped_data_v2/Schlicht11mapresolve.pdf
data/scraped_data_v2/Stellenausschreibung_DHBW_CyberSecurity_Dez23.pdf
data/scraped_data_v2/StuO_Gap_Year_2019.pdf


In [80]:
# Manual relevance labels for rows 25-29, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [81]:
# Batch 7/10 of the version-2 PDF sample: print the paths of rows 30-34 for manual review.
i = 30
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/Stuckenschmidt07appproximate.pdf
data/scraped_data_v2/VL_Semantic_Web_Technologies_Paulheim_HWS2016_correction.pdf
data/scraped_data_v2/ba_bf_romanistik_spanisch_2012.pdf
data/scraped_data_v2/beedkar16desq.pdf
data/scraped_data_v2/bigtalk.pdf


In [82]:
# Manual relevance labels for rows 30-34, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [83]:
# Batch 8/10 of the version-2 PDF sample: print the paths of rows 35-39 for manual review.
i = 35
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/dissertation_portisch.pdf
data/scraped_data_v2/gashteovski20-align.pdf
data/scraped_data_v2/gemulla07multisetsampling.pdf
data/scraped_data_v2/gemulla11dsgd.pdf
data/scraped_data_v2/l7_3_5_001.pdf


In [84]:
# Manual relevance labels for rows 35-39, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [85]:
# Batch 9/10 of the version-2 PDF sample: print the paths of rows 40-44 for manual review.
i = 40
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/makari13-gbm-slides.pdf
data/scraped_data_v2/o_142.pdf
data/scraped_data_v2/o_169.pdf
data/scraped_data_v2/om2011_proceedings.pdf
data/scraped_data_v2/p101-diete.pdf


In [86]:
# Manual relevance labels for rows 40-44, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [87]:
# Batch 10/10 of the version-2 PDF sample: print the paths of rows 45-49 for manual review.
i = 45
for doc in v2_pdfs_sample.iloc[i: i+5, 0]:
    print(path_base_v2 + doc)

data/scraped_data_v2/s10579-022-09581-9.pdf
data/scraped_data_v2/satzung_ba_kuwi_philosophie.pdf
data/scraped_data_v2/satzung_ma_mcl.pdf
data/scraped_data_v2/semantic-browser.pdf
data/scraped_data_v2/semstats_2013.pdf


In [88]:
# Manual relevance labels for rows 45-49, in the order printed by the previous cell (1 = relevant, 0 = not relevant).
relevance = [
    0,
    0,
    0,
    0,
    0
]

# Write the labels into column 1 ("relevance") of those rows; uses `i` set in the previous cell.
v2_pdfs_sample.iloc[i: i+5, 1] = relevance

In [89]:
# Tally of assigned labels for the version-2 PDF sample (value_counts skips rows still unlabeled by default).
v2_pdfs_sample["relevance"].value_counts()

relevance
0    45
1     5
Name: count, dtype: int64

### Save data

In [90]:
# Persist the labeled samples once; labels saved by an earlier run are never overwritten.
# makedirs(exist_ok=True) creates the directory on the first run and is a no-op
# afterwards. The previous try/except left `saved_files` unbound when the
# directory was missing, so the first run ended in a NameError and saved nothing.
os.makedirs("data/content_relevance_data", exist_ok=True)
saved_files = os.listdir("data/content_relevance_data")
if len(saved_files) == 0:
    v1_htmls_sample.to_csv("data/content_relevance_data/v1_htmls_sample.csv", index=False)
    v1_pdfs_sample.to_csv("data/content_relevance_data/v1_pdfs_sample.csv", index=False)
    v2_htmls_sample.to_csv("data/content_relevance_data/v2_htmls_sample.csv", index=False)
    v2_pdfs_sample.to_csv("data/content_relevance_data/v2_pdfs_sample.csv", index=False)

### Estimate proportion of relevant / non-relevant documents

In [91]:
# Point estimates: the share of sampled documents labeled 1, per sample.
p_hat_v1_htmls, p_hat_v1_pdfs, p_hat_v2_htmls, p_hat_v2_pdfs = (
    labeled["relevance"].mean()
    for labeled in (v1_htmls_sample, v1_pdfs_sample, v2_htmls_sample, v2_pdfs_sample)
)

# One line per sample, same text and order as four separate print calls.
print(
    f"Mean content relevance (HTMLs, scraper version 1): {p_hat_v1_htmls}",
    f"Mean content relevance (PDFs, scraper version 1): {p_hat_v1_pdfs}",
    f"Mean content relevance (HTMLs, scraper version 2): {p_hat_v2_htmls}",
    f"Mean content relevance (PDFs, scraper version 2): {p_hat_v2_pdfs}",
    sep="\n",
)

Mean content relevance (HTMLs, scraper version 1): 1.0
Mean content relevance (PDFs, scraper version 1): 0.56
Mean content relevance (HTMLs, scraper version 2): 0.58
Mean content relevance (PDFs, scraper version 2): 0.1


In [92]:
# 95% confidence intervals (alpha = 0.05) for the share of relevant documents,
# using the normal approximation. Note that this interval has zero width when
# every sampled label is identical (as for the version-1 HTML sample).
def _normal_approx_ci(labeled):
    """Return the (lower, upper) interval at alpha = 0.05 for ``labeled["relevance"]``."""
    return proportion_confint(count=labeled["relevance"].sum(), nobs=len(labeled), alpha=0.05)


ci_v1_htmls = _normal_approx_ci(v1_htmls_sample)
ci_v1_pdfs = _normal_approx_ci(v1_pdfs_sample)
ci_v2_htmls = _normal_approx_ci(v2_htmls_sample)
ci_v2_pdfs = _normal_approx_ci(v2_pdfs_sample)

print("All confidence intervals below are calculated using normal approximation.\n")
print(
    f"CI with alpha=0.05 (HTMLs, scraper version 1): {ci_v1_htmls}",
    f"CI with alpha=0.05 (PDFs, scraper version 1): {ci_v1_pdfs}",
    f"CI with alpha=0.05 (HTMLs, scraper version 2): {ci_v2_htmls}",
    f"CI with alpha=0.05 (PDFs, scraper version 2): {ci_v2_pdfs}",
    sep="\n",
)

All confidence intervals below are calculated using normal approximation.

CI with alpha=0.05 (HTMLs, scraper version 1): (1.0, 1.0)
CI with alpha=0.05 (PDFs, scraper version 1): (0.4224110866807189, 0.6975889133192812)
CI with alpha=0.05 (HTMLs, scraper version 2): (0.44319507547452175, 0.7168049245254782)
CI with alpha=0.05 (PDFs, scraper version 2): (0.01684577053901931, 0.1831542294609807)


In [93]:
def get_ci(k, n, N, conf_level=0.95):
    """
    Use this function to calculate the confidence interval of a proportion for finite
    populations with normal approximation and finite population correction.

    The interval is p +/- z * sqrt(p * (1 - p) / n) * sqrt((N - n) / (N - 1)) with p = k / n.

    :param k: number of successes in the sample (0 <= k <= n)
    :param n: sample size (1 <= n <= N)
    :param N: population size
    :param conf_level: two-sided confidence level, strictly between 0 and 1
    :return: tuple (lower bound, upper bound); the bounds are not clipped to [0, 1]
    :raises ValueError: if conf_level is outside (0, 1) or k, n, N are inconsistent
    """

    if not 0 < conf_level < 1:
        raise ValueError(f"conf_level must be strictly between 0 and 1, got {conf_level}")
    if n <= 0 or n > N:
        # n > N would make the correction term the square root of a negative number
        raise ValueError(f"sample size n must satisfy 1 <= n <= N, got n={n}, N={N}")
    if k < 0 or k > n:
        # k outside [0, n] would make p * (1 - p) negative
        raise ValueError(f"number of successes k must satisfy 0 <= k <= n, got k={k}, n={n}")

    # Rounded textbook quantiles are kept for the three originally supported levels so
    # that intervals computed with them stay exactly the same as before.
    z_table = {0.9: 1.645, 0.95: 1.96, 0.98: 2.326}
    z = z_table.get(conf_level)
    if z is None:
        # any other level: two-sided standard normal quantile
        from statistics import NormalDist
        z = NormalDist().inv_cdf(0.5 + conf_level / 2)

    p = k / n
    std_err = ((p * (1 - p)) / n) ** 0.5
    # N == 1 implies n == 1 (a full census): no sampling uncertainty, and (N - 1) would be 0
    fin_pop_corr = ((N - n) / (N - 1)) ** 0.5 if N > 1 else 0.0
    margin = z * std_err * fin_pop_corr
    return p - margin, p + margin

In [94]:
# Confidence intervals using normal approximation with finite population correction,
# confidence level 0.95 (i.e. alpha = 0.05).
# Each sample is paired with its population: the list of all HTML / PDF file names
# gathered by the respective scraper version, whose length is passed as N.
ci_fpc_v1_htmls, ci_fpc_v1_pdfs, ci_fpc_v2_htmls, ci_fpc_v2_pdfs = (
    get_ci(k=sample["relevance"].sum(), n=len(sample), N=len(population), conf_level=0.95)
    for sample, population in (
        (v1_htmls_sample, v1_htmls),
        (v1_pdfs_sample, v1_pdfs),
        (v2_htmls_sample, v2_htmls),
        (v2_pdfs_sample, v2_pdfs),
    )
)

print("All confidence intervals below are calculated using normal approximation with finite population correction.\n")
print(
    f"CI with alpha=0.05 (HTMLs, scraper version 1): {ci_fpc_v1_htmls}",
    f"CI with alpha=0.05 (PDFs, scraper version 1): {ci_fpc_v1_pdfs}",
    f"CI with alpha=0.05 (HTMLs, scraper version 2): {ci_fpc_v2_htmls}",
    f"CI with alpha=0.05 (PDFs, scraper version 2): {ci_fpc_v2_pdfs}",
    sep="\n",
)

All confidence intervals below are calculated using normal approximation with finite population correction.

CI with alpha=0.05 (HTMLs, scraper version 1): (1.0, 1.0)
CI with alpha=0.05 (PDFs, scraper version 1): (0.44539780944501983, 0.6746021905549803)
CI with alpha=0.05 (HTMLs, scraper version 2): (0.44831356769896935, 0.7116864323010306)
CI with alpha=0.05 (PDFs, scraper version 2): (0.01894544946190381, 0.18105455053809622)
