In [1]:
import pandas as pd
import re


# 01 verify every quote link is in csv for download


This verification was done in the file `03_verify_index_links.ipynb` and the results show that _all the links in the index are in the csv_ file for download.


# 02 Verify every link has been downloaded and every file exists


The files with the links are `acm.csv`, `missionary.csv` and `stdhndbk.csv`.


In [2]:
# Load the three index CSVs (acm, missionary, stdhndbk) and stack them into a single dataframe
acm_site, missionary, stdhndbk = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/data_09_12_24/index/acm.csv",
        "../data/data_09_12_24/index/missionary.csv",
        "../data/data_09_12_24/index/stdhndbk.csv",
    )
)

# Concatenate with a fresh 0..N-1 index, then order the rows by title
df_exist = pd.concat([acm_site, missionary, stdhndbk], ignore_index=True).sort_values(
    by="Title"
)

df_exist.head(10)


Unnamed: 0,Section,Subsection,Title,URL
577,1. Background and Foundation,,"1.1 The History of PathwayConnect, Online Lear...",https://www.byupathway.edu/policies/handbook/1...
578,1. Background and Foundation,,1.2 Institutes of Religion & PEF/ Self-Relianc...,https://www.byupathway.edu/policies/handbook/1...
579,1. Background and Foundation,,1.3 Program Objectives,https://www.byupathway.edu/policies/handbook/1...
580,1. Background and Foundation,,1.4 Program Structure & Requirements,https://www.byupathway.edu/policies/handbook/1...
622,10. After PathwayConnect,,10. After PathwayConnect,https://www.byupathway.edu/policies/handbook/1...
623,10. After PathwayConnect,,10.1 Online Certificates and Degrees,https://www.byupathway.edu/policies/handbook/1...
624,10. After PathwayConnect,,10.2 Local Options,https://www.byupathway.edu/policies/handbook/1...
625,11. Communication and Marketing,,11.1 BYU-Pathway Worldwide Website,https://www.byupathway.edu/policies/handbook/1...
626,11. Communication and Marketing,,11.2 PATH,https://www.byupathway.edu/policies/handbook/1...
627,11. Communication and Marketing,,11.3 Email,https://www.byupathway.edu/policies/handbook/1...


In [3]:
def make_title(row):
    """Build the local file base name for an index row from its "Title".

    Spaces become hyphens, then every character that is not an ASCII letter
    or a hyphen is removed. Digits and punctuation are therefore dropped, so
    titles that differ only in those characters map to the same name.

    Args:
        row: Mapping-like object (e.g. a dataframe row) with a "Title" string.

    Returns:
        The sanitized name, without any file extension.
    """
    hyphenated = row["Title"].replace(" ", "-")
    return re.sub(r"[^a-zA-Z-]", "", hyphenated)


In [4]:
# Add a "Local Source Name" column derived from the title: spaces become "-"
# and every character other than letters and "-" is removed (see make_title).
# No file extension is appended.
local_source_names = df_exist.apply(make_title, axis=1)
df_exist["Local Source Name"] = local_source_names

# Keep only the first row for each Title (the dataframe is modified in place)
df_exist.drop_duplicates(subset=["Title"], keep="first", inplace=True)

# Number of rows left in the table
print("Files to be processed:", len(df_exist))


Files to be processed: 604


In [5]:
# List what the crawl stored in "data/data_09_12_24/crawl/html" and "data/data_09_12_24/crawl/pdf"
import os

html_files = os.listdir("../data/data_09_12_24/crawl/html")
pdf_files = os.listdir("../data/data_09_12_24/crawl/pdf")

# Combined listing used by the existence checks
all_files = [*html_files, *pdf_files]

html_count, pdf_count = len(html_files), len(pdf_files)
print("HTML files:", html_count)
print("PDF files:", pdf_count)
print("\n")
print("TOtal files:", html_count + pdf_count, "plus 1 files as image.")


HTML files: 280
PDF files: 322


TOtal files: 602 plus 1 files as image.


In [6]:
# Collapse the combined listing into a set of unique file names
all_files = {file_name for file_name in all_files}

len(all_files)


602

In [7]:
def file_exist(name, files=None):
    """Report whether a downloaded file's base name equals ``name``.

    The base name is everything before the first "." of the file name, so
    "Some-Title.html" and "Some-Title.pdf" both match "Some-Title".

    Args:
        name: Base name to look for (e.g. a "Local Source Name" value).
        files: Optional iterable of file names to search. When omitted, the
            notebook-level ``all_files`` collection is used, which keeps the
            original single-argument behaviour.

    Returns:
        True if at least one file matches, otherwise False.
    """
    # Fall back to the notebook global only when no explicit listing is given,
    # so an empty listing passed by the caller is still honoured.
    candidates = all_files if files is None else files
    return any(name == candidate.split(".")[0] for candidate in candidates)


In [8]:
# Flag each index entry with whether a downloaded file matches its "Local Source Name"
df_exist["File Exists"] = df_exist["Local Source Name"].map(file_exist)

# Display only the entries that have no matching file
missing_mask = ~df_exist["File Exists"]
df_exist.loc[missing_mask]


Unnamed: 0,Section,Subsection,Title,URL,Local Source Name,File Exists
191,Certificates & Degrees,Application Process,How do students access the BYUI application?,https://degreeapplication.byupathway.edu/,How-do-students-access-the-BYUI-application,False
434,Shepherding / Ministering,,Resources for New and Continuing Students,https://missionaries.prod.byu-pathway.psdops.c...,Resources-for-New-and-Continuing-Students,False


In [9]:
# Show rows whose "Local Source Name" repeats an earlier row (the first occurrence is not listed)
repeated_name = df_exist["Local Source Name"].duplicated()
df_exist.loc[repeated_name]


Unnamed: 0,Section,Subsection,Title,URL,Local Source Name,File Exists
388,PathwayConnect (PC),PathwayConnect General Information,Learn About PC 102,https://missionaries.prod.byu-pathway.psdops.c...,Learn-About-PC-,True
389,PathwayConnect (PC),PathwayConnect General Information,Learn About PC 103,https://missionaries.prod.byu-pathway.psdops.c...,Learn-About-PC-,True
401,Policies,,Policy-PATH Naming Convention for Groups,https://missionaries.prod.byu-pathway.psdops.c...,Policy-PATH-Naming-Convention-for-Groups,True


In [10]:
# Count the index entries whose file was found on disk
found_rows = df_exist.loc[df_exist["File Exists"]]
print("Existing files:", len(found_rows))


Existing files: 602


In [11]:
# Collect, as an array, the local names of the index entries with no file on disk
not_found = ~df_exist["File Exists"]
files_not_exist = df_exist.loc[not_found, "Local Source Name"].values


print(len(files_not_exist), "files do not exist")


2 files do not exist


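132 files in the list don't exist in the download folder. (Note: the cell output above reports 2 missing files, so this count appears to come from an earlier run and should be re-verified.)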


# 03 Verify if every file in quotes has been downloaded


In [12]:
filename = "../data/temporary/Index_quotes.csv"

# header=1: the column names are read from the second line of the file
df = pd.read_csv(filename, header=1)
# Label the rows 1..N instead of 0..N-1
df.index = range(1, len(df) + 1)

# Keep only the questions that have a main link, and discard the quote columns
# ("Quotes", "Quote 2" and "Quote 3")
df = df.dropna(subset=["Link to Ideal Answer"]).drop(
    columns=["Quotes", "Quote 2", "Quote 3"]
)

# Rows whose main link cell does not contain "http" hold no usable link
has_http = df["Link to Ideal Answer"].str.contains("http")
df_no_links = df[has_http == False]

# Remove those rows from the working frame
df = df.drop(index=df_no_links.index)
df.head()


Unnamed: 0,Questions,Ideal Answer,Link to Ideal Answer,Link 2,Link 3
2,How do I know if a student has a scholarship?,Missionaries can see if a student has a schola...,https://missionaries.prod.byu-pathway.psdops.c...,,
3,How do I know if a student is registered for a...,There is not a way for Missionaries to verify ...,https://missionaries.prod.byu-pathway.psdops.c...,,
5,How do I know if student is member of the church?,Missionaries can see the Church membership sta...,https://missionaries.prod.byu-pathway.psdops.c...,,
6,What information should I track for each student?,The most important things Missionaries should ...,https://missionaries.prod.byu-pathway.psdops.c...,,
7,What to do If a student has already taken this...,For a student that is in a course that they ha...,https://pathway-missionary.powerappsportals.co...,https://missionaries.prod.byu-pathway.psdops.c...,


In [13]:
# Clean a cell value and extract the links it contains
def extract_links(text):
    """Return the links found in ``text``, one per line.

    The text is split on newlines; each line is stripped of surrounding
    whitespace and kept when it starts with "http://" or "https://".

    Args:
        text: Cell content holding zero or more links separated by newlines,
            or a missing value (NaN / None).

    Returns:
        A list of link strings in their original order; empty when the value
        is missing or no line is a link.
    """
    # A missing value carries no links
    if pd.isnull(text):
        return []

    # Accept both schemes: rows are selected upstream on the substring "http",
    # so a plain http:// link must not be silently dropped here.
    stripped_lines = (line.strip() for line in text.split("\n"))
    return [line for line in stripped_lines if line.startswith(("http://", "https://"))]


# Gather the links of "Link to Ideal Answer", "Link 2" and "Link 3" into a single field
def merge_links(row):
    """Concatenate the row's three link fields, in column order.

    Args:
        row: Mapping-like object whose "Link to Ideal Answer", "Link 2" and
            "Link 3" values support ``+`` (lists of links in this notebook).

    Returns:
        The three values joined with ``+``.
    """
    first, second, third = (
        row[column] for column in ("Link to Ideal Answer", "Link 2", "Link 3")
    )
    return first + second + third


# A question can have several links, so each link gets its own [question, link] pair
def split_links(row):
    """Pair the row's question with each of its links.

    Args:
        row: Mapping-like object with a "Links" iterable and a "Questions" value.

    Returns:
        A list of ``[question, link]`` lists, one per link, in link order.
    """
    row_links = row["Links"]
    question = row["Questions"]
    return [[question, single_link] for single_link in row_links]


In [14]:
# some of the rows has multiple links, we will convert them to a list
df["Link to Ideal Answer"] = df["Link to Ideal Answer"].apply(extract_links)
df["Link 2"] = df["Link 2"].apply(extract_links)
df["Link 3"] = df["Link 3"].apply(extract_links)

df["Links"] = df.apply(merge_links, axis=1)

list_of_links = df[["Questions", "Links"]]

# Apply the function to the dataframe and convert the result to a dataframe
df_links = list_of_links.apply(split_links, axis=1)
df_links = pd.DataFrame(df_links.sum(), columns=["Questions", "Links"])
df_links.head()


Unnamed: 0,Questions,Links
0,How do I know if a student has a scholarship?,https://missionaries.prod.byu-pathway.psdops.c...
1,How do I know if a student is registered for a...,https://missionaries.prod.byu-pathway.psdops.c...
2,How do I know if student is member of the church?,https://missionaries.prod.byu-pathway.psdops.c...
3,What information should I track for each student?,https://missionaries.prod.byu-pathway.psdops.c...
4,What to do If a student has already taken this...,https://pathway-missionary.powerappsportals.co...


Now, with the links extracted, verify that each of these resources has been downloaded.


In [15]:
# Match each quoted link against the URLs of df_exist; a match only counts
# when that entry's "File Exists" flag is True


def validate_if_exist(row):
    """Tell whether the row's link is an index URL whose file is on disk.

    Args:
        row: Mapping-like object with a "Links" value (a single URL).

    Returns:
        True when at least one row of the notebook-level ``df_exist`` has
        that URL and "File Exists" set to True, otherwise False.
    """
    same_url = df_exist["URL"].isin([row["Links"]])
    downloaded = df_exist["File Exists"] == True
    matches = df_exist[same_url & downloaded]
    return len(matches) > 0


df_links["File Exists"] = df_links.apply(validate_if_exist, axis=1)


In [16]:
# Keep the quoted links that could not be matched to a downloaded file
unmatched = ~df_links["File Exists"]
df_some = df_links.loc[unmatched]

df_some.head()

# To inspect the matched links instead:
# df_links[df_links["File Exists"]]


Unnamed: 0,Questions,Links,File Exists
56,How to get class members to be lead and observ...,https://studentsupportkb.byupathway.org/knowle...,False
105,Is there an exception request link for online ...,https://www.byui.edu/student-records/academic-...,False
107,How do students access the BYUI application?,https://degreeapplication.byupathway.edu/,False


The origin folder is `data/data_09_12_24/index`

About the 3 rows above:

- Row 56: The index file contains 2 rows with the same title but different links. This is probably why we get the error: the file is already downloaded, but there is only 1 file with the title name and 2 different sources associated with the same title.
- Row 105: The index file contains 2 rows with the same title but different links. This is probably why we get the error: the file is already downloaded, but there is only 1 file with the title name and 2 different sources associated with the same title.
- Row 107: Error getting the page.

To solve this, we must modify the download process to give the file a different name if a file with that name already exists. For the last error, the page doesn't exist.


# 04. Validate if Files were transformed to Markdown


In [17]:
import os


In [18]:
# Origin: the crawled files, reduced to the part of each name before the first "."

html_files = [
    file_name.partition(".")[0]
    for file_name in os.listdir("../data/data_09_12_24/crawl/html")
]
pdf_files = [
    file_name.partition(".")[0]
    for file_name in os.listdir("../data/data_09_12_24/crawl/pdf")
]


In [19]:
# Converted output: keep only the ".md" files (this drops the txt files) and
# reduce each name to the part before the first "."
html_files_out = [
    file_name.partition(".")[0]
    for file_name in os.listdir("../data/data_09_12_24/out_sep_12/from_html")
    if file_name.endswith(".md")
]
pdf_files_out = [
    file_name.partition(".")[0]
    for file_name in os.listdir("../data/data_09_12_24/out_sep_12/from_pdf")
    if file_name.endswith(".md")
]


In [20]:
# For every crawled file, check that a markdown file with the same base name exists in
# "out_sep_12/from_html" or "out_sep_12/from_pdf", and list the ones that are missing.
# Names are compared without their extension.
converted_html = set(html_files_out)
converted_pdf = set(pdf_files_out)

dont_exist_html = [
    name for name in html_files if name.split(".")[0] not in converted_html
]
dont_exist_pdf = [
    name for name in pdf_files if name.split(".")[0] not in converted_pdf
]


print("HTML files that do not exist:", len(dont_exist_html))
print("PDF files that do not exist:", len(dont_exist_pdf))


HTML files that do not exist: 0
PDF files that do not exist: 1


In [21]:
# Report how many PDF sources have no markdown output, then list their names
missing_pdf_count = len(dont_exist_pdf)
print("len of dont_exist_pdf:", missing_pdf_count)
print("\n")
print(dont_exist_pdf)


len of dont_exist_pdf: 1


['Policy-Executive-Secretaries--Communication']


# Problems with whatsapp files


In [22]:
# Select the index entries whose URL contains "whatsapp"
is_whatsapp_url = df_exist["URL"].str.contains("whatsapp")
df_links_whatsapp = df_exist.loc[is_whatsapp_url]


In [23]:
# Display the index entries whose URL contains "whatsapp" (the whole filtered frame is shown)
df_links_whatsapp


Unnamed: 0,Section,Subsection,Title,URL,Local Source Name,File Exists
509,WhatsApp,Android,Adding and Removing Group Members on Android,https://faq.whatsapp.com/841426356990637/?cms_...,Adding-and-Removing-Group-Members-on-Android,True
521,WhatsApp,Desktop,Adding and Removing Group Members on a Computer,https://faq.whatsapp.com/841426356990637/?help...,Adding-and-Removing-Group-Members-on-a-Computer,True
530,WhatsApp,iPhone,Adding and Removing Group Members on iPhone,https://faq.whatsapp.com/841426356990637/?cms_...,Adding-and-Removing-Group-Members-on-iPhone,True
503,WhatsApp,,Avoiding WhatsApp cancelling my account for spam?,https://faq.whatsapp.com/361005896189245?helpr...,Avoiding-WhatsApp-cancelling-my-account-for-spam,True
517,WhatsApp,Communication,Can't Send or Receive Messages,https://faq.whatsapp.com/5155925751185676/?hel...,Cant-Send-or-Receive-Messages,True
510,WhatsApp,Android,Downloading WhatsApp on Android,https://faq.whatsapp.com/807139050546238/?help...,Downloading-WhatsApp-on-Android,True
522,WhatsApp,Desktop,Downloading WhatsApp on a Computer,https://faq.whatsapp.com/1513589699119080/?hel...,Downloading-WhatsApp-on-a-Computer,True
531,WhatsApp,iPhone,Downloading WhatsApp on iPhone,https://faq.whatsapp.com/807139050546238/?cms_...,Downloading-WhatsApp-on-iPhone,True
505,WhatsApp,,How do I add students to a WhatsApp group?,https://faq.whatsapp.com/361005896189245?helpr...,How-do-I-add-students-to-a-WhatsApp-group,True
506,WhatsApp,,How do I create a WhatsApp group?,https://faq.whatsapp.com/3242937609289432/?hel...,How-do-I-create-a-WhatsApp-group,True
