In [1]:
from lxml import html
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

%load_ext lab_black

In [2]:
def create_series(x, sub=False, texts=None):
    """Collect the first regex match from every message as a Series.

    Parameters
    ----------
    x : str
        Regular expression handed to ``re.findall``.
    sub : bool, default False
        When truthy, every parenthesised span (non-greedy, parentheses
        included) is removed from a message before it is searched.
    texts : sequence of str, optional
        Messages to search. When omitted, the module-level ``text_list``
        is used, which is what existing callers rely on.

    Returns
    -------
    pandas.Series
        One entry per message, in order: the first element returned by
        ``re.findall`` for that message, or ``NaN`` when nothing matched.
    """
    if texts is None:
        # Backward-compatible default: fall back to the shared global list.
        texts = text_list

    pattern = re.compile(x)
    first_matches = []
    for message in texts:
        # Truthiness test instead of `== False` / `== True`: a flag such as
        # None used to match neither branch and silently gave an empty Series.
        if sub:
            message = re.sub(r"\(.*?\)", "", message)
        found = pattern.findall(message)
        first_matches.append(found[0] if found else np.nan)
    return pd.Series(first_matches)

In [309]:
def create_table(y):
    """Parse one saved chat-export HTML file into a cleaned DataFrame.

    The text of every ``div`` element with class ``text`` is treated as one
    message. Fields are pulled out of each message with regular expressions
    (via ``create_series``) and then normalised: the duration field is split
    on "x" into a frequency and a per-session duration, day keywords are
    mapped to numbers, and boolean flag columns are derived from the
    lower-cased requirements text.

    Depends on module-level names defined in other cells: ``text_list``
    (scratch list that ``create_series`` reads), ``pat`` and ``mapping``.

    Parameters
    ----------
    y : str
        Path of the UTF-8 encoded HTML file to read.

    Returns
    -------
    pandas.DataFrame
        Rows for messages that yielded at least one field and whose day
        field was found, de-duplicated on "Postal_Code" (first occurrence
        kept).
    """
    # Context manager so the file handle is closed even if reading fails;
    # the original left it open.
    with open(y, "r", encoding="utf-8") as html_file:
        source_code = html_file.read()

    soup = BeautifulSoup(source_code, "html.parser")

    # create_series() reads the shared text_list. Replace its contents in
    # place rather than appending: appending made every call re-parse the
    # messages of all previously loaded files, so their rows were repeated
    # in each later table.
    text_list[:] = [
        div.text for div in soup.find_all("div", attrs={"class": "text"})
    ]

    # Raw strings throughout so that "\d" is not an invalid string escape.
    sub_list = create_series(r"(?<=ASSIGNMENT).*?(?= @)", True)
    town_list = create_series(r"(?<=@).*?(?=Details)")
    post_list = create_series(r"\d{6}")
    loc_list = create_series(r"(?<=Location:).*?(?=Duration)")
    dur_list = create_series(r"(?<=Duration:).*?(?:(?=Days)|(?=Timings))")
    day_list = create_series(r"(?:(?<=Days:)|(?<=Timings:)).*?(?=Fees)")
    fee_list = create_series(r"(?<=Fees:).*?(?=Tutor)")
    req_list = create_series(r"(?<=Requirements:).*?(?=Remarks)")
    rem_list = create_series(r"(?<=Remarks:).*?(?=.Interested)")

    # Column labels are positional: keep them in the same order as the
    # series list (note duration comes before location/address).
    data = pd.concat(
        [
            sub_list,
            town_list,
            post_list,
            dur_list,
            loc_list,
            day_list,
            fee_list,
            req_list,
            rem_list,
        ],
        axis=1,
    )
    data.columns = [
        "Subject",
        "Town",
        "Postal_Code",
        "Duration",
        "Address",
        "Day",
        "Fees",
        "Requirements",
        "Remarks",
    ]

    # Messages in which no field at all was found are dropped.
    data = data.dropna(how="all")

    # Split the raw duration text on the first "x" into columns 0 and 1,
    # which become "Frequency" and the new "Duration" below.
    data_dur = data["Duration"].str.split("x", n=1, expand=True)
    data = pd.concat([data, data_dur], axis=1)
    # NOTE(review): select_dtypes keeps only object columns, so a column
    # that is entirely NaN (float dtype) is silently dropped here.
    data = data.select_dtypes(include="object").apply(lambda col: col.str.strip())
    data = data.drop(columns=["Duration"])
    data = data.rename(columns={0: "Frequency", 1: "Duration"})

    data["Duration"] = data["Duration"].str.lower()
    data["minute"] = data["Duration"].str.contains(r"min")
    # Non-capturing group: same matches, without the pandas warning about
    # match groups in str.contains.
    data["hour"] = data["Duration"].str.contains(r"(?:hour|hr)")
    # Leading number including an optional decimal part, so "1.5 hr" is
    # read as 1.5 instead of being truncated to 1. Text that does not start
    # with a digit yields NaN and ends up as 0 below, as before.
    data["Duration"] = data["Duration"].str.extract(
        r"^(\d+(?:\.\d+)?)", expand=False
    )
    data["Frequency"] = data["Frequency"].str.extract(r"(\d*)", expand=False)

    data["Duration"] = data["Duration"].str.strip()
    data["Frequency"] = data["Frequency"].str.strip()

    # Empty or missing numbers become 0 so the float conversion cannot fail.
    data["Frequency"] = data["Frequency"].replace("", 0)
    data["Duration"] = data["Duration"].replace("", 0)
    data["Duration"] = data["Duration"].fillna(0)
    data["Frequency"] = data["Frequency"].fillna(0)

    data["Frequency"] = data["Frequency"].astype(float)
    data["Duration"] = data["Duration"].astype(float)

    # Divide by 60 when the text mentions minutes but not hours, or when
    # the number is 10 or more; otherwise the value is kept unchanged.
    data["Duration"] = np.where(
        (((data["minute"] == 1) & (data["hour"] == 0)) | (data["Duration"] >= 10)),
        data["Duration"] / 60,
        data["Duration"],
    )

    # NOTE(review): as written, the character class matches only "/" and
    # the Unicode minus sign; the ASCII "-" acts as a range operator between
    # two slashes and is not matched itself. Confirm whether a plain hyphen
    # was meant to be included before changing the pattern.
    data["Fees"] = data["Fees"].str.extract(r"(\d*(?=[/−/-/]))", expand=False)

    data["Day"] = data["Day"].str.lower()
    data["Day"] = data["Day"].str.findall(pat)
    # findall leaves NaN where the day field was missing; drop those rows.
    data = data.dropna(subset=["Day"])
    data["Day"] = data["Day"].apply(mapping)

    data["Requirements"] = data["Requirements"].str.lower()

    # Flag columns; the labels (including the "Undegrad" spelling) are kept
    # byte-for-byte because they become column names in the output.
    data["Female Req"] = data["Requirements"].str.contains(r"female")
    data["Male Req"] = data["Requirements"].str.contains(r"\bmale\b")
    data["MOE Req"] = data["Requirements"].str.contains(r"\bmoe\b")
    data["Undegrad Req"] = data["Requirements"].str.contains(r"undergrad")
    data["Grad Req"] = data["Requirements"].str.contains(r"graduate")
    data["Full Time Req"] = data["Requirements"].str.contains(r"full time")
    data["Part Time Req"] = data["Requirements"].str.contains(r"part time")
    data["Exp Req"] = data["Requirements"].str.contains(r"experience")

    data = data.drop_duplicates(subset="Postal_Code", keep="first")

    return data

In [310]:
# Exported chat files to parse, in processing order.
msg_list = [
    "messages.html",
    "messages2.html",
    "messages3.html",
    "messages4.html",
    "messages5.html",
]

# Day keyword -> number. 8 and 9 are placeholders for "any weekday" and
# "any weekend day"; mapping() replaces them with a random concrete day.
# Defined before `pat`, which is built from its keys: with the original
# ordering this cell raised NameError on a fresh kernel.
days = {
    "mon": 1,
    "tues": 2,
    "wed": 3,
    "thurs": 4,
    "fri": 5,
    "sat": 6,
    "sun": 7,
    "weekday": 8,
    "weekend": 9,
}

# Alternation of every day keyword inside a single capture group.
pat = r"({})".format("|".join(days.keys()))


def find_day(x):
    """Return every day keyword found in ``x``, joined by commas.

    Yields an empty string when ``x`` contains none of the keywords.
    """
    return ",".join(re.findall(pat, x))


def mapping(data):
    """Translate a list of day keywords into a list of day numbers.

    Parameters
    ----------
    data : iterable of str
        Keys of ``days``; an unknown key raises ``KeyError``.

    Returns
    -------
    list
        One number per keyword. "weekday" becomes a random integer from
        1 to 5 and "weekend" a random 6 or 7, drawn from NumPy's global
        random state, so results vary between runs unless that state is
        seeded beforehand.
    """
    lst = []
    for y in data:
        num = days[y]
        if num == 8:
            # randint's upper bound is exclusive: 1..5.
            num = np.random.randint(1, 6)
        elif num == 9:
            # 6 or 7.
            num = np.random.randint(6, 8)
        lst.append(num)
    return lst


# Shared scratch list of message texts; filled by create_table() and read
# by create_series().
text_list = []

In [311]:
# Parse every export file, then concatenate once. Growing a DataFrame with
# pd.concat inside the loop copies all accumulated rows on each iteration
# and concatenates with an empty frame on the first one.
tables = []
for i in msg_list:
    data = create_table(i)
    tables.append(data)

# pd.concat raises on an empty list; fall back to an empty frame so an
# empty msg_list still produces the same result as before.
data_comb = pd.concat(tables) if tables else pd.DataFrame()

# reset_index() without drop=True keeps each table's old row labels as an
# "index" column. NOTE(review): confirm that column is wanted downstream.
data_comb = data_comb.reset_index()

In [312]:
# Persist the combined table; the row index is not written to the file.
data_comb.to_csv(path_or_buf="data.csv", index=False)