# Parsing VP Kamala Harris' schedules from White House emails

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_latimes as lat
import glob
import os
import re

In [3]:
# Register and activate the L.A. Times Altair theme for the charts below.
alt.themes.register("latimes", lat.theme)
alt.themes.enable("latimes")

# Turn off Altair's max-rows check on chart data.
alt.data_transformers.disable_max_rows()

# Show wide frames, long frames and full cell text when displaying pandas objects.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

### Grab all the email files

In [4]:
# Directory holding the saved White House schedule emails (.eml files).
path = "schedules"

# glob.glob returns matches in arbitrary order; sort so the rows built from
# these files (and the CSV exports) come out in the same order on every run.
email_files = sorted(glob.glob(os.path.join(path, "*.eml")))

### Loop over the list of files

In [5]:
# Parse every email file with pandas' HTML table reader; each entry is the
# list of tables that read_html returned for one file.
emails = [pd.read_html(eml_path, header=None) for eml_path in email_files]

### Grab only the body of the message related to travel

In [6]:
# The schedule body is whatever sits between the "FOR IMMEDIATE RELEASE"
# banner and the closing "###" marker in the stringified tables.
release_body = re.compile(r"FOR IMMEDIATE RELEASE (.*)###")

email_texts = []
skipped_emails = 0

for t in emails:
    result = release_body.search(str(t))
    if result is None:
        # re.search returns None when the banner/marker pair is missing;
        # skip that email instead of failing on `None.group(1)`.
        skipped_emails += 1
        continue
    email_texts.append(result.group(1))

# Make dropped emails visible rather than silently shrinking the dataset.
if skipped_emails:
    print(
        "Skipped "
        + str(skipped_emails)
        + " email(s) with no 'FOR IMMEDIATE RELEASE ... ###' body."
    )

### Read that raw text into a frame we can parse

In [7]:
# One row per extracted email body; the single column is labelled 0 for now.
df = pd.DataFrame(data=email_texts)

In [8]:
# Give the default 0 column a descriptive name.
df = df.rename(columns={0: "original_email_text"})

In [9]:
# Work on a second column so the untouched email text stays available.
df = df.assign(text_to_parse=df["original_email_text"])

### Does the schedule involve travel or the VP's husband?

In [10]:
# Flag emails whose text contains the phrase "will travel".
mentions_travel = df["original_email_text"].str.contains("will travel")
df["involve_travel"] = mentions_travel

In [11]:
# Flag emails whose text mentions "Los Angeles".
mentions_los_angeles = df["original_email_text"].str.contains("Los Angeles")
df["involve_los_angeles"] = mentions_los_angeles

In [12]:
# Flag emails whose text mentions "Oakland".
mentions_oakland = df["original_email_text"].str.contains("Oakland")
df["involve_oakland"] = mentions_oakland

In [13]:
# Flag emails whose text mentions "California".
mentions_california = df["original_email_text"].str.contains("California")
df["involve_california"] = mentions_california

In [14]:
# Flag emails whose text contains the upper-case "SECOND GENTLEMAN" heading.
mentions_second_gentleman = df["original_email_text"].str.contains("SECOND GENTLEMAN")
df["involve_second_gentleman"] = mentions_second_gentleman

### Parsing characters and phrases for splitting the narrative into event-related columns

In [15]:
# Ordered literal substitutions that (1) undo quoted-printable residue and
# (2) turn section headings, " At ", " On " and the year into "|" delimiters.
# Order matters: each replacement runs on the output of the previous one.
# NOTE(review): the trailing-space "SECOND GENTLEMAN " entry can never match
# because the entry just above it already consumed that heading — confirm
# whether the two were meant to be in the opposite order.
literal_replacements = [
    ("PRES= IDENT", "PRESIDENT"),
    ("=E2=80=99", "'"),
    ("= ", ""),
    ("=92", "'"),
    ("=C3=A1", "á"),
    ("DAILY GUIDANCE FOR THE VICE PRESIDENT AND SECOND GENTLEMAN", "|"),
    ("DAILY GUIDANCE FOR THE SECOND GENTLEMAN", "|"),
    ("DAILY GUIDANCE FOR THE SECOND GENTLEMAN ", "|"),
    ("DAILY GUIDANCE FOR THE VICE PRESIDENT", "|"),
    (" At ", "|"),
    (" On ", "|"),
    (", 2021 ", ", 2021|"),
    ("| |", "|"),
    ("||", "|"),
]

delimited_text = df["text_to_parse"]
for old_text, new_text in literal_replacements:
    delimited_text = delimited_text.str.replace(old_text, new_text, regex=False)

df["text_to_parse"] = delimited_text

### Split the events paragraph into columns

In [16]:
# Target columns for the "|"-delimited pieces: two dates, then up to seven events.
split_columns = ["email_date", "events_date"] + [
    "event" + str(position) for position in range(1, 8)
]

# NOTE(review): this assignment presumably needs the split to produce exactly
# nine columns across the frame — confirm against the real emails.
split_pieces = df["text_to_parse"].str.split("|", expand=True)
df[split_columns] = split_pieces

### How many of the event columns actually have events?

In [17]:
# Names of the seven per-event columns, event1 through event7.
cols = [f"event{position}" for position in range(1, 8)]

In [18]:
# Number of event columns that are non-null in each row.
events_per_row = df.loc[:, cols].count(axis="columns")
df["daily_events_count"] = events_per_row

### Cleaning up dates

In [19]:
# Reduce the heading fragment (e.g. " FOR THURSDAY, MARCH 4, 2021") to a
# title-cased "Weekday, Month Day" string.
df["events_date"] = (
    df["events_date"]
    .str.replace("FOR", "", regex=False)
    # strip() with no argument removes surrounding whitespace; the previous
    # strip("") removed nothing, so the space left by "FOR" survived.
    .str.strip()
    .str.title()
    .str.replace(", 2021", "", regex=False)
)

In [20]:
# Split "Weekday, Month Day" into its weekday part and its date part.
day_and_date = df["events_date"].str.split(", ", expand=True)
df[["day_of_week", "events_date"]] = day_and_date

In [21]:
# Trim surrounding whitespace from the weekday name. strip("") is a no-op,
# so call strip() with no argument to actually remove it.
df["day_of_week"] = df["day_of_week"].str.strip()

In [22]:
# Convert the email's send date from text to a datetime.
parsed_email_dates = pd.to_datetime(df.email_date)
df["email_date"] = parsed_email_dates

In [23]:
# The year was removed during cleanup, so append it back before parsing.
events_date_with_year = df["events_date"] + ", 2021"
df["events_date"] = pd.to_datetime(events_date_with_year)

In [24]:
# Row count before de-duplication.
df.shape[0]

63

In [25]:
# Drop rows that are exact repeats across every column, keeping the first.
df = df.drop_duplicates(subset=None, keep="first")

---

### How many events involve travel? 

In [26]:
# Independent copy of the rows flagged as mentioning travel.
is_travel_row = df["involve_travel"].eq(True)
travel = df.loc[is_travel_row].copy()

In [27]:
# Number of travel-flagged rows.
travel.shape[0]

8

### Rudimentary place parser

In [28]:
# Capture whatever follows "will travel to" up to the next period, pipe or comma.
destination_pattern = r"will travel to ([^.|,]*)"
df["place_travel"] = df["text_to_parse"].str.extract(destination_pattern, expand=True)

---

### Get our dataframe in order

In [33]:
# Final column order: dates and summary fields, then flags, then the
# individual events, then the raw and working text.
summary_columns = ["email_date", "events_date", "daily_events_count", "day_of_week"]
flag_columns = [
    "involve_travel",
    "involve_california",
    "involve_los_angeles",
    "involve_oakland",
    "place_travel",
    "involve_second_gentleman",
]
event_columns = ["event" + str(position) for position in range(1, 8)]
text_columns = ["original_email_text", "text_to_parse"]

# Selecting by label raises KeyError if any listed column is missing.
df = df[summary_columns + flag_columns + event_columns + text_columns]

KeyError: "['involve_oakland'] not in index"

---

## Toplines

In [30]:
# Number of rows flagged as mentioning California.
california = int(df["involve_california"].eq(True).sum())

In [31]:
# Number of rows flagged as mentioning Los Angeles.
la = int(df["involve_los_angeles"].eq(True).sum())

In [32]:
# Number of rows flagged as mentioning Oakland.
oakland = int(df["involve_oakland"].eq(True).sum())

KeyError: 'involve_oakland'

In [None]:
# Number of rows flagged as travel. This rebinds `travel`, which earlier in
# the notebook held the filtered DataFrame, to a plain count.
travel = int(df["involve_travel"].eq(True).sum())

In [None]:
# Topline summary sentence. The Oakland figure must come from `oakland`;
# it previously repeated the Los Angeles count.
print(
    "\n\nVice President Harris has had at least "
    + str(travel)
    + " events related to travel on her public schedule released by the White House. \n\nAt least "
    + str(la)
    + " have involved Los Angeles. At least "
    + str(oakland)
    + " have involved Oakland."
)

---

## Charts

### Does she work weekends? 

In [None]:
# Axis styling, pulled out so the encoding below reads at a glance.
date_axis = alt.Axis(format="%b. %-d", tickCount=5, grid=False)
count_axis = alt.Axis(
    tickSize=0,
    domainOpacity=0,
    tickCount=6,
    offset=4,
    gridWidth=0.6,
    gridColor="#dddddd",
)

# One bar per events_date, with height equal to that day's event count.
bar_chart = (
    alt.Chart(df)
    .mark_bar(opacity=1)
    .encode(
        x=alt.X("events_date", title="", axis=date_axis),
        y=alt.Y(
            "daily_events_count",
            title=" ",
            stack=None,
            axis=count_axis,
            # Fixed y range of 0-8.
            scale=alt.Scale(domain=(0, 8)),
        ),
    )
)

# Size, title and global styling for the finished chart.
chart_kamala_events = (
    bar_chart.properties(
        width=600,
        height=400,
        title="VP Kamala Harris events per day",
    )
    .configure_view(strokeOpacity=0)
    .configure_legend(orient="top", symbolType="square")
    .configure_axis(labelFontSize=13)
)

# Last expression in the cell, so the notebook renders the chart.
chart_kamala_events

---

## Exports

In [None]:
# Write the chart to a PNG next to the notebook.
chart_png_path = "chart_kamala_events.png"
chart_kamala_events.save(chart_png_path, scale_factor=1)

In [None]:
# Export the processed frame, ordered by email date, without the index column.
events_by_email_date = df.sort_values(by="email_date")
events_by_email_date.to_csv("data/processed/all_events.csv", index=False)

In [None]:
# Export only the extracted email bodies, without the index column.
original_texts = df["original_email_text"]
original_texts.to_csv("data/raw/original_email_texts.csv", index=False)