## CODE INFORMATION:

This document stores the code I used to download the data behind my lists of the most delayed trains (ranked by net delay and by delay per mile) between July 2021 and June 2022. If you want to reuse it, it is probably better to work through FindEveryPossibleAmtrakTrain_Q421_Q322.ipynb instead.

In [19]:
import requests
from matplotlib import pyplot as plt
import numpy as np
import os
import math

# Get the station delay data for all real train numbers

In [20]:
# Set ranges of all the train numbers.
# This is a flat list of inclusive (first, last) pairs: downLoadFiles reads the
# entries two at a time, so [1, 9999] requests every train number from 1
# through 9999. Append further pairs to add more ranges.
trainNumbers = [1, 9999]

In [21]:
# Download the files we need for each train
def downLoadFiles(trainNumbers, startMonth, startDay, startYear, endMonth, endDay, endYear, statistic, folder):
    """Download the average-delays page for every train number in the given ranges.

    Each page that actually contains data is saved as
    ``<folder>/amtrakTrain<number>.txt``. Pages reporting an unknown or
    invalid train number, or no data for the date window, are not saved.

    Args:
        trainNumbers: Flat list of inclusive (first, last) train-number pairs,
            e.g. ``[1, 9999]`` or ``[1, 99, 500, 599]``. A trailing unpaired
            entry is ignored. To fetch a single train ``n`` pass ``[n, n]``.
        startMonth, startDay, startYear: Start of the date window, as strings
            (e.g. ``"07"``, ``"01"``, ``"2021"``).
        endMonth, endDay, endYear: End of the date window, as strings.
        statistic: Value sent as the ``stat`` query parameter (the notebook
            uses ``"avg"`` or ``"med"``).
        folder: Directory the files are written to; created if missing.

    Returns:
        None. Progress is reported by printing each train number.
    """
    # Create the output folder up front so the first write cannot fail on a
    # missing directory after the network request has already been made.
    if folder:
        os.makedirs(folder, exist_ok=True)

    base_url = "https://juckins.net/amtrak_status/archive/html/average_delays.php"

    # A response containing any of these markers carries no delay data.
    no_data_markers = (
        "<p class=\"content-red\">Train number not found.<br>",
        "<p class=\"content-red\">Invalid train number</p>",
        "<p class=\"content-red\">No data found; try different dates.<br>",
    )

    date_start = startMonth + "/" + startDay + "/" + startYear
    date_end = endMonth + "/" + endDay + "/" + endYear

    # Walk the list two entries at a time: (first, last), (first, last), ...
    for first, last in zip(trainNumbers[0::2], trainNumbers[1::2]):
        for i in range(first, last + 1):

            print(i)

            # Let requests build and escape the query string; "/" in the dates
            # is sent as %2F, matching the hand-built URL this replaces.
            params = {
                "train_num": str(i),
                "date_start": date_start,
                "date_end": date_end,
                "df1": "1",
                "df2": "1",
                "df3": "1",
                "df4": "1",
                "df5": "1",
                "df6": "1",
                "df7": "1",
                "stat": statistic,
                "chartsize": "2",
                "dfon": "1",
            }
            # A timeout keeps one stalled connection from hanging the whole crawl.
            page = requests.get(base_url, params=params, timeout=60)

            # Do not store HTTP error pages as if they were train data.
            if not page.ok:
                print("Skipping train " + str(i) + ": HTTP status " + str(page.status_code))
                continue

            # Make sure train number exists and has data in the window
            html = page.text
            if any(marker in html for marker in no_data_markers):
                continue

            fileName = folder + "/amtrakTrain" + str(i) + ".txt"
            # Explicit encoding: the platform default (e.g. cp1252 on Windows)
            # can fail on characters that appear in the page.
            with open(fileName, 'w', encoding="utf-8") as f:
                f.write(html)
        


In [22]:
# Date window and statistic for the per-station delay download.
start_month, start_day, start_year = "07", "01", "2021"
end_month, end_day, end_year = "06", "30", "2022"

# Statistic to request: "avg" or "med".
statistic = "med"

# NOTE(review): the folder name says "22Q4" and "Average", but the window above
# ends 06/30/2022, the statistic is "med", and the destination-delay folder used
# later in this document is named "21Q4_22Q3_..." - confirm the name is intended.
folder = "21Q4_22Q4_AverageDelayByStation"

downLoadFiles(
    trainNumbers,
    start_month, start_day, start_year,
    end_month, end_day, end_year,
    statistic,
    folder,
)

# Get the latitude and longitude of every Amtrak station

In [26]:
# Make the list of all stations with their city

def findStations():
    """Scrape the station list page into a dict keyed by station code.

    Fetches the juckins.net stations page, cuts the text down to the table
    rows that follow the "Code"/"Name" header row, and reads two fixed-offset
    fields out of every ``<tr>``.

    Returns:
        dict mapping the 3-character text found 8 characters after each
        ``<tr>`` to a one-element list holding the text that starts 20
        characters after the ``<tr>`` and runs to the next ``</td>``.
        Presumably each row looks like ``<tr><td>ABC</td><td>Name</td>``,
        which is what makes those offsets the code and name cells - verify
        against the live page if the site layout changes.
    """
    response = requests.get("https://juckins.net/amtrak_status/archive/html/stations.php")
    html = response.text

    # Skip past the header row: dropping its first character means the next
    # "<tr>" search lands on the row after it.
    header_row = "<tr><td style=\"text-align:left; font-style: italic;\">Code</td><td style=\"text-align:left; font-style:italic\">Name</td></tr>"
    html = html[html.find(header_row) + 1:]

    # Keep only the span from the first remaining row to the end of the table.
    first_row = html.find("<tr>")
    table_end = html.find("</table><br>")
    rows = html[first_row:table_end]

    stations = {}
    row_at = rows.find("<tr>")
    while row_at >= 0:
        code_start = row_at + 8
        name_start = row_at + 20
        name_end = rows.find("</td>", name_start)
        stations[rows[code_start:code_start + 3]] = [rows[name_start:name_end]]
        row_at = rows.find("<tr>", row_at + 1)

    return stations


In [27]:
# Use the google maps API to get the latitude and longitude of each station

def getStationLatLong(stations):
    """Look up latitude/longitude for every station and store them in place.

    For each entry, the first list element (the station's name/address text)
    is sent to the Google Geocoding API prefixed with "Amtrak Station". When
    the API returns at least one result, the first result's ``lat`` and
    ``lng`` are written to positions 1 and 2 of the entry's list. Entries
    with no result are left unchanged.

    The API key is read from a module-level ``GOOGLE_MAPS_API_KEY`` variable
    if one is defined, otherwise from the ``GOOGLE_MAPS_API_KEY`` environment
    variable.

    Args:
        stations: dict mapping station code to a list whose first element is
            the station's name/address string. Mutated in place.

    Returns:
        None.

    Raises:
        NameError: if no API key is available from either source.
    """
    if not stations:
        return

    # GOOGLE_MAPS_API_KEY = ENTER API KEY IF NEEDED
    try:
        api_key = GOOGLE_MAPS_API_KEY
    except NameError:
        api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
    if not api_key:
        # Same exception type an undefined global would raise, but with a
        # message that says how to fix it.
        raise NameError(
            "GOOGLE_MAPS_API_KEY is not defined; set it as a module-level "
            "variable or as an environment variable before geocoding"
        )

    for station in stations:
        entry = stations[station]

        # Flatten ", " and " - " separators to plain spaces. requests then
        # URL-encodes the query (spaces become "+"), which also escapes
        # characters such as "&" or "#" that would corrupt a hand-built URL.
        address = entry[0].replace(", ", " ").replace(" - ", " ")
        response = requests.get(
            'https://maps.googleapis.com/maps/api/geocode/json',
            params={'address': "Amtrak Station " + address, 'key': api_key},
            timeout=30,
        )

        results = response.json().get('results')
        if results:
            location = results[0]['geometry']['location']
            # Slice assignment appends on a fresh 1-element entry and
            # overwrites positions 1-2 on an entry that was geocoded before.
            entry[1:3] = [location['lat'], location['lng']]
    


In [58]:
# Build the station table (code -> [name/address text]) from the stations page.
stationLocations = findStations()
# Geocode every station. getStationLatLong returns nothing: it adds the
# latitude and longitude to each station's list inside stationLocations.
getStationLatLong(stationLocations)

# Download the delay data for every train

In [33]:
# Download the delay data needed
def downLoadDelayData(t_num, dest, month, day, year, folder,
                      end_month="06", end_day="30", end_year="2022"):
    """Download one train's arrival-delay history page and save it to disk.

    Queries the juckins.net history page for the given train and station over
    the given date window, sorted by arrival delay in descending order, and
    writes the response to ``<folder>/amtrakTrain<t_num>.txt``.

    Args:
        t_num: Train number; converted with ``str()``. The original notes say
            the site also accepts forms such as ``"30*"``, ``"2100-2200"``,
            ``"3,4,5,6"`` or ``"all"``.
        dest: Station code sent as the ``station`` parameter. Per the original
            notes, either the train number or the station must be given and
            the other may be ``""``.
        month, day, year: Start of the date window, as strings.
        folder: Directory the file is written to; created if missing.
        end_month, end_day, end_year: End of the date window, as strings.
            Defaults to 06/30/2022, the value that used to be hard-coded. Per
            the original notes these may be ``""`` when querying a single day.

    Returns:
        None. The train number is printed as a progress indicator.
    """
    number = str(t_num)

    params = {
        "train_num": number,
        "station": dest,
        "date_start": month + "/" + day + "/" + year,
        "date_end": end_month + "/" + end_day + "/" + end_year,
        # df1..df7 are the Sunday..Saturday filters; "1" includes the day.
        "df1": "1",
        "df2": "1",
        "df3": "1",
        "df4": "1",
        "df5": "1",
        "df6": "1",
        "df7": "1",
        # Sort column: "d_dp" (departure delay) or "d_ar" (arrival delay).
        "sort": "d_ar",
        # Sort direction: "DESC" or "ASC".
        "sort_dir": "DESC",
        # Comparison used with limit_mins: "gt", "gteq", "eq", "lteq", "lt".
        "co": "gt",
        # Optional delay threshold in minutes; "" applies no threshold.
        "limit_mins": "",
        # Keep at "1": "0" makes the site ignore the weekday filters above.
        "dfon": "1",
    }

    # requests builds and escapes the query string ("/" is sent as %2F, as in
    # the hand-built URL this replaces); the timeout keeps one stalled
    # connection from hanging a long batch of downloads.
    page = requests.get(
        "https://juckins.net/amtrak_status/archive/html/history.php",
        params=params,
        timeout=60,
    )

    print(number)

    # Do not store HTTP error pages as if they were delay data.
    if not page.ok:
        print("Skipping train " + number + ": HTTP status " + str(page.status_code))
        return

    # Make sure the output folder exists before writing into it.
    if folder:
        os.makedirs(folder, exist_ok=True)

    fileName = folder + "/amtrakTrain" + number + ".txt"
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) can
    # fail on characters that appear in the page.
    with open(fileName, 'w', encoding="utf-8") as f:
        f.write(page.text)

    

In [34]:
# Start of the date window for the destination-delay download
# (downLoadDelayData supplies the end date itself).
start_month = "07"
start_day = "01"
start_year = "2021"
folder2 = "21Q4_22Q3_DestinationDelays"
# NOTE(review): trainDict is not defined anywhere in this document; it must
# already exist in the session. Presumably it maps each train number to a
# sequence whose element [1] is the destination station code - confirm.
for trainNum in trainDict:
    downLoadDelayData(trainNum, trainDict[trainNum][1], start_month, start_day, start_year, folder2)