## CODE INFORMATION:

Given 1 train, its destination, and 2 time frames, this code will download the delay data for the train at its destination in both time frames, and run a 2-sample t-test with the data in both time frames to see if the train was significantly more delayed within either of the two timeframes.

In [14]:
import requests
from matplotlib import pyplot as plt
import numpy as np
import os
import math

# Get Delays

In [15]:
# Get the stats we need from the files
def getDelays(trainNumber, endCity, start_month, start_day, start_year, end_month, end_day, end_year):
  
    delaysByTrain = { }
    
    # One of these 2 must exist, other can be ""
    # Number can be entered as "30*", "2100-2200", "3,4,5,6" or "all"
    number = str(trainNumber)
    station = endCity

    # Must all exist
    start_month = str(start_month)
    start_day = str(start_day)
    start_year = str(start_year)

    # Don't have to exist if only looking at one day
    # If only looking at one day, can be ""
    end_month = str(end_month)
    end_day = str(end_day)
    end_year = str(end_year)

    # All these must exist
    sunday = "1"
    monday = "1"
    tuesday = "1"
    wednesday = "1"
    thursday = "1"
    friday = "1"
    saturday = "1"

    # Must exist
    # Should be "d_dp" or "d_ar"
    delay_type = "d_ar"

    # Must exist
    # "DESC" or "ASC"
    order = "DESC"

    # Must exist
    # "gt", "gteq", "eq", "lteq", "lt"
    sign = "gt"

    # Optional, can be ""
    minutes = ""

    # Keep at 1
    # Setting to zero ignores weekday inputs
    # Set to zero if you always want every day of the week
    dfon = "1"

    URL = "https://juckins.net/amtrak_status/archive/html/history.php?"
    URL = URL + "train_num=" + number
    URL = URL + "&station=" + station
    URL = URL + "&date_start=" + start_month + "%2F" + start_day + "%2F" + start_year
    URL = URL + "&date_end=" + end_month + "%2F" + end_day + "%2F" + end_year
    URL = URL + "&df1=" + sunday + "&df2=" + monday + "&df3=" + tuesday + "&df4=" + wednesday + "&df5=" + thursday + "&df6=" + friday + "&df7=" + saturday
    URL = URL + "&sort=" + delay_type + "&sort_dir=" + order
    URL = URL + "&co=" + sign + "&limit_mins=" + minutes + "&dfon=" + dfon
    page = requests.get(URL)
    
    listDelays = []
    toFind = "</span></td><td style=\"text-align: center;\">"
    length = len(toFind)
    searching = page.text.find(toFind)
    while (searching != -1) :
        end = page.text.find("<", searching + length)
        delayNumber = int(page.text[searching+length:end])
        listDelays.append(delayNumber)
        searching = page.text.find(toFind, end)
        
    return listDelays

# Functions

In [16]:
# Import library
import scipy.stats as stats

In [17]:
# Use Scipy to get the p-value
def getPValue(train1Delays, train2Delays):
    train1Data = np.asarray(train1Delays)
    train2Data = np.asarray(train2Delays)
    train1Mean = train1Data.mean()
    train2Mean = train2Data.mean()

    test = stats.ttest_ind(a=train1Data, b=train2Data, equal_var=False)
    statistic = test[0]
    p_value = test[1]

    print("Test Statistic:", statistic)
    print("P-value:", p_value)
    if (train1Mean > train2Mean):
        print("If the p-value is sufficiently small,\n time frame #1 is significantly more delayed than time frame #2 on average.")
    else:
        print("If the p-value is sufficiently small,\n time frame #2 is significantly more delayed than time frame #1 on average.")

In [18]:
# Use math I have learned in ORF245 to get the p-value
def myPValue(train1Delays, train2Delays):

    train1Data = np.asarray(train1Delays)
    train2Data = np.asarray(train2Delays)
    train1Mean = train1Data.mean()
    train2Mean = train2Data.mean()


    if (len(train1Delays) >= 40 and len(train2Delays) >= 40):
        myZ = (train2Mean-train1Mean)/(np.sqrt(train1Data.var(ddof=1)/len(train1Delays) + train2Data.var(ddof=1)/len(train2Delays)))
        
        df_top = (train1Data.var(ddof=1)/len(train1Delays) + train2Data.var(ddof=1)/len(train2Delays))**2
        df_bottom_1 = 1/(len(train1Delays) - 1) * (train1Data.var(ddof=1)/len(train1Delays))**2
        df_bottom_2 = 1/(len(train2Delays) - 1) * (train2Data.var(ddof=1)/len(train2Delays))**2
        df_true = df_top / (df_bottom_1 + df_bottom_2)
        
        myP = (1 - stats.t.cdf(abs(myZ), df = df_true))*2
        
        print("My Test Statistic:", myZ)
        print("My P-value:", myP)
        
        if (train1Mean > train2Mean):
            print("If the p-value is sufficiently small,\n time frame #1 is significantly more delayed than time frame #2 on average.")
        else:
            print("If the p-value is sufficiently small,\n time frame #2 is significantly more delayed than time frame #1 on average.")
    else:
        print("One of the trains has too small a sample size!")

# Inputs - 2 Trains/Routes Must Be Independent Samples

In [19]:
trainNumber = 5
endCity = "EMY"

start_month1 = "07"
start_day1 = "01"
start_year1 = "2021"
end_month1 = "06"
end_day1 = "30"
end_year1 = "2022"
train1Delays = getDelays(trainNumber, endCity, start_month1, start_day1, start_year1, end_month1, end_day1, end_year1)

start_month2 = "07"
start_day2 = "01"
start_year2 = "2022"
end_month2 = "12"
end_day2 = "31"
end_year2 = "2023"
train2Delays = getDelays(trainNumber, endCity, start_month2, start_day2, start_year2, end_month2, end_day2, end_year2)

In [20]:
print(train1Delays)
print(train2Delays)

[1142, 1015, 998, 848, 773, 771, 766, 736, 685, 676, 663, 587, 510, 505, 458, 441, 435, 434, 427, 416, 415, 405, 383, 370, 364, 364, 362, 351, 348, 344, 334, 334, 333, 324, 322, 319, 314, 301, 295, 294, 290, 280, 280, 277, 273, 263, 263, 263, 260, 254, 250, 243, 237, 237, 235, 232, 232, 231, 230, 227, 224, 223, 221, 220, 220, 218, 218, 218, 217, 216, 213, 213, 212, 212, 211, 211, 209, 207, 205, 203, 202, 198, 196, 195, 194, 192, 190, 190, 186, 186, 185, 184, 183, 183, 180, 179, 178, 175, 172, 172, 170, 170, 169, 167, 166, 165, 164, 162, 162, 162, 161, 160, 160, 159, 159, 155, 154, 149, 148, 144, 141, 138, 136, 135, 133, 133, 133, 131, 128, 127, 126, 122, 121, 120, 118, 117, 116, 116, 108, 107, 107, 107, 106, 105, 105, 105, 105, 103, 103, 102, 100, 99, 99, 98, 98, 97, 97, 95, 94, 94, 93, 92, 92, 92, 92, 92, 89, 86, 86, 84, 83, 83, 82, 82, 81, 81, 80, 79, 78, 76, 75, 75, 74, 74, 71, 71, 70, 68, 68, 67, 67, 66, 66, 66, 64, 63, 62, 61, 60, 58, 57, 56, 55, 55, 54, 54, 53, 51, 51, 50, 50, 49

# Results

In [21]:
print("Train #: Train", trainNumber)
print("Destination:", endCity)
print("Time Frame 1:", 
      start_month1 + "/" + start_day1 + "/" + start_year1 + " - " + end_month1 + "/" + end_day1 + "/" + end_year1)
print("Time Frame #1 Average Delay:", np.mean(train1Delays), "Minutes")
print("Time Frame 2:", 
      start_month2 + "/" + start_day2 + "/" + start_year2 + " - " + end_month2 + "/" + end_day2 + "/" + end_year2)
print("Time Frame #2 Average Delay:", np.mean(train2Delays), "Minutes")
getPValue(train1Delays, train2Delays)

Train #: Train 5
Destination: EMY
Time Frame 1: 07/01/2021 - 06/30/2022
Time Frame #1 Average Delay: 138.80817610062894 Minutes
Time Frame 2: 07/01/2022 - 12/31/2023
Time Frame #2 Average Delay: 179.952 Minutes
Test Statistic: -2.4927566980404015
P-value: 0.012999969326783621
If the p-value is sufficiently small,
 time frame #2 is significantly more delayed than time frame #1 on average.


In [22]:
print("Train #: Train", trainNumber)
print("Destination:", endCity)
print("Time Frame 1:", 
      start_month1 + "/" + start_day1 + "/" + start_year1 + " - " + end_month1 + "/" + end_day1 + "/" + end_year1)
print("Time Frame #1 Average Delay:", np.mean(train1Delays), "Minutes")
print("Time Frame 2:", 
      start_month2 + "/" + start_day2 + "/" + start_year2 + " - " + end_month2 + "/" + end_day2 + "/" + end_year2)
print("Time Frame #2 Average Delay:", np.mean(train2Delays), "Minutes")
myPValue(train1Delays, train2Delays)

Train #: Train 5
Destination: EMY
Time Frame 1: 07/01/2021 - 06/30/2022
Time Frame #1 Average Delay: 138.80817610062894 Minutes
Time Frame 2: 07/01/2022 - 12/31/2023
Time Frame #2 Average Delay: 179.952 Minutes
My Test Statistic: 2.4927566980404015
My P-value: 0.012999969326783711
If the p-value is sufficiently small,
 time frame #2 is significantly more delayed than time frame #1 on average.
