In [223]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
# Notebook-wide configuration.
SEED = 0  # fixed so the random permutations below give the same numbers on Restart & Run All
np.random.seed(SEED)  # seeds NumPy's legacy global RNG, which np.random.permutation draws from
plt.rcParams["figure.figsize"] = (7, 7)  # default figure size (width, height)

In [224]:
# Load the trip records and discard the stray 'Unnamed: 0' column (a CSV formatting artifact).
df = pd.read_csv('project_data.csv').drop(columns='Unnamed: 0')

In [225]:
# Print the column names, non-null counts, and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1278986 entries, 0 to 1278985
Data columns (total 15 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   starttime             1278986 non-null  object 
 1   duration              1278986 non-null  int64  
 2   bikenum               1273703 non-null  float64
 3   station_start         1278986 non-null  int64  
 4   start_addy            1278986 non-null  object 
 5   addy_end              1278986 non-null  object 
 6   station_end           1278986 non-null  int64  
 7   member                1278986 non-null  bool   
 8   days_since_Jan1_2010  1278986 non-null  int64  
 9   day_of_week           1278986 non-null  object 
 10  month                 1278986 non-null  int64  
 11  date                  1278986 non-null  object 
 12  route                 1278986 non-null  object 
 13  temperature           1278986 non-null  float64
 14  conditions            1278986 non-

In [226]:
# Key variables to look at: trip duration and the route label.
df.loc[:, ['duration', 'route']]

Unnamed: 0,duration,route
0,1012,M St & New Jersey Ave SE 4th & M St SW
1,2690,5th & K St NW 19th St & Pennsylvania Ave NW
2,1406,5th & K St NW Park Rd & Holmead Pl NW
3,1413,19th St & Pennsylvania Ave NW 15th & P St NW
4,982,7th & T St NW Massachusetts Ave & Dupont Circl...
...,...,...
1278981,300,15th & P St NW 17th & Corcoran St NW
1278982,387,Convention Center / 7th & M St NW 15th & P St NW
1278983,261,Lamont & Mt Pleasant NW Park Rd & Holmead Pl NW
1278984,2060,21st & I St NW New York Ave & 15th St NW


In [227]:
# Plan: compare the window (2010-09-20, 2010-12-31) against (2011-09-20, 2011-12-31).
df.loc[:, 'date']

0          2010-09-20
1          2010-09-20
2          2010-09-20
3          2010-09-20
4          2010-09-20
              ...    
1278981    2011-12-31
1278982    2011-12-31
1278983    2011-12-31
1278984    2011-12-31
1278985    2011-12-31
Name: date, Length: 1278986, dtype: object

In [228]:
# Step one: find routes whose duration increased over time while holding seasonality constant,
# by comparing the same calendar window (Sep 20 - Dec 31) in 2010 and in 2011.
TOP_N_ROUTES = 500  # keep only the most popular routes so per-route means are not driven by tiny samples
WINDOW_2010 = ('2010-09-20', '2010-12-31')
WINDOW_2011 = ('2011-09-20', '2011-12-31')

popular_routes = df['route'].value_counts().index[:TOP_N_ROUTES]  # value_counts orders by descending frequency
df = df[df['route'].isin(popular_routes)]

# 'date' holds ISO-formatted (YYYY-MM-DD) strings, so string comparison matches chronological order.
# Both ends of each window are stated explicitly (inclusive) instead of relying on where the file starts and ends.
dates_2010 = df[df['date'].between(*WINDOW_2010)]
dates_2011 = df[df['date'].between(*WINDOW_2011)]

In [229]:
# This gets rid of the new routes that came after December 2010: a route with no 2010 trips
# has nothing to be compared against. Routes that appear only in 2010 are not removed here.
routes_2010 = set(dates_2010['route'].unique())  # set membership instead of re-scanning the whole column per route
routes_2011 = dates_2011['route'].unique()

new_routes = [route for route in routes_2011 if route not in routes_2010]
count = len(routes_2011) - len(new_routes)  # number of 2011 routes that also have 2010 trips

dates_2011 = dates_2011[~dates_2011['route'].isin(new_routes)]
print(count) # this should equal the number of unique routes in dates_2010 (when every 2010 route also occurs in 2011)

489


In [230]:
# Number of distinct routes in the 2010 window, then in the 2011 window.
for window_frame in (dates_2010, dates_2011):
    print(window_frame['route'].nunique())

489
489


In [231]:
# Sanity check that the routes line up: count the positions at which the per-route
# mean-duration indexes of the two windows hold the same route label.
route_index_2010 = dates_2010.groupby('route')['duration'].mean().index
route_index_2011 = dates_2011.groupby('route')['duration'].mean().index
sum(route_index_2010 == route_index_2011)

489

In [232]:
# Per-route change in mean duration (2011 window minus 2010 window), sorted ascending.
# Negative values indicate a decrease in duration over time.
mean_duration_2010 = dates_2010.groupby('route')['duration'].mean()
mean_duration_2011 = dates_2011.groupby('route')['duration'].mean()
plausible_routes = mean_duration_2011.sub(mean_duration_2010).sort_values()
plausible_routes.head(10)

route
19th St & Constitution Ave NW L'Enfant Plaza / 7th & C St SW          -2268.856552
14th & G St NW 8th & H St NW                                          -2177.601351
Massachusetts Ave & Dupont Circle NW New York Ave & 15th St NW        -2147.007097
8th & H St NW 14th & G St NW                                          -1723.653595
C & O Canal & Wisconsin Ave NW Georgetown Harbor / 30th St NW         -1509.000562
Columbus Circle / Union Station USDA / 12th & Independence Ave SW     -1226.750831
Massachusetts Ave & Dupont Circle NW Calvert & Biltmore St NW         -1176.433765
L'Enfant Plaza / 7th & C St SW Maine Ave & 7th St SW                  -1172.372348
Georgetown Harbor / 30th St NW Massachusetts Ave & Dupont Circle NW   -1093.083013
Massachusetts Ave & Dupont Circle NW 10th St & Constitution Ave NW    -1029.860368
Name: duration, dtype: float64

In [233]:
# plausible_routes is sorted ascending, so the first 50 labels are the largest decreases
# and the last 50 labels are the largest increases.
ranked_routes = plausible_routes.index
top_duration_decrease = ranked_routes[:50]
top_duration_increase = ranked_routes[-50:]

In [244]:
# The dates covered in each year are asymmetric for a given route.
# Open question: should the means also be conditioned on coming from the same month?
route_of_interest = top_duration_decrease[10]
print(dates_2010.loc[dates_2010['route'] == route_of_interest, 'date'].unique())
dates_2011.loc[dates_2011['route'] == route_of_interest, 'date'].unique()

['2010-10-08' '2010-10-09' '2010-10-10' '2010-10-11' '2010-10-12'
 '2010-10-16' '2010-10-17' '2010-10-22' '2010-10-23' '2010-11-03'
 '2010-11-12' '2010-11-13' '2010-11-18' '2010-11-20' '2010-11-21'
 '2010-11-26' '2010-11-27' '2010-12-02' '2010-12-11' '2010-12-18'
 '2010-12-19' '2010-12-29']


array(['2011-09-20', '2011-09-21', '2011-09-22', '2011-09-23',
       '2011-09-25', '2011-09-26', '2011-09-27', '2011-09-28',
       '2011-09-29', '2011-09-30', '2011-10-03', '2011-10-04',
       '2011-10-05', '2011-10-06', '2011-10-07', '2011-10-08',
       '2011-10-09', '2011-10-10', '2011-10-11', '2011-10-12',
       '2011-10-15', '2011-10-16', '2011-10-17', '2011-10-18',
       '2011-10-21', '2011-10-22', '2011-10-23', '2011-10-24',
       '2011-10-25', '2011-10-26', '2011-10-27', '2011-10-28',
       '2011-10-29', '2011-10-31', '2011-11-01', '2011-11-02',
       '2011-11-03', '2011-11-04', '2011-11-05', '2011-11-06',
       '2011-11-07', '2011-11-08', '2011-11-09', '2011-11-10',
       '2011-11-11', '2011-11-14', '2011-11-15', '2011-11-16',
       '2011-11-17', '2011-11-18', '2011-11-19', '2011-11-21',
       '2011-11-25', '2011-11-27', '2011-11-28', '2011-11-29',
       '2011-11-30', '2011-12-01', '2011-12-02', '2011-12-03',
       '2011-12-05', '2011-12-08', '2011-12-09', '2011-

In [234]:
# Trip counts for one route in the 2010 window, then in the 2011 window.
# The number in each year is uneven. This may also signify that people are using a bike route
# more often because it is now faster or more accessible.
route_of_interest = top_duration_decrease[10]
for window_frame in (dates_2010, dates_2011):
    print(len(window_frame[window_frame['route'] == route_of_interest]))

53
144


In [235]:
# Pool both comparison windows; permutation_calc() reads this frame.
combined = pd.concat((dates_2010, dates_2011))

# Proof of concept for the function permutation_calc(): shuffle one route's pooled durations,
# split them at that route's 2010 sample count, and take
# (mean of the shuffled "2011" part) - (mean of the shuffled "2010" part).
poc_route = top_duration_decrease[0]
# Split point computed from the data (as permutation_calc does) instead of a hard-coded 29.
poc_num_samples_2010 = len(dates_2010[dates_2010['route'] == poc_route])
permuted_durations = np.random.permutation(combined[combined['route'] == poc_route]['duration'])
np.mean(permuted_durations[poc_num_samples_2010:]) - np.mean(permuted_durations[:poc_num_samples_2010])

273.2267586206899

In [236]:
def permutation_calc(plausible_route_list, M):
    """Simulate, per route, mean-duration differences under random relabelling of the years.

    For every route the pooled durations (rows of the module-level ``combined`` frame) are
    shuffled ``M`` times. Each shuffle is split at the route's 2010 sample count (taken from
    the module-level ``dates_2010`` frame) and the difference
    ``mean(part after the split) - mean(part before the split)`` is recorded, i.e. the
    shuffled "2011" mean minus the shuffled "2010" mean.

    Parameters
    ----------
    plausible_route_list : iterable of route labels
        Routes to simulate; each label is matched against the ``'route'`` column.
    M : int
        Number of shuffles per route.

    Returns
    -------
    dict
        Maps each route to a list of ``M`` simulated differences.

    Notes
    -----
    Shuffling uses ``np.random.permutation`` and therefore NumPy's global random state.
    """
    permutation_means = {}
    for route in plausible_route_list:
        # Both values depend only on the route, so compute them once rather than on every shuffle.
        pooled_durations = combined.loc[combined['route'] == route, 'duration'].to_numpy()
        num_samples_2010 = int((dates_2010['route'] == route).sum())

        simulated_diffs = []
        for _ in range(M):
            shuffled = np.random.permutation(pooled_durations)
            # Mean duration in "2011" - mean duration in "2010" for this shuffle.
            simulated_diffs.append(np.mean(shuffled[num_samples_2010:]) - np.mean(shuffled[:num_samples_2010]))
        permutation_means[route] = simulated_diffs
    return permutation_means


In [238]:
# permutation_means = permutation_calc(top_duration_decrease, 100)

In [262]:
def _permutation_ttest(sample_a, sample_b, n_permutations, random_state):
    """Run SciPy's independent two-sample t-test with a permutation-based p-value."""
    try:
        return stats.ttest_ind(sample_a, sample_b,
                               permutations=n_permutations, random_state=random_state)
    except TypeError:
        # NOTE(review): this fallback is only reached when the installed SciPy rejects the
        # `permutations`/`random_state` keywords. Recent SciPy releases are understood to replace
        # them with `method=PermutationMethod(...)` - confirm against the installed version.
        method_kwargs = {'n_resamples': n_permutations}
        if random_state is not None:
            method_kwargs['random_state'] = random_state
        return stats.ttest_ind(sample_a, sample_b, method=stats.PermutationMethod(**method_kwargs))


def t_permutation_calc(plausible_route_list, n_permutations=20000, random_state=None):
    """Permutation t-test of 2010 vs 2011 trip durations for each route.

    Durations are read from the module-level ``dates_2010`` and ``dates_2011`` frames by
    matching their ``'route'`` column against each label.

    Parameters
    ----------
    plausible_route_list : iterable of route labels
        Routes to test.
    n_permutations : int, optional
        Number of permutations handed to SciPy for each test (default 20000).
    random_state : optional
        Forwarded unchanged to SciPy for every route; pass a seed for reproducible p-values.
        With the default ``None`` SciPy picks its own source of randomness.

    Returns
    -------
    permutation_test : dict
        Maps each route to a one-element list holding the SciPy test result.
    p_values : numpy.ndarray
        The p-value of each test, in the order of ``plausible_route_list``.

    Notes
    -----
    The 2010 sample is passed first, so the sign of the test statistic follows
    (2010 - 2011), the opposite of a "2011 minus 2010" difference.
    """
    permutation_test = {}
    p_values = []
    for route in plausible_route_list:
        samples_2010 = dates_2010.loc[dates_2010['route'] == route, 'duration']
        samples_2011 = dates_2011.loc[dates_2011['route'] == route, 'duration']

        t_test = _permutation_ttest(samples_2010, samples_2011, n_permutations, random_state)
        permutation_test[route] = [t_test]  # kept as a one-element list, as callers receive today
        p_values.append(t_test.pvalue)
    return permutation_test, np.array(p_values)


In [266]:
# Run the permutation t-tests (2010 vs 2011 durations) on the 50 largest decreases and the 50 largest increases.
# Despite their names, the `permutation_means_*` dicts hold the per-route test results returned by
# t_permutation_calc, not mean differences; `p_values_d` / `p_values_i` are the matching p-value arrays.
permutation_means_decrease, p_values_d = t_permutation_calc(top_duration_decrease)
permutation_means_increase, p_values_i = t_permutation_calc(top_duration_increase)

In [268]:
# Number of routes with p < ALPHA: the decrease group is printed, the increase group is the cell's value.
# NOTE(review): these routes were pre-selected for having the most extreme differences and no
# multiple-comparison correction is applied, so the counts overstate the evidence.
ALPHA = 0.05
print((p_values_d < ALPHA).sum())
(p_values_i < ALPHA).sum()

43


9