In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Input locations, relative to the notebook: the raw export and the stage-1 output.
data_root = os.path.join('..', 'data')
path0 = os.path.join(data_root, 'raw', '2017_Toronto_Bikeshare.csv')
path1 = os.path.join(data_root, 'interim', '1_tbs_with_date_flags.csv')

In [None]:
# Raw export is read with the default index; the stage-1 file uses its
# first CSV column as the index.
df0 = pd.read_csv(filepath_or_buffer=path0)
df1 = pd.read_csv(filepath_or_buffer=path1, index_col=0)

# checking validity of first transformation
### 1) expect 1 less row
### 2) expect new flags appropriately placed

In [None]:
# Compare row/column counts before and after the first transformation.
for label, frame in (('raw shape:', df0), ('shape after 1:', df1)):
    print(label, frame.shape)


In [None]:
# Print the raw rows at the positions identified as dropped.
# NOTE(review): the positions are hard-coded; confirm they still match the raw file.
for row_pos in (1492369, 1424602):
    print(df0.iloc[row_pos, :])

In [None]:
# Reproducible 10-row spot check of the stage-1 frame, flag columns included.
df1.sample(10, random_state=1)

In [None]:
# Rows where either the start or the stop timestamp was flagged as not a good date.
bad_start = ~df1['trip_start_timeis_good_date']
bad_stop = ~df1['trip_stop_timeis_good_date']
df1[bad_start | bad_stop]

### first python file seems to work as expected

# second stage 

### 1) number of rows does not change
### 2) old columns are dropped
### 3) dates seem to match
### 4) calculated time between datetime should match time in seconds column

In [None]:
# Load the stage-2 output; its first CSV column is used as the index.
interim_dir = os.path.join('..', 'data', 'interim')
path2 = os.path.join(interim_dir, '2_tbs_with_datetime.csv')
df2 = pd.read_csv(path2, index_col=0)

In [None]:
# 1 - the row count should still be 1492368 after stage 2
stage2_shape = df2.shape
print('df2 shape:', stage2_shape)

In [None]:
# 2 - the flag columns should be gone and the date columns present.
# The CSV round-trip stores timestamps as text, so parse them back first.
for time_col in ('trip_start_time', 'trip_stop_time'):
    df2[time_col] = pd.to_datetime(df2[time_col], format='%Y-%m-%d %H:%M:%S')
df2.dtypes

In [None]:
# 3 - eyeball the same sampled positions before (df1) and after (df2) the
# datetime conversion; a shared seed per column keeps the two samples aligned.
for time_col, seed in (('trip_start_time', 2), ('trip_stop_time', 3)):
    for frame in (df1, df2):
        print(frame[time_col].sample(n=4, random_state=seed))

In [None]:
# 4 - compare the duration computed from the parsed timestamps against the
# recorded trip_duration_seconds column. If the datetime parsing were off,
# this would fail on trips crossing over days or months.
df_rand = df2.sample(n=100000, random_state=4).sort_index(ascending=True)

# Elapsed time between stop and start, expressed in seconds.
df_rand['check_time'] = (df_rand['trip_stop_time'] - df_rand['trip_start_time']) / np.timedelta64(1, 's')
df_rand['time_delta'] = df_rand['check_time'] - df_rand['trip_duration_seconds']

# Flag mismatches in BOTH directions: a one-sided `> 500` test would miss rows
# where the computed duration is much shorter than the recorded one.
df_rand['time_flag'] = df_rand['time_delta'].abs() > 500
df_rand['time_flag'].value_counts()

### Second Python File - Date formatting seems on point

# third stage


### 1 - expect data types to make sense and no error reading in csv
### 2 - expect number of rows to be constant
### 3 - expect stop numbers to be dropped
### 4 - expect no NaNs
### 5 - expect names to be filled in, matching old values

In [None]:
# 1, 3 - load the stage-3 output, re-parse the timestamp text, and list the
# column dtypes (this also shows which columns remain).
path3 = os.path.join('..', 'data', 'interim', '3_tbs_all_filled.csv')
df3 = pd.read_csv(path3, index_col=0)
for time_col in ('trip_start_time', 'trip_stop_time'):
    df3[time_col] = pd.to_datetime(df3[time_col], format='%Y-%m-%d %H:%M:%S')
print(df3.dtypes)

In [None]:
# 2 - the row count must still be 1492368 after stage 3
len(df3) == 1492368

In [None]:
# 4 - total count of missing cells across the whole frame should be zero
null_total = df3.isna().sum().sum()
null_total == 0

In [None]:
# Spot check that stage 3 left column position 7 unchanged on the rows where
# df2 had a missing user_type (presumably check #5 from the list above — confirm).
#
# `index` holds the VALUES of df2's first column for the rows whose
# user_type is null.
# NOTE(review): these values are used below as positional row offsets via
# .iloc — confirm the first column really holds row positions.
index =  df2[pd.isnull(df2)['user_type'] == True].iloc[:,0].tolist()
flag = False
# NOTE(review): index[0:-1] skips the last collected entry — confirm this is intentional.
for idx in index[0:-1]:
    # Compare the string forms of column position 7 in both frames.
    # NOTE(review): the stage-3 notes say stop-number columns are dropped, so
    # position 7 may not be the same column in df2 and df3 — verify.
    if str(df3.iloc[idx, 7]) != str(df2.iloc[idx, 7]):
        flag = True
        break
        
        
# Report the outcome: any mismatch found above fails the test.
if flag:
    print('Test Failed')
else:
    print('Test Passed')

In [None]:
# Show the last rows of the stage-3 frame. `tail` must be called; the bare
# attribute only displays the bound-method repr instead of the rows.
df3.tail()

In [None]:
# Last five rows of the stage-3 frame.
df3.tail(n=5)

# Finally test monetization

### shape adds up
### The money adds up

In [None]:
# Load the final processed file and re-parse its timestamp columns from text.
processed_dir = os.path.join('..', 'data', 'processed')
path4 = os.path.join(processed_dir, 'final.csv')
df4 = pd.read_csv(path4, index_col=0)
for time_col in ('trip_start_time', 'trip_stop_time'):
    df4[time_col] = pd.to_datetime(df4[time_col], format='%Y-%m-%d %H:%M:%S')

In [None]:
# The final frame should still contain 1492368 rows.
len(df4) == 1492368

In [None]:
# Display the dtype of every column in the final frame.
df4.dtypes

In [None]:
# Rows whose overage amount is NOT a multiple of 4; expected to be empty.
# (Presumably overages are charged in increments of 4 — confirm.)
# Unary `~` binds tighter than `%`, so `~col % 4 == 0` evaluates
# `((~col) % 4) == 0` rather than negating the comparison; compare with `!=`.
df4[df4['overages'] % 4 != 0]

In [None]:
# Spot check ten trips that incurred an overage. A fixed seed keeps the
# sample reproducible, like the other samples in this notebook.
has_overage = df4['overages'] > 0
df4[has_overage].sample(n=10, random_state=5)