<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#IMPORTS" data-toc-modified-id="IMPORTS-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>IMPORTS</a></span></li><li><span><a href="#LOAD" data-toc-modified-id="LOAD-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>LOAD</a></span></li><li><span><a href="#SELECT-LINE-145" data-toc-modified-id="SELECT-LINE-145-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>SELECT LINE 145</a></span></li><li><span><a href="#REMOVE-PROBLEMATIC-ROWS" data-toc-modified-id="REMOVE-PROBLEMATIC-ROWS-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>REMOVE PROBLEMATIC ROWS</a></span></li><li><span><a href="#CREATE-A-LIST-OF-THE-VALID-TRIP-IDS-W/-WHICH-TO-FILTER-LEAVETIMES" data-toc-modified-id="CREATE-A-LIST-OF-THE-VALID-TRIP-IDS-W/-WHICH-TO-FILTER-LEAVETIMES-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>CREATE A LIST OF THE VALID TRIP IDS W/ WHICH TO FILTER LEAVETIMES</a></span></li><li><span><a href="#TO-FEATHER:-DF145-TABLE-&amp;-DF145-IDS" data-toc-modified-id="TO-FEATHER:-DF145-TABLE-&amp;-DF145-IDS-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>TO FEATHER: DF145 TABLE &amp; DF145 IDS</a></span></li></ul></div>

# IMPORTS

In [1]:
from IPython.core.display import display, HTML
display(HTML('<style>.container {width:90% !important;}</style>'))

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import csv

In [3]:
# Display plots in the notebook
%matplotlib inline

In [4]:
def assess_df(df):
  
    # declare the feature types in the dataframe.
    # this adds dynamism to the function, whereby it can check
    # for constant columns on a dtype-amended or non-amended
    # dataframe.
    # Applicable for constant column check below

    feature_types = {df[df.columns[x]].dtypes.name for x in range(len(df.columns))}
    
  	# print datatypes
  
    print("Datatypes:\n")
    print(df.dtypes)
    print('\n\n')
    
    # print number rows and cols
    print('*' * 50)
    print('ROWS, COLUMNS, NULLS\n')
    print(df.shape[0], "rows in the dataframe.")
    print(df.shape[1], "columns in the dataframe.\n")
    print(df.isnull().sum().sum(), "null values in the dataframe.")
    
    # check duplicate rows & col; print if present
    
    duplicate_rows = df.duplicated(keep=False).sum()
    duplicate_cols = df.index.T.duplicated(keep=False).sum()
    
    print(duplicate_rows, "duplicate rows")
    print(duplicate_cols, "duplicate columns")
    
    if duplicate_rows > 0 or duplicate_cols > 0:
      print(df.loc[df.duplicated()])
	
    print('\n')
    
    
    # check for constant columns
    # form lists of numeric and categorical columns
    
    numeric_cols = list(df.select_dtypes(include=[np.number]).columns.values)
    categorical_cols = list(df.select_dtypes('category').columns.values)
    
    # generate list of standard deviations for each numeric feature
    # a standard deviation of 0 indicates a constant numeric column.
    
    standard_diffs = [df.describe().loc['std', x] for x in numeric_cols]
    
    
    # for each categorical column, check whether the feature has
    # a unique value of 1. if yes, indicates a constant column
    
    constant_categorical_column=False
    if 'category' in feature_types:
        for column in categorical_cols:
            if df[column].describe().index.unique == 1:
                constant_categorical_column = True
    
    # print the results of checking for constant columns
    # for both continuous and categorical features
            
    if 0 in standard_diffs:
        print("Constant numeric columns: TRUE")
    else:
        print("Constant numeric columns: FALSE")
  
    if 'category' in feature_types:    
        if constant_categorical_column == True:
            print("Constant categorical columns: TRUE")
        else:
            print("Constant categorical columns: FALSE")

            
    
    print('\n\n')

    
    # feature stats
    print('*' * 50)
    print('DESCRIPTION\n')
    print(df.describe().T)
    print('\n\n')
    print('*' * 50)

    
    # feature stats: categorical
    
    if 'category' in feature_types:
        print('CATEGORICAL DESCRIPTION\n')
        print('\n')
        print(df.select_dtypes(['category']).describe().T)
        print('\n\n')
        print('*' * 50)
    
    # print feature cardinalities
    
    print("FEATURE CARDINALITIES\n")
    column_names = list(df.columns.values)

    print('{0:45}  {1}'.format("Feature", "Distinct Values"))
    print('{0:45}  {1}'.format("-------", "--------------- \n"))

    for c in column_names:
        print('{0:45}  {1}'.format(c, str(len(df[c].unique()))))
    
    print('\n')
    print('*' * 50)
    print('MEMORY\n')
    
    
    print(df.info(memory_usage='deep'))
    print('\n')
    print(df.memory_usage(deep=True))
    print('\n')
    print('*' * 50)
    print('HEAD\n')
    print(df.head(10))
    print('\n')
    print('*' * 50)
    print('TAIL\n')
    print(df.tail(10))

# LOAD

In [2]:
df = pd.read_feather('/tmp/ssh_mount/data/dataframes/230719_trips.feather')

In [16]:
df.loc[df.LINEID == '161'].isna().sum()

DAYOFSERVICE               0
TRIPID                     0
LINEID                     0
ROUTEID                    0
DIRECTION                  0
PLANNEDTIME_ARR            0
PLANNEDTIME_DEP            0
ACTUALTIME_ARR           819
ACTUALTIME_DEP           704
ACTUAL_TRIP_DURATION    1476
DAYOFWEEK                  0
MONTH                      0
HOURRANGE_DEPARTURE      704
dtype: int64

In [18]:
df.loc[df.LINEID == '161'].shape

(1903, 13)

In [19]:
df.loc[df.LINEID == '161']

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,ACTUAL_TRIP_DURATION,DAYOFWEEK,MONTH,HOURRANGE_DEPARTURE
654,2018-06-21,7014925,161,161_50,1,41220,39600,,39720.0,,3,6,11:00-12:00
4671,2018-06-26,7110473,161,161_50,1,37620,36000,,36000.0,,1,6,09:00-10:00
10955,2018-07-09,7163630,161,161_50,1,41220,39600,,39552.0,,0,7,10:00-11:00
10956,2018-07-09,7163631,161,161_50,1,55620,54000,,53987.0,,0,7,14:00-15:00
10985,2018-07-09,7163629,161,161_51,2,39600,37800,38900.0,,,0,7,
14190,2018-09-20,8084892,161,161_51,2,40385,39000,,,,3,9,
14191,2018-09-20,8085819,161,161_51,2,39185,37800,39222.0,,,3,9,
14192,2018-09-20,8091095,161,161_51,2,64085,62700,64004.0,,,3,9,
14193,2018-09-20,8091093,161,161_51,2,59585,58200,60238.0,59432.0,806.0,3,9,16:00-17:00
15050,2018-07-09,7165303,161,161_50,1,37620,36000,,35971.0,,0,7,09:00-10:00


In [6]:
df.head(2)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,ACTUAL_TRIP_DURATION,DAYOFWEEK,MONTH,HOURRANGE_DEPARTURE
0,2018-02-07,6253783,68,68_80,1,87245,84600,87524.0,84600.0,2924.0,2,2,23:00-00:00
1,2018-02-07,6262138,25B,25B_271,2,30517,26460,32752.0,,,2,2,


# SELECT LINE 145

In [7]:
df_145_trips = df[df.LINEID == '145']

In [8]:
df_145_trips.reset_index(drop=True, inplace=True)

In [9]:
df_145_trips.head(2)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,ACTUAL_TRIP_DURATION,DAYOFWEEK,MONTH,HOURRANGE_DEPARTURE
0,2018-02-18,6263611,145,145_102,1,57297,52800,57733.0,52807.0,4926.0,6,2,14:00-15:00
1,2018-02-18,6267386,145,145_102,1,58497,54000,58711.0,53932.0,4779.0,6,2,14:00-15:00


# REMOVE PROBLEMATIC ROWS

<br>

The following changes are based on work done in the *First_145ABT_Preparation_DataQuality_(Informal).ipynb* notebook

In [10]:
# remove null values

df_145_trips.dropna(axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# remove obvious errors whereby departure time is later than arrival time for a trip

df_145_trips.drop(df_145_trips[df_145_trips.ACTUALTIME_DEP \
                           > df_145_trips.ACTUALTIME_ARR].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
# remove low range outliers, based on trip duration
# (trips under 45min)

df_145_trips.drop(df_145_trips[df_145_trips.ACTUAL_TRIP_DURATION\
                               < 2750].index, inplace=True)

In [13]:
# remove high range outliers, based on trip duration
# (trips over 2.3hrs)

df_145_trips.drop(df_145_trips[df_145_trips.ACTUAL_TRIP_DURATION > 8300]\
                  .index, inplace=True)

# CREATE A LIST OF THE VALID TRIP IDS W/ WHICH TO FILTER LEAVETIMES

In [14]:
df_145_trips.head(2)

Unnamed: 0,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,ACTUAL_TRIP_DURATION,DAYOFWEEK,MONTH,HOURRANGE_DEPARTURE
0,2018-02-18,6263611,145,145_102,1,57297,52800,57733.0,52807.0,4926.0,6,2,14:00-15:00
1,2018-02-18,6267386,145,145_102,1,58497,54000,58711.0,53932.0,4779.0,6,2,14:00-15:00


In [15]:
df_145_trips_ids = df_145_trips.TRIPID.drop_duplicates()

In [16]:
type(df_145_trips_ids)

pandas.core.series.Series

In [17]:
df_145_trips_ids.dtypes

CategoricalDtype(categories=[5955221, 5955222, 5955223, 5955224, 5955225, 5955226,
                  5955227, 5955228, 5955229, 5955230,
                  ...
                  8592186, 8592187, 8592188, 8592189, 8592202, 8592203,
                  8592204, 8592205, 8592206, 8592207],
                 ordered=False)

In [18]:
df_145_trips_ids = df_145_trips_ids.astype('int32')

In [19]:
df_145_trips_ids.dtype

dtype('int32')

# TO FEATHER: DF145 TABLE & DF145 IDS

In [21]:
# write LINE 145's trip indices to csv

df_145_trips_ids.to_csv('/Users/davidodwyer/Documents/studyCS/Semester_3/data/dataframes/L145/L145_indices.csv')

  This is separate from the ipykernel package so we can avoid doing imports until
