In [141]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import datetime
import requests
import os
import zipfile


### Wrangle Data

#### Gather, Assess, then Clean Data before moving on to exploratory analysis

### Gathering

In [142]:
df = pd.read_csv('201810-fordgobike-tripdata.csv')

### Assessing

df.shape

In [144]:
df.sample(5)

Unnamed: 0,duration_sec,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,bike_share_for_all_trip
180379,657,2018-10-03 18:43:50.7040,2018-10-03 18:54:47.7430,61.0,Howard St at 8th St,37.776513,-122.411306,364.0,Mission Rock St at 3rd St,37.772886,-122.38994,3803,Subscriber,No
170318,515,2018-10-05 08:20:51.1440,2018-10-05 08:29:26.2530,312.0,San Jose Diridon Station,37.329732,-121.901782,310.0,San Fernando St at 4th St,37.335885,-121.88566,625,Subscriber,No
62668,620,2018-10-22 13:13:29.0600,2018-10-22 13:23:50.0050,70.0,Central Ave at Fell St,37.773311,-122.444293,121.0,Mission Playground,37.75921,-122.421339,3943,Subscriber,No
56389,970,2018-10-23 09:28:40.1100,2018-10-23 09:44:50.1600,285.0,Webster St at O'Farrell St,37.783521,-122.431158,350.0,8th St at Brannan St,37.771431,-122.405787,1667,Subscriber,No
96190,1412,2018-10-16 19:56:52.2030,2018-10-16 20:20:24.2340,206.0,College Ave at Bryant Ave,37.838127,-122.251271,244.0,Shattuck Ave at Hearst Ave,37.873676,-122.268487,877,Subscriber,No


In [145]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201458 entries, 0 to 201457
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration_sec             201458 non-null  int64  
 1   start_time               201458 non-null  object 
 2   end_time                 201458 non-null  object 
 3   start_station_id         200102 non-null  float64
 4   start_station_name       200102 non-null  object 
 5   start_station_latitude   201458 non-null  float64
 6   start_station_longitude  201458 non-null  float64
 7   end_station_id           200102 non-null  float64
 8   end_station_name         200102 non-null  object 
 9   end_station_latitude     201458 non-null  float64
 10  end_station_longitude    201458 non-null  float64
 11  bike_id                  201458 non-null  int64  
 12  user_type                201458 non-null  object 
 13  bike_share_for_all_trip  201458 non-null  object 
dtypes: f

Detect 1: start_time and end_time is not in datetime data type.

Detect 2: Ids are ideal to be in integer datatype not float. Therefore Start_station_id, end_station_id need to change the data type.

Detect 3: user_type and bike_share_for_all_trip should be in bool data type.

In [146]:
df.isnull().sum()

duration_sec                  0
start_time                    0
end_time                      0
start_station_id           1356
start_station_name         1356
start_station_latitude        0
start_station_longitude       0
end_station_id             1356
end_station_name           1356
end_station_latitude          0
end_station_longitude         0
bike_id                       0
user_type                     0
bike_share_for_all_trip       0
dtype: int64

Detect 4: Null values detected undert start_station_id, start_station_name, end_station_id, end_station_name. Drop the rows with null values

In [147]:
df.duplicated().sum()

0

In [150]:
df['start_station_id'].duplicated().sum()

201139

In [151]:
df['end_station_id'].duplicated().sum()

201139

In [152]:
df['bike_id'].duplicated().sum()

198545

### Cleaning

1. Change start_time and end_time data type to datetime

In [153]:
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

2. Change data type for Start_station_id, end_station_id, bike_id to int.

In [None]:
df['start_station_id']=df['start_station_id'].astype('int')
df['end_station_id']=df['end_station_id'].astype('int')
df['bike_id'] = pd.to_numeric(df['bike_id'])

3. user_type and bike_share_for_all_trip should be in category and bool data type.

In [None]:
df['user_type'] = df['user_type'].astype('category')
df['bike_share_for_all_trip'] = df['bike_share_for_all_trip'].astype('category')

In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201458 entries, 0 to 201457
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   duration_sec             201458 non-null  int64         
 1   start_time               201458 non-null  datetime64[ns]
 2   end_time                 201458 non-null  datetime64[ns]
 3   start_station_id         200102 non-null  float64       
 4   start_station_name       200102 non-null  object        
 5   start_station_latitude   201458 non-null  float64       
 6   start_station_longitude  201458 non-null  float64       
 7   end_station_id           200102 non-null  float64       
 8   end_station_name         200102 non-null  object        
 9   end_station_latitude     201458 non-null  float64       
 10  end_station_longitude    201458 non-null  float64       
 11  bike_id                  201458 non-null  int64         
 12  user_type       

4. Drop the rows with null values under start_station_id, start_station_name, end_station_id, end_station_name.

In [158]:
df = df.dropna(how='any',axis=0) 

In [159]:
df.isnull().sum()

duration_sec               0
start_time                 0
end_time                   0
start_station_id           0
start_station_name         0
start_station_latitude     0
start_station_longitude    0
end_station_id             0
end_station_name           0
end_station_latitude       0
end_station_longitude      0
bike_id                    0
user_type                  0
bike_share_for_all_trip    0
dtype: int64

In [160]:
df.shape

(200102, 14)

### What is the structure of your dataset? 
### What is/are the main feature(s) of interest in your dataset? 
### What features in the dataset do you think will help support your investigation into your feature(s) of interest? 