# Clean 6 Months Data
### Green Taxi Datasets

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from datetime import datetime
from math import radians, cos, sin, asin, sqrt
import requests
import json
import time

#### 1. First, we will import the data into the system

In [None]:
# Load the six monthly Green Taxi trip files (January-June 2015) and stack
# them into a single DataFrame, timing the whole load.
start = time.time()

# File names differ only in the zero-padded month number.
monthly_files = ['green_tripdata_2015-{:02d}.csv'.format(month) for month in range(1, 7)]

# ignore_index=True gives the combined frame one continuous 0..n-1 index.
# Without it every monthly file contributes its own 0-based labels, so a
# single index label would refer to several different trips.
# The per-month frames live only in the temporary list, so they can be
# released once the combined frame has been built.
green = pd.concat([pd.read_csv(path) for path in monthly_files], ignore_index=True)

# Here we are storing all the column names in a variable named 'columns'
columns = green.columns
print(columns)
end = time.time()
print('Time Taken to process: ', end - start)

Index(['VendorID', 'lpep_pickup_datetime', 'Lpep_dropoff_datetime',
       'Store_and_fwd_flag', 'RateCodeID', 'Pickup_longitude',
       'Pickup_latitude', 'Dropoff_longitude', 'Dropoff_latitude',
       'Passenger_count', 'Trip_distance', 'Fare_amount', 'Extra', 'MTA_tax',
       'Tip_amount', 'Tolls_amount', 'Ehail_fee', 'improvement_surcharge',
       'Total_amount', 'Payment_type', 'Trip_type '],
      dtype='object')
Time Taken to process:  12.47181749343872


#### 2. Now, we will get a summary of the dataset and then clean it up

In [None]:
# Give every row a sequential integer identifier (0, 1, 2, ...) and place
# it as the first column of the frame.
row_count = len(green)
green.insert(loc=0, column='trip_id', value=range(row_count))

Time Taken to process:  0.7889082431793213


In [None]:
# Report what needs to be cleaned up before proceeding further:
# the number of missing values per column and the total number of rows.
start = time.time()

# U+0332 (combining low line) is joined between the characters of each
# heading so that it prints underlined.
underline = "\u0332"
null_counts = green.isnull().sum()
total_rows = len(green.index)

print(underline.join('Number of Null data values in each columns:'))
print(null_counts)
print('')
print(underline.join('Number of Datasets:'), total_rows)

N̲u̲m̲b̲e̲r̲ ̲o̲f̲ ̲N̲u̲l̲l̲ ̲d̲a̲t̲a̲ ̲v̲a̲l̲u̲e̲s̲ ̲i̲n̲ ̲e̲a̲c̲h̲ ̲c̲o̲l̲u̲m̲n̲s̲:
trip_id                        0
VendorID                       0
lpep_pickup_datetime           0
Lpep_dropoff_datetime          0
Store_and_fwd_flag             0
RateCodeID                     0
Pickup_longitude               0
Pickup_latitude                0
Dropoff_longitude              0
Dropoff_latitude               0
Passenger_count                0
Trip_distance                  0
Fare_amount                    0
Extra                          0
MTA_tax                        0
Tip_amount                     0
Tolls_amount                   0
Ehail_fee                6291450
improvement_surcharge          0
Total_amount                   0
Payment_type                   0
Trip_type                     10
dtype: int64

N̲u̲m̲b̲e̲r̲ ̲o̲f̲ ̲D̲a̲t̲a̲s̲e̲t̲s̲: 6291450


In [None]:
# The null count of 'Ehail_fee' equals the number of rows in the dataset,
# i.e. the column holds no values at all, so it is removed as irrelevant.
green = green.drop('Ehail_fee', axis=1)

# Print the per-column null counts again to confirm the column is gone.
heading = 'Number of Null data values in each columns:'
remaining_nulls = green.isnull().sum()
print("\u0332".join(heading))
print(remaining_nulls)

N̲u̲m̲b̲e̲r̲ ̲o̲f̲ ̲N̲u̲l̲l̲ ̲d̲a̲t̲a̲ ̲v̲a̲l̲u̲e̲s̲ ̲i̲n̲ ̲e̲a̲c̲h̲ ̲c̲o̲l̲u̲m̲n̲s̲:
trip_id                   0
VendorID                  0
lpep_pickup_datetime      0
Lpep_dropoff_datetime     0
Store_and_fwd_flag        0
RateCodeID                0
Pickup_longitude          0
Pickup_latitude           0
Dropoff_longitude         0
Dropoff_latitude          0
Passenger_count           0
Trip_distance             0
Fare_amount               0
Extra                     0
MTA_tax                   0
Tip_amount                0
Tolls_amount              0
improvement_surcharge     0
Total_amount              0
Payment_type              0
Trip_type                10
dtype: int64


In [None]:
# Per the boxplot of this column (not shown in this cell), '2.0' is an
# outlier, meaning the majority of rows have the value '1'.
# The missing entries are therefore replaced with '1'.
# NOTE: the column name really does end with a trailing space.
trip_type_col = 'Trip_type '
green[trip_type_col] = green[trip_type_col].fillna(1)

In [None]:
# After the fill above there should be no null values left.
# Refresh 'columns' so it matches the cleaned-up frame, then print the
# per-column null counts once more as a check.
columns = green.columns
null_summary = green.isnull().sum()
print("\u0332".join('Number of Null data values in each columns:'))
print(null_summary)

N̲u̲m̲b̲e̲r̲ ̲o̲f̲ ̲N̲u̲l̲l̲ ̲d̲a̲t̲a̲ ̲v̲a̲l̲u̲e̲s̲ ̲i̲n̲ ̲e̲a̲c̲h̲ ̲c̲o̲l̲u̲m̲n̲s̲:
trip_id                  0
VendorID                 0
lpep_pickup_datetime     0
Lpep_dropoff_datetime    0
Store_and_fwd_flag       0
RateCodeID               0
Pickup_longitude         0
Pickup_latitude          0
Dropoff_longitude        0
Dropoff_latitude         0
Passenger_count          0
Trip_distance            0
Fare_amount              0
Extra                    0
MTA_tax                  0
Tip_amount               0
Tolls_amount             0
improvement_surcharge    0
Total_amount             0
Payment_type             0
Trip_type                0
dtype: int64


#### 3. Now, select the data that we need to use for our algorithms

In [None]:
# Keep only the columns used by the downstream algorithms.
feature_columns = [
    'trip_id',
    'VendorID',
    'lpep_pickup_datetime',
    'Lpep_dropoff_datetime',
    'Pickup_longitude',
    'Pickup_latitude',
    'Dropoff_longitude',
    'Dropoff_latitude',
    'Passenger_count',
    'Trip_distance',
]
dataset = green[feature_columns]
print("\u0332".join('Dataset:'), len(dataset.index))

# Keep trips with fewer than three passengers and a non-zero distance.
# Both conditions are combined into one mask so the frame is filtered once.
keep_trip = (dataset['Passenger_count'] < 3) & (dataset['Trip_distance'] != 0)

# .copy() makes 'dataset' an independent frame rather than a slice derived
# from 'green', so assigning columns to it later does not raise pandas'
# SettingWithCopyWarning.
dataset = dataset[keep_trip].copy()
print("\u0332".join('Filtered Dataset:'), len(dataset.index))

D̲a̲t̲a̲s̲e̲t̲: 6291450
F̲i̲l̲t̲e̲r̲e̲d̲ ̲D̲a̲t̲a̲s̲e̲t̲: 5664142
Time Taken to process:  14.211508512496948


In [None]:

# Convert the pickup and drop-off timestamp columns to pandas datetimes.
for timestamp_col in ('lpep_pickup_datetime', 'Lpep_dropoff_datetime'):
    dataset[timestamp_col] = pd.to_datetime(dataset[timestamp_col])

#### 4. Output the cleaned data to CSV

In [None]:
# Write the cleaned trips to disk; index=False leaves the DataFrame index
# out of the file.
output_path = 'green_tripdata_2015_6months_cleaned.csv'
dataset.to_csv(output_path, index=False)