In [5]:
import pandas as pd
import numpy as np
from datetime import datetime

## Reading Data
Extracting the data and loading it as dataframes for further analysis.

In [3]:
!unzip data/uber.zip

Archive:  data/uber.zip
  inflating: Cab-Weather Data/Cab-Weather Data/cab_rides.txt  
  inflating: Cab-Weather Data/Cab-Weather Data/weather.txt  
  inflating: cab-weather data/Cab-Weather Data/cab_rides.txt  
  inflating: cab-weather data/Cab-Weather Data/weather.txt  
  inflating: cab_rides.csv           
  inflating: weather.csv             


The data is read as two separate dataframes, one for rides and other for weather.

In [6]:
ride_df = pd.read_csv('cab_rides.csv')
weather_df = pd.read_csv('weather.csv')

In [7]:
ride_df.shape

(693071, 10)

In [8]:
weather_df.shape

(6276, 8)

In [9]:
ride_df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL


In [10]:
weather_df.head()

Unnamed: 0,temp,location,clouds,pressure,rain,time_stamp,humidity,wind
0,42.42,Back Bay,1.0,1012.14,0.1228,1545003901,0.77,11.25
1,42.43,Beacon Hill,1.0,1012.15,0.1846,1545003901,0.76,11.32
2,42.5,Boston University,1.0,1012.15,0.1089,1545003901,0.76,11.07
3,42.11,Fenway,1.0,1012.13,0.0969,1545003901,0.77,11.09
4,43.13,Financial District,1.0,1012.14,0.1786,1545003901,0.75,11.49


Extracting the locations destinations and source and creating a map to encode them in the dataframes.

In [11]:
uniqueLocation = weather_df.location.unique()
uniqueDestination = ride_df.destination.unique()
uniqueSource = ride_df.source.unique()
uniqueLocation.sort()
uniqueDestination.sort()
uniqueSource.sort()

Updating the time in the ride dataframe to ms

In [12]:
ride_df['time_stamp_ms'] = ride_df['time_stamp'] // 1000 

Creating functions to extract the hour, day, month, and year to add them in the dataframe. This can be used as additional features to analyze their impacts on the price. 

In [14]:
def getHour(ts):
  return pd.Timestamp(ts, unit='s', tz='US/Pacific').hour

def weekday(ts):
  return pd.Timestamp(ts, unit='s', tz='US/Pacific').day_of_week

def month(ts):
  return pd.Timestamp(ts, unit='s', tz='US/Pacific').month

def year(ts):
  return pd.Timestamp(ts, unit='s', tz='US/Pacific').year

def day(ts):
  return pd.Timestamp(ts, unit='s', tz='US/Pacific').day

In [15]:
ride_df['hour'] = ride_df['time_stamp_ms'].apply(lambda x: getHour(x),1)
ride_df['day_of_week'] = ride_df['time_stamp_ms'].apply(lambda x: weekday(x),1)

ride_df['day'] = ride_df['time_stamp_ms'].apply(lambda x: day(x),1)
ride_df['month'] = ride_df['time_stamp_ms'].apply(lambda x: month(x),1)
ride_df['year'] = ride_df['time_stamp_ms'].apply(lambda x: year(x),1)

weather_df['hour'] = weather_df['time_stamp'].apply(lambda x: getHour(x),1)
weather_df['day'] = weather_df['time_stamp'].apply(lambda x: day(x),1)
weather_df['month'] = weather_df['time_stamp'].apply(lambda x: month(x),1)
weather_df['year'] = weather_df['time_stamp'].apply(lambda x: year(x),1)

Also, spliting the 24 hour clock into buckets of 3hrs 

In [16]:
def getTimeOfDay(hr):
  if hr >= 0  and hr < 3:
    return 0
  elif hr >= 3 and hr < 6:
    return 1
  elif hr >= 6 and hr < 9:
    return 2
  elif hr >= 9  and hr < 12:
    return 3
  elif hr >= 12 and hr < 15:
    return 4
  elif hr >= 15 and hr < 18:
    return 5
  elif hr >= 18 and hr < 21:
    return 6
  else:
    return 7

In [17]:
ride_df['time_of_day'] = ride_df.apply(lambda x:getTimeOfDay(x['hour']),1)
weather_df['time_of_day'] = weather_df.apply(lambda x:getTimeOfDay(x['hour']),1)

In [18]:
ride_df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,time_stamp_ms,hour,day_of_week,day,month,year,time_of_day
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,1544952607,1,6,16,12,2018,0
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,1543284023,18,0,26,11,2018,6
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,1543366822,17,1,27,11,2018,5
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,1543553582,20,3,29,11,2018,6
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,1543463360,19,2,28,11,2018,6


In [19]:
weather_df.head()

Unnamed: 0,temp,location,clouds,pressure,rain,time_stamp,humidity,wind,hour,day,month,year,time_of_day
0,42.42,Back Bay,1.0,1012.14,0.1228,1545003901,0.77,11.25,15,16,12,2018,5
1,42.43,Beacon Hill,1.0,1012.15,0.1846,1545003901,0.76,11.32,15,16,12,2018,5
2,42.5,Boston University,1.0,1012.15,0.1089,1545003901,0.76,11.07,15,16,12,2018,5
3,42.11,Fenway,1.0,1012.13,0.0969,1545003901,0.77,11.09,15,16,12,2018,5
4,43.13,Financial District,1.0,1012.14,0.1786,1545003901,0.75,11.49,15,16,12,2018,5


Geting source weather and adding them in the ride dataframe.

In [21]:
## geting source weather
# temp	location	clouds	pressure	rain	time_stamp	humidity	wind
def getWeather(location, hour, day, month, year):
  temp = weather_df.loc[(weather_df['location'] == location) & (weather_df['hour'] == hour) & (weather_df['day'] == day) & (weather_df['month'] == month) & (weather_df['year'] == year)]

In [22]:
weather_df['source'] = weather_df['location']

In [23]:
ride_df['dict'] = ride_df['source'] + '_' + ride_df['day'].astype(str) + '_' + ride_df['month'].astype(str) + '_' + ride_df['year'].astype(str) + '_' + ride_df['hour'].astype(str)

In [24]:
weather_df['dict'] = weather_df['source'] + '_' + weather_df['day'].astype(str) + '_' + weather_df['month'].astype(str) + '_' + weather_df['year'].astype(str) + '_' + weather_df['hour'].astype(str)

In [25]:
temp = weather_df.set_index('dict').to_dict()['temp']
clouds = weather_df.set_index('dict').to_dict()['clouds']
pressure = weather_df.set_index('dict').to_dict()['pressure']
rain = weather_df.set_index('dict').to_dict()['rain']
humidity = weather_df.set_index('dict').to_dict()['humidity']
wind = weather_df.set_index('dict').to_dict()['wind']

In [26]:
ride_df['temp'] = ride_df['dict'].map(temp)
ride_df['clouds'] = ride_df['dict'].map(clouds)
ride_df['pressure'] = ride_df['dict'].map(pressure)
ride_df['rain'] = ride_df['dict'].map(rain)
ride_df['wind'] = ride_df['dict'].map(wind)

In [27]:
ride_df.loc[(ride_df['day'] == 16) & (ride_df['source'] == 'Back Bay') & (ride_df['hour'] == 15) & (ride_df['year'] == 2018) & (ride_df['month'] == 12)].head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,...,day,month,year,time_of_day,dict,temp,clouds,pressure,rain,wind
5027,2.34,Lyft,1545002408758,Haymarket Square,Back Bay,30.0,1.0,afba18aa-ce01-4006-8bab-d03d2dc29355,lyft_luxsuv,Lux Black XL,...,16,12,2018,5,Back Bay_16_12_2018_15,42.42,1.0,1012.14,0.1228,11.25
8372,2.33,Uber,1545001809801,North End,Back Bay,9.5,1.0,89c3f652-14e6-46aa-b59d-6e3a8ddedc50,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,...,16,12,2018,5,Back Bay_16_12_2018_15,42.42,1.0,1012.14,0.1228,11.25
16092,6.26,Uber,1545001203826,South Station,Back Bay,43.0,1.0,d5726059-64c1-4716-9b14-5bd46677be7a,6d318bcc-22a3-4af6-bddd-b409bfce1546,Black SUV,...,16,12,2018,5,Back Bay_16_12_2018_15,42.42,1.0,1012.14,0.1228,11.25
21019,2.32,Uber,1545003304083,Haymarket Square,Back Bay,9.5,1.0,9a5156fd-7ce1-4bd9-8457-81f27066d23d,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,...,16,12,2018,5,Back Bay_16_12_2018_15,42.42,1.0,1012.14,0.1228,11.25
23612,2.32,Uber,1545004511239,Haymarket Square,Back Bay,9.5,1.0,3d4665dc-3404-492c-b64b-42d235885a74,9a0e7b09-b92b-4c41-9779-2ad22b4d779d,WAV,...,16,12,2018,5,Back Bay_16_12_2018_15,42.42,1.0,1012.14,0.1228,11.25


Creating a dictionary of non numeric features to encode them in the dataframe

In [28]:
def genDict(arr):
  result = {}
  count = 0
  for i in arr:
    result[i] = count
    count += 1
  return result

In [29]:
name_dict = genDict(ride_df['name'].unique())
cab_type_dict = genDict(ride_df['cab_type'].unique())
source_dict = genDict(ride_df['source'].unique())
destination_dict = genDict(ride_df['destination'].unique())

In [30]:
ride_df['name_dict'] = ride_df['name'].map(name_dict)
ride_df['cab_type_dict'] = ride_df['cab_type'].map(cab_type_dict)
ride_df['source_dict'] = ride_df['source'].map(source_dict)
ride_df['destination_dict'] = ride_df['destination'].map(destination_dict)

In [31]:
ride_df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,...,dict,temp,clouds,pressure,rain,wind,name_dict,cab_type_dict,source_dict,destination_dict
0,0.44,Lyft,1544952607890,North Station,Haymarket Square,5.0,1.0,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,lyft_line,Shared,...,Haymarket Square_16_12_2018_1,38.46,0.29,1022.25,,7.68,0,0,0,0
1,0.44,Lyft,1543284023677,North Station,Haymarket Square,11.0,1.0,4bd23055-6827-41c6-b23b-3c491f24e74d,lyft_premier,Lux,...,Haymarket Square_26_11_2018_18,43.82,0.99,1002.59,0.0997,11.57,1,0,0,0
2,0.44,Lyft,1543366822198,North Station,Haymarket Square,7.0,1.0,981a3613-77af-4620-a42a-0c0866077d1e,lyft,Lyft,...,Haymarket Square_27_11_2018_17,,,,,,2,0,0,0
3,0.44,Lyft,1543553582749,North Station,Haymarket Square,26.0,1.0,c2d88af2-d278-4bfd-a8d0-29ca77cc5512,lyft_luxsuv,Lux Black XL,...,Haymarket Square_29_11_2018_20,35.08,0.0,1013.71,,5.25,3,0,0,0
4,0.44,Lyft,1543463360223,North Station,Haymarket Square,9.0,1.0,e0126e1f-8ca9-4f2e-82b3-50505a09db9a,lyft_plus,Lyft XL,...,Haymarket Square_28_11_2018_19,37.66,0.41,998.42,,11.16,4,0,0,0


Removing all the redundant features

In [32]:
df = ride_df[['distance', 'destination_dict', 'source_dict',
       'hour',  'day_of_week', 'day', 'month', 'year', 'time_of_day', 'temp',
       'clouds', 'pressure', 'rain', 'wind', 'name_dict', 'cab_type_dict','price']]

In [33]:
df.head()

Unnamed: 0,distance,destination_dict,source_dict,hour,day_of_week,day,month,year,time_of_day,temp,clouds,pressure,rain,wind,name_dict,cab_type_dict,price
0,0.44,0,0,1,6,16,12,2018,0,38.46,0.29,1022.25,,7.68,0,0,5.0
1,0.44,0,0,18,0,26,11,2018,6,43.82,0.99,1002.59,0.0997,11.57,1,0,11.0
2,0.44,0,0,17,1,27,11,2018,5,,,,,,2,0,7.0
3,0.44,0,0,20,3,29,11,2018,6,35.08,0.0,1013.71,,5.25,3,0,26.0
4,0.44,0,0,19,2,28,11,2018,6,37.66,0.41,998.42,,11.16,4,0,9.0


Dropping if a feature value is NA for the initial analysis

In [34]:
df.dropna(inplace=True)
X = df.drop(columns=['price'])
y = df['price']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Saving the data frame as a csv file

In [35]:
df.to_csv('cleanData.csv',index=False)