In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Pick the raw pickup CSVs by name rather than by position: os.listdir()
# returns entries in arbitrary order, so slicing the "last seven" entries can
# silently select the wrong files.
# NOTE(review): assumes the seven intended files are exactly the
# 'uber-raw-data-*.csv' ones (six monthly files plus the janjune-15 file that
# the next cell removes) - confirm against the directory contents.
files=sorted(
    f for f in os.listdir(r'E:\Data Science\Data Analyst Projects\UberNewYorkDataAnalysis')
    if f.startswith('uber-raw-data-') and f.endswith('.csv')
)
#files

In [None]:
# Exclude the January-June 2015 file (it is loaded separately further down).
# A filter is used instead of list.remove(): remove() mutates the list in
# place and raises ValueError when this cell is executed a second time.
files=[f for f in files if f!='uber-raw-data-janjune-15.csv']
files

In [None]:
path=r'E:\Data Science\Data Analyst Projects\UberNewYorkDataAnalysis'

# Read every file first and concatenate once. Calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration.
frames=[pd.read_csv(os.path.join(path,f),encoding='utf') for f in files]

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers. An empty file list still yields an
# empty DataFrame, as the loop-based version did.
final=pd.concat(frames,ignore_index=True) if frames else pd.DataFrame()

final.shape

# Data Preparation

#### Lat : The latitude of the Uber pickup
#### Lon : The longitude of the Uber pickup
#### Base : The TLC base company code affiliated with the Uber pickup

In [None]:
# Work on an independent copy so the concatenated frame `final` is left as loaded.
df=final.copy(deep=True)
df.head(n=2)

In [None]:
#df.dtypes

In [None]:
# Convert the 'Date/Time' strings (month/day/year hour:minute:second) to datetimes.
pickup_ts=pd.to_datetime(df['Date/Time'], format="%m/%d/%Y %H:%M:%S")
df['Date/Time']=pickup_ts
df.dtypes

In [None]:
# Split the pickup timestamp into the calendar parts used by the analyses below.
stamp=df['Date/Time'].dt
df['weekday']=stamp.day_name()
for part in ('day','minute','month','hour'):
    df[part]=getattr(stamp,part)

In [None]:
df.head(2)

In [None]:
#df.dtypes

# Analysis of journey by weekdays

In [None]:
#df['weekday'].value_counts()

In [None]:
#df['weekday'].unique()

In [None]:
!pip install plotly
import plotly.express as px

In [None]:
# Count rides per weekday once and reuse the result for both axes.
weekday_counts=df['weekday'].value_counts()
px.bar(x=weekday_counts.index.tolist(), y=weekday_counts.tolist(),
       labels={'x':'Weekday','y':'Number of rides'})

#### Conclusion: Demand is highest on Thursday, which has the largest number of rides.

# Analysis of journey by hour

In [None]:
#df['hour'].unique()

In [None]:
plt.xlabel('Hour of the day')
plt.ylabel('Total journeys')
# One bin per hour (0-23). With the default of 10 bins the 24 hours are split
# unevenly, so some bars would cover three hours and others only two.
plt.hist(df['hour'],bins=24,range=(-0.5,23.5))

In [None]:
#df['month'].unique()

In [None]:
# Short labels for the month numbers handled by this grid.
month_name={4:'Apr',8:'Aug',7:'Jul',6:'Jun',5:'May',9:'Sep'}
plt.figure(figsize=(10,10))
for i,month in enumerate(df['month'].unique()):
    plt.subplot(3,2,i+1)  # 3x2 grid: room for six months
    plt.xlabel(month_name.get(month))
    # One bin per hour (0-23); the default 10 bins would merge hours unevenly.
    df.loc[df['month']==month,'hour'].hist(bins=24,range=(-0.5,23.5))

#### Conclusion: In every month, rides peak in the evening.

# Analyze which month has maximum rides

In [None]:
#df.head(2)

In [None]:
!pip install chart-studio
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
#df.groupby('month')['hour'].count() #finding no. of rides per month

In [None]:
# Number of rides per month, computed once and reused for both axes.
rides_per_month=df.groupby('month')['hour'].count()
trace1=go.Bar(x=rides_per_month.index,
       y=rides_per_month,
       name='Rides per month')  # trace label describes what the bars show
iplot([trace1])

#### Conclusion: September has the highest number of rides.

# Analysis of Journey by each day of the month

In [None]:
plt.figure(figsize=(5,5))
# One bar per calendar day 1-31. The range has to reach 31.5: with an upper
# limit of 30.5 every pickup made on the 31st falls outside the histogram.
plt.hist(df['day'],bins=31,rwidth=0.8,range=(0.5,31.5))
plt.xlabel("Date of the month")
plt.ylabel("Total journeys")
plt.title("Journeys by day of the month")

#### Conclusion: The 30th of the month has the highest number of rides.

In [None]:
# Density histogram with a KDE overlay. sns.distplot is deprecated; this is
# the histplot equivalent given in seaborn's migration notes.
sns.histplot(df['day'],kde=True,stat='density',kde_kws=dict(cut=3))

# Analysis of total rides month wise

In [None]:
#df['month'].unique()

In [None]:
plt.figure(figsize=(10,10))

# Bin edges at 0.5, 1.5, ..., 31.5 give exactly one bar per calendar day.
# With the default of 10 bins the bars cover unequal numbers of days, which
# makes some bars taller regardless of demand.
day_edges=np.arange(0.5,32.5)

for i,month in enumerate(df['month'].unique(),1): #index starts from 1
    plt.subplot(3,2,i)
    df_out=df[df['month']==month] #rows belonging to the current month
    plt.hist(df_out['day'],bins=day_edges)
    plt.xlabel('days in month {}'.format(month))
    plt.ylabel('total rides')

#### Conclusion: Last days of all months seem to be pretty busy.

# Analysing rush in hour

In [None]:
#df.head(2)

In [None]:
# Point estimate of pickup latitude for each hour of the day.
sns.pointplot(data=df,x='hour',y='Lat')

In [None]:
# Same hour-vs-latitude plot, drawn as one colored series per weekday so the
# hourly pattern can be compared across the days of the week.
ax=sns.pointplot(data=df,x='hour',y='Lat',hue='weekday')
ax.set_title('Hour of the day  vs  Latitude of passenger')

# Analysing which base number gets popular by month name

In [None]:
#df.groupby(['Base','month'])['Date/Time'].count()

In [None]:
# Count the pickups for every (Base, month) pair, then turn the group keys
# back into ordinary columns.
pickups_by_base_month=df.groupby(['Base','month'])['Date/Time'].count()
base=pickups_by_base_month.reset_index()
base.head(n=2)

In [None]:
# One line per base: pickup count ('Date/Time' column of `base`) against month.
plt.figure(figsize=(7,5))
sns.lineplot(data=base,x='month',y='Date/Time',hue='Base')

#### Conclusion: B02617 (shown in green) grows more popular with every passing month.

# Perform Cross Analysis through heatmap

## Heatmap between hour and weekday

In [None]:
def count_rows(rows):
    """Return the number of items in ``rows``, as reported by ``len``."""
    n_rows=len(rows)
    return n_rows

In [None]:
# Number of pickups for every (weekday, hour) pair. size() is the built-in
# group row count; it gives the same Series as apply(len) without calling a
# Python function once per group.
by_hrnday=df.groupby(['weekday','hour']).size()
#by_hrnday

In [None]:
# Rows = weekday, columns = hour. unstack() leaves the weekday names sorted
# alphabetically (Friday, Monday, ...), so reorder them into calendar order.
weekday_order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
pivot=by_hrnday.unstack().reindex(weekday_order)
pivot.head(2)

In [None]:
# Color-code the weekday-by-hour pickup counts held in `pivot`.
plt.figure(figsize=(7,5))
sns.heatmap(data=pivot)

#### Conclusion: On all days, rush is in evening

## Heatmap between hour and day(date)

In [None]:
#common function for all heatmaps

def heatmap(col1,col2,data=None):
    """Draw a heatmap of row counts for every (col1, col2) combination.

    Parameters
    ----------
    col1 : str
        Column whose values become the heatmap rows.
    col2 : str
        Column whose values become the heatmap columns.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``df``, so existing
        two-argument calls behave as before.

    Returns
    -------
    The Axes object returned by ``sns.heatmap``.
    """
    frame=df if data is None else data
    # size() counts the rows in each group directly.
    by_cross=frame.groupby([col1,col2]).size()
    pivot=by_cross.unstack()
    plt.figure(figsize=(7,5))
    return sns.heatmap(pivot)

In [None]:
heatmap('day','hour')

#### Conclusion: On all days, there is a rush from evening till night while there is no rush from midnight to early morning

## Heatmap between weekday and month

In [None]:
heatmap('weekday','month')

## Heatmap between day and month

In [None]:
heatmap('day','month')

# Perform spatial analysis on demand of Ubers

## Analysis of location data points

In [None]:
df.head(2)

In [None]:
plt.figure(figsize=(5,5))

# Every pickup as a tiny red '+' marker with no connecting line, then crop
# the axes to the longitude/latitude window below.
plt.plot(df['Lon'],df['Lat'],color='r',marker='+',linestyle='None',markersize=0.5)
plt.xlim(left=-74.2,right=-73.7)
plt.ylim(bottom=40.6,top=41)

#### Conclusion: Higher density of red color shows high rush. Midtown Manhattan sees most rush followed by lower Manhattan and Brooklyn

## Perform spatial analysis using heatmap to get clear cut understanding of Rush

In [None]:
# Restrict the data to weekend pickups (Saturday and Sunday).
options=['Sunday','Saturday']
is_weekend=df['weekday'].isin(options)
df_out=df.loc[is_weekend]
df_out.shape

In [None]:
df_out.head(2)

In [None]:
df_out.groupby(['Lat','Lon'])['weekday'].count().head(2)

In [None]:
# Trips per distinct (Lat, Lon) point, as a three-column frame.
trips_per_point=df_out.groupby(['Lat','Lon'])['weekday'].count()
rush=trips_per_point.reset_index(name='No. of trips')
rush.head(n=2)

In [None]:
!pip install folium
from folium.plugins import HeatMap
import folium

In [None]:
basemap=folium.Map()
basemap

In [None]:
# Add the per-location trip counts in `rush` to `basemap` as a heat layer and
# display the map.
# NOTE(review): `zoom` does not appear to be a documented HeatMap argument -
# presumably `max_zoom` was intended; confirm against the folium docs.
# NOTE(review): re-running this cell adds another layer to the same `basemap`.
HeatMap(rush,zoom=20,radius=15).add_to(basemap)
basemap

#### Conclusion: Red colored area represents area with highest no. of rides and rush which is Manhattan

## Automating the analysis

In [None]:
def plot(df,day):
    """Return a folium map with a heat layer of the pickups on one weekday.

    ``df`` must have 'weekday', 'Lat' and 'Lon' columns; ``day`` is the value
    matched against the 'weekday' column (for example 'Tuesday').
    """
    # Note: defining this function rebinds the name `plot`, which the notebook
    # also imports from plotly.offline.
    day_rows=df.loc[df['weekday']==day]
    # Trips per distinct (Lat, Lon) point on that weekday.
    trips=day_rows.groupby(['Lat','Lon'])['weekday'].count().reset_index()
    heat_layer=HeatMap(trips,zoom=20,radius=15)
    city_map=folium.Map()
    heat_layer.add_to(city_map)
    return city_map

In [None]:
plot(df,'Tuesday')

# Analysing Uber Pickups in each month

## Data Preparation

In [None]:
# Load the pickup file named 'uber-raw-data-janjune-15.csv' and preview two rows.
uber_15=pd.read_csv(filepath_or_buffer=r'E:\Data Science\Data Analyst Projects\UberNewYorkDataAnalysis/uber-raw-data-janjune-15.csv')
uber_15.head(n=2)

In [None]:
uber_15.dtypes

In [None]:
# Convert the 'Pickup_date' strings (year-month-day hour:minute:second) to datetimes.
pickup_ts_15=pd.to_datetime(uber_15['Pickup_date'], format="%Y-%m-%d %H:%M:%S")
uber_15['Pickup_date']=pickup_ts_15
uber_15.dtypes

In [None]:
# Split the pickup timestamp into calendar parts for the analyses below.
stamp_15=uber_15['Pickup_date'].dt
uber_15['weekday']=stamp_15.day_name()
for part in ('day','month','minute','hour'):
    uber_15[part]=getattr(stamp_15,part)

In [None]:
uber_15.head(2)

## Monthly Uber pickups in NYC

In [None]:
uber_15['month'].value_counts()

In [None]:
# Count pickups per month once and reuse the result for both axes.
month_counts=uber_15['month'].value_counts()
px.bar(x=month_counts.index.tolist(),
       y=month_counts.tolist(),
       labels={'x':'Month','y':'Number of pickups'})

#### Conclusion: The number of pickups increases continuously throughout the first 6 months of 2015 and is highest in June.

# Analysing rush in New York City

## Analysing rush in every hour in NYC

In [None]:
plt.figure(figsize=(6,5))
# Name the column via x=/data= instead of passing the Series positionally:
# newer seaborn releases interpret the first positional argument as `data`.
sns.countplot(x='hour',data=uber_15)

#### Conclusion: There is a dip in morning pickups; after that, the number of pickups increases significantly throughout the day and peaks in the evening.

## In-depth analysis of rush in NYC, day and hour wise

In [None]:
uber_15.groupby(['weekday','hour'])['Pickup_date'].count()

In [None]:
# Pickups per (weekday, hour) pair, with the group keys restored as columns.
pickups_by_day_hour=uber_15.groupby(['weekday','hour'])['Pickup_date'].count()
summary=pickups_by_day_hour.reset_index()
summary.head(n=2)

In [None]:
# The counted column still carries the name 'Pickup_date'; relabel it 'Counts'.
summary=summary.rename({'Pickup_date':'Counts'},axis='columns')
summary.head(n=2)

In [None]:
# Hourly pickup counts, one colored series per weekday.
plt.figure(figsize=(6,5))
sns.pointplot(data=summary,x="hour",y="Counts",hue="weekday")

#### Conclusion: On weekends, there are more pickups from late night until midnight, while on weekdays there are more pickups in the morning and evening.

# Perform In-Depth Analysis of Uber Base Number

In [None]:
# Load the file named 'Uber-Jan-Feb-FOIL.csv' (per-base 'active_vehicles' and 'trips' figures are used below).
uber_foil=pd.read_csv(filepath_or_buffer=r'E:\Data Science\Data Analyst Projects\UberNewYorkDataAnalysis/Uber-Jan-Feb-FOIL.csv')

In [None]:
uber_foil.head(2)

In [None]:
uber_foil.shape

## Analysing which base number has most active vehicles

In [None]:
uber_foil['dispatching_base_number'].unique()

In [None]:
# Distribution of 'active_vehicles' for each dispatching base.
sns.boxplot(data=uber_foil,x='dispatching_base_number',y='active_vehicles')

#### Conclusion: B02764 has maximum number of active vehicles

## Analysing which base number has most trips

In [None]:
# Distribution of 'trips' for each dispatching base.
sns.boxplot(data=uber_foil,x='dispatching_base_number',y='trips')

#### Conclusion: B02764 has maximum number of active vehicles and maximum number of trips

## How average trips per vehicle increases or decreases with respect to dates with each of the base number

In [None]:
# Trips handled per active vehicle, row by row.
uber_foil['trips/vehicle']=uber_foil['trips'].div(uber_foil['active_vehicles'])

In [None]:
uber_foil.head(2)

In [None]:
uber_foil.set_index('date').head(2)

In [None]:
plt.figure(figsize=(8,6))
# Parse 'date' before using it as the index so the x-axis is a real time axis
# instead of a sequence of string labels.
# NOTE(review): relies on pd.to_datetime inferring the date format of the
# column - confirm against the file.
by_date=uber_foil.assign(date=pd.to_datetime(uber_foil['date'])).set_index('date')
# One line per dispatching base, all drawn on the current axes.
by_date.groupby(['dispatching_base_number'])['trips/vehicle'].plot()
plt.ylabel('Average trips/vehicle')
plt.title('Demand vs Supply chart (Date-wise)')
plt.legend()

#### Conclusion: The bases drawn in orange and purple have performed better, while the base drawn in blue has the worst performance.