# Importing all necessary packages

In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# Data Processing
import pandas as pd
pd.set_option('display.max_columns',None) # Setting DataFrame display options
import numpy as np
from datetime import datetime

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="white", palette=None) # Setting visualization style

# Geodata Visualization
import shapely
import folium

# Data Extraction
import os
import opendatasets as od

# Statistical Analysis
from scipy.stats import chi2_contingency

# Previewing dataset

## Connecting to my kaggle account and downloading relevant dataset

In [None]:
# Download the Boston Uber/Lyft dataset from Kaggle; the loading cell reads it
# from the 'uber-and-lyft-dataset-boston-ma' folder.
dataset_url = 'https://www.kaggle.com/datasets/brllrb/uber-and-lyft-dataset-boston-ma'
od.download(dataset_url)

## Previewing Dataset

In [None]:
# Load the ride-level CSV from the downloaded dataset folder
# (os.listdir('./uber-and-lyft-dataset-boston-ma') shows the folder contents).
csv_path = 'uber-and-lyft-dataset-boston-ma/rideshare_kaggle.csv'
rides = pd.read_csv(csv_path)

# Column names, dtypes and non-null counts of the raw data.
rides.info()

In [None]:
# Print (rows, columns) of the raw data, then show the first few records.
raw_shape = rides.shape
print(raw_shape)
rides.head()

**Important Notes.**
                                                                                

1.   Multiple sources of time data (timestamp & datetime). Consider the timezone column if you're using datetime. (Resolved)
2.   Investigate Temperature and Apparent Temperature. (Resolved)
3.   Not all columns seem useful; drop irrelevant columns as metadata. (Resolved)

# Cleaning dataset

## Subsetting relevant columns

In [None]:
# Columns kept for the analysis.
relevant_columns = [
    'datetime', 'timezone', 'hour', 'month', 'price', 'distance', 'product_id',
    'latitude', 'longitude', 'temperature', 'apparentTemperature',
    'cab_type', 'source', 'destination', 'surge_multiplier',
    'short_summary', 'long_summary', 'icon',
]

# Keep only Uber rides, restricted to the relevant columns, as an independent copy.
is_uber = rides['cab_type'] == 'Uber'
rides_analysis = rides.loc[is_uber, relevant_columns].copy()

In [None]:
# (rows, columns) left after keeping the relevant columns and the Uber rides only.
rides_analysis.shape

## Checking for missing data

In [None]:
# Number of missing values in each column.
missing_per_column = rides_analysis.isna().sum()
missing_per_column

Price is the target feature, so all observations without a price should be dropped.

In [None]:
# Remove every row that contains a missing value, then confirm none remain.
rides_analysis = rides_analysis.dropna()
rides_analysis.isna().sum()

## Confirm Datatypes.

In [None]:
# Print the dtype of each column, then preview the first rows.
column_types = rides_analysis.dtypes
print(column_types)
rides_analysis.head()

1.   datetime column is in the wrong format. convert it to datetime format. (Resolved)
2.   hour and month columns should be categorical. (Resolved)

In [None]:
# Parse the timestamp strings into real datetimes.
rides_analysis['datetime'] = pd.to_datetime(
    rides_analysis['datetime'],
    format='%Y-%m-%d %H:%M:%S',
)

# Hour and month are used as categories rather than quantities.
for calendar_col in ('hour', 'month'):
    rides_analysis[calendar_col] = rides_analysis[calendar_col].astype('category')

rides_analysis.dtypes

## Create More Features.

1.   Extract the day of the week data from datetime.
2.   Create POINT data from the longitude and latitude columns.

In [None]:
# Day name via the vectorised datetime accessor: unlike strftime('%A') it does
# not depend on the process locale and avoids a Python-level loop.
rides_analysis['day_of_week'] = (
    rides_analysis['datetime'].dt.day_name().astype('category')
)

# One shapely Point per ride, built from the two coordinate columns directly
# instead of the much slower row-by-row iterrows().
# NOTE: points are created as (latitude, longitude), so `.x` holds the latitude
# and `.y` the longitude; the map cell reads the coordinates in that order.
rides_analysis['location'] = [
    shapely.Point(lat, lon)
    for lat, lon in zip(rides_analysis['latitude'], rides_analysis['longitude'])
]

rides_analysis.head()

# Exploratory Data Analysis

In [None]:
# Move the target column ('price') to the last position; other columns keep their order.
feature_columns = [name for name in rides_analysis.columns if name != 'price']
rides_analysis = rides_analysis[feature_columns + ['price']]
rides_analysis.head()

In [None]:
# Print the distinct values of the two columns suspected of being constant.
for candidate in ('timezone', 'surge_multiplier'):
    print(rides_analysis[candidate].unique())

Timezone and surge_multiplier both have only one unique value. Drop them both. (Resolved)

In [None]:
# Remove the two columns inspected in the previous cell.
constant_columns = ['timezone', 'surge_multiplier']
rides_analysis = rides_analysis.drop(columns=constant_columns)

## Checking for correlations in the datasets between distance and price


In [None]:
# Scatter of price against distance; low alpha keeps dense regions readable.
distance_price_grid = sns.relplot(
    data=rides_analysis,
    x='distance',
    y='price',
    kind='scatter',
    alpha=0.2,
    height=6,
    aspect=10/6,
)
distance_price_grid.set(title='Scatter Plot Of Distance vs Price Split')

# Series.corr computes the Pearson coefficient by default.
print(f'The Pearson Correlation between the price and the distance travelled is {rides_analysis.price.corr(rides_analysis.distance)}')

There is a weak correlation between distance and price.


## Checking for Busiest day

In [None]:
# Ride counts per weekday; the index gives the bar order (most frequent first).
day_counts = rides_analysis.value_counts('day_of_week')

day_grid = sns.catplot(
    data=rides_analysis,
    x='day_of_week',
    kind='count',
    order=day_counts.index,
    height=6,
    aspect=10/6,
)
day_grid.set(title='Count of Rides By Day of Week')

Monday and Tuesday have significantly more rides than other days

## Hour_of_day_analysis

In [None]:
# Ride counts per hour; bars are ordered from the busiest hour down.
hour_counts = rides_analysis.value_counts('hour')

hour_grid = sns.catplot(
    data=rides_analysis,
    x='hour',
    kind='count',
    order=hour_counts.index,
    height=6,
    aspect=10/6,
)
hour_grid.set(title='Distribution of rides during the day')

Business is usually busiest at midnight. Typical work closing times like 2pm,4pm and 6pm also see significant traffic.      

## Checking For Busiest Month                                        

In [None]:
# Ride counts per month; bars are ordered from the busiest month down.
month_counts = rides_analysis.value_counts('month')

month_grid = sns.catplot(
    data=rides_analysis,
    x='month',
    kind='count',
    order=month_counts.index,
    height=6,
    aspect=10/6,
)
month_grid.set(title='Distribution of rides during by month')

December recorded significantly more rides than November. This could be due to the holiday season.

## Effect_of_temperature_and_apparent_temperature

To enable visualization, I'll bin the temperature columns into categories.

In [None]:
# Both temperature columns are bucketed with the same edges and labels.
temperature_bins = [-np.inf, 20, 30, 37, 49, np.inf]
temperature_labels = ['Cool', 'Warm', 'Hot', 'Very_Hot', 'Extremely_Hot']

for raw_col, binned_col in (
    ('temperature', 'temperature_summary'),
    ('apparentTemperature', 'apparent_temperature_summary'),
):
    rides_analysis[binned_col] = pd.cut(
        rides_analysis[raw_col],
        bins=temperature_bins,
        labels=temperature_labels,
    )

In [None]:
# Ride counts per temperature bucket, most frequent bucket first.
temperature_counts = rides_analysis.value_counts('temperature_summary')

temperature_grid = sns.catplot(
    data=rides_analysis,
    x='temperature_summary',
    kind='count',
    order=temperature_counts.index,
    height=6,
    aspect=10/6,
)
temperature_grid.set(title='Distribution of Rides Based on Temperature')

In [None]:
# Ride counts per apparent-temperature bucket, most frequent bucket first.
apparent_counts = rides_analysis.value_counts('apparent_temperature_summary')

apparent_grid = sns.catplot(
    data=rides_analysis,
    x='apparent_temperature_summary',
    kind='count',
    order=apparent_counts.index,
    height=6,
    aspect=10/6,
)
apparent_grid.set(title='Distribution of Rides Based on Apparent Temperature')

Apparent temperature seems to have more effect on whether or not people book rides.

## Types_of_cab


In [None]:
# Number of rides recorded for each product_id.
rides_analysis['product_id'].value_counts()

In [None]:
# Horizontal count plot: one bar per product_id.
package_grid = sns.catplot(
    data=rides_analysis,
    y='product_id',
    kind='count',
    height=6,
    aspect=10/6,
)
package_grid.set(title='Distribution of Rides Based On Cab Package')

# Rotate the tick labels on the x axis; the trailing ';' hides the return value.
plt.xticks(rotation=90);

No difference based on Cab Package       

## Origin of Rides        

In [None]:
# Number of rides per distinct pickup point.
origin = rides_analysis['location'].value_counts()
origin

In [None]:
# Centre the map on the first ride's pickup point. The column is selected by
# name: a hard-coded column position (18) stops pointing at 'location' as soon
# as columns are dropped or added earlier in the notebook.
# Points were created as (latitude, longitude), so `.x` is the latitude and
# `.y` the longitude, which is the [lat, lon] order folium expects.
first_point = rides_analysis['location'].iloc[0]
boston_center = [first_point.x, first_point.y]
boston_map = folium.Map(location=boston_center, zoom_start=14)

# One red marker for each distinct pickup point.
for point in origin.index:
    folium.Marker(
        location=[point.x, point.y],
        icon=folium.Icon(color='red'),
    ).add_to(boston_map)

display(boston_map)

There is a decent spread of pickup locations in the dataset.
                                                           

## Source/Destination

In [None]:
# Number of rides starting from each source.
rides_analysis['source'].value_counts()

In [None]:
# Ride counts per source, listed from the most to the least frequent.
source_counts = rides_analysis.value_counts('source')

source_grid = sns.catplot(
    data=rides_analysis,
    y='source',
    kind='count',
    order=source_counts.index,
    height=6,
    aspect=10/6,
)
source_grid.set(title='Distribution of Rides Based on Source')

In [None]:
# Number of rides ending at each destination.
rides_analysis['destination'].value_counts()

In [None]:
# Ride counts per destination, listed from the most to the least frequent.
destination_counts = rides_analysis.value_counts('destination')

destination_grid = sns.catplot(
    data=rides_analysis,
    y='destination',
    kind='count',
    order=destination_counts.index,
    height=6,
    aspect=10/6,
)
destination_grid.set(title='Distribution of Rides Based on destination')

The distribution of rides by the source differs from the distribution generated by the longitude and latitude data. The reason for this could be greater specificity with the POINT data.

## Icon

In [None]:
# Number of rides for each value of the 'icon' column.
rides_analysis['icon'].value_counts()

In [None]:
# Ride counts per icon value, listed from the most to the least frequent.
icon_counts = rides_analysis.value_counts('icon')

icon_grid = sns.catplot(
    data=rides_analysis,
    y='icon',
    kind='count',
    order=icon_counts.index,
    height=6,
    aspect=10/6,
)
icon_grid.set(title='Distribution of rides during by icon')

This distribution could also be due to the specific weather conditions of the time period the data was collected.

## Long summary/Short summary

In [None]:
# Number of rides for each short_summary value.
rides_analysis['short_summary'].value_counts()

In [None]:
# Ride counts per short_summary value, most frequent first.
short_summary_counts = rides_analysis.value_counts('short_summary')

short_summary_grid = sns.catplot(
    data=rides_analysis,
    y='short_summary',
    kind='count',
    order=short_summary_counts.index,
    height=6,
    aspect=10/6,
)
short_summary_grid.set(title='Distribution of Rides Based on short_summary')

In [None]:
# Number of rides for each long_summary value.
rides_analysis['long_summary'].value_counts()

In [None]:
# Ride counts per long_summary value, most frequent first.
long_summary_counts = rides_analysis.value_counts('long_summary')

long_summary_grid = sns.catplot(
    data=rides_analysis,
    y='long_summary',
    kind='count',
    order=long_summary_counts.index,
    height=6,
    aspect=10/6,
)
long_summary_grid.set(title='Distribution of Rides Based on long_summary')

The description of the weather by the short summary, long summary and icon are very similar and the distributions are similar too.

# Check for collinearity.

Null Hypothesis: There is no significant association between the categorical variables.

Alternative Hypothesis: There is a significant association between the categorical variables.

In [None]:
# Cross-tabulate (short_summary, long_summary) pairs as rows against icon as columns.
summary_rows = [rides_analysis['short_summary'], rides_analysis['long_summary']]
contingency_table = pd.crosstab(summary_rows, rides_analysis['icon'])
contingency_table

In [None]:
# Chi-squared test of independence on the contingency table.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

for label, value in (
    ("Chi-squared test statistic:", chi2),
    ("P-value:", p_value),
    ("Degrees of Freedom:", dof),
):
    print(label, value)

# The expected-frequency table itself is displayed by the next cell.
print("Expected frequencies:")

In [None]:
# Expected frequencies, labelled with the contingency table's rows and columns.
expected_table = pd.DataFrame(
    expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)
expected_table

With a p-value of 0.0, there is enough evidence to reject the null hypothesis, i.e. short_summary, long_summary and icon are significantly associated with one another (collinear).

# Features Relationship With The Target Variable                                                                            

In [None]:
# Show the current column names of the analysis frame.
rides_analysis.columns

In [None]:
# 'timezone' has already been removed in the EDA section, so a plain drop raises
# KeyError on a fresh Restart & Run All. errors='ignore' skips labels that are no
# longer present, which also makes this cell safe to re-run.
rides_analysis = rides_analysis.drop(
    columns=['datetime', 'timezone'],
    errors='ignore',
)

## Relationship between 'month' and 'price'

In [None]:
# Mean price per month.
rides_analysis.groupby('month')['price'].mean()

In [None]:
# Box plot of the price distribution within each month.
month_price_grid = sns.catplot(
    data=rides_analysis,
    x='month',
    y='price',
    kind='box',
)
month_price_grid.set(title='Distribution of Prices By Month')

No visible difference

##  Relationship between 'hour' and 'price'

In [None]:
# Mean price per hour, highest first.
rides_analysis.groupby('hour')['price'].mean().sort_values(ascending=False)

In [None]:
# Boxes are ordered by mean price per hour, highest first.
mean_price_by_hour = (
    rides_analysis.groupby('hour')['price']
    .mean()
    .sort_values(ascending=False)
)

hour_price_grid = sns.catplot(
    data=rides_analysis,
    x='hour',
    y='price',
    kind='box',
    order=mean_price_by_hour.index,
    height=6,
    aspect=10/6,
)
hour_price_grid.set(title="Distribution of Prices by hour")


No visible difference

##  Relationship between 'day_of_week' and 'price'

In [None]:
# Mean price per weekday, highest first.
rides_analysis.groupby('day_of_week')['price'].mean().sort_values(ascending=False)

In [None]:
# Boxes are ordered by mean price per weekday, highest first.
mean_price_by_day = (
    rides_analysis.groupby('day_of_week')['price']
    .mean()
    .sort_values(ascending=False)
)

day_price_grid = sns.catplot(
    data=rides_analysis,
    x='day_of_week',
    y='price',
    kind='box',
    order=mean_price_by_day.index,
    height=6,
    aspect=10/6,
)
day_price_grid.set(title="Distribution of Prices by day_of_week")


No visible difference

##  Relationship between 'product_id' and 'price'

In [None]:
# Median price per product_id, highest first.
rides_analysis.groupby('product_id')['price'].median().sort_values(ascending=False)

In [None]:
# Boxes are ordered by mean price per product_id, highest first.
mean_price_by_product = (
    rides_analysis.groupby('product_id')['price']
    .mean()
    .sort_values(ascending=False)
)

product_price_grid = sns.catplot(
    data=rides_analysis,
    x='price',
    y='product_id',
    kind='box',
    order=mean_price_by_product.index,
    height=6,
    aspect=10/6,
)
product_price_grid.set(title="Distribution of Prices by product_id")

# Rotate the tick labels on the x axis; the trailing ';' hides the return value.
plt.xticks(rotation=90);

The distribution of prices is greatly influenced by the cab_type (product_id) being used. This suggests that different cab_types have different base costs or rates.

##  Relationship between 'temperature'/'apparentTemperature' and 'price'

In [None]:
# Correlation of price with the measured and with the apparent temperature
# (Series.corr uses the Pearson coefficient by default).
print(f"The correlation between the temperature and the price is {rides_analysis.temperature.corr(rides_analysis.price)}.")
print(f"The correlation between the apparent_temperature and the price is {rides_analysis.apparentTemperature.corr(rides_analysis.price)}.")

In [None]:
# Scatter of temperature against price; low alpha keeps dense regions readable.
temperature_price_grid = sns.relplot(
    data=rides_analysis,
    x='price',
    y='temperature',
    kind='scatter',
    alpha=0.2,
    height=6,
    aspect=10/6,
)
temperature_price_grid.set(title='scatterplot of Temperature vs Price');

In [None]:
# Scatter of apparent temperature against price; low alpha keeps dense regions readable.
apparent_price_grid = sns.relplot(
    data=rides_analysis,
    x='price',
    y='apparentTemperature',
    kind='scatter',
    alpha=0.2,
    height=6,
    aspect=10/6,
)
apparent_price_grid.set(title='scatterplot of Apparent_Temperature vs Price');

No linear relationship exists for either feature. Analyzing the binned categorical features created earlier (temperature_summary and apparent_temperature_summary) might prove more useful.

##  Relationship between 'temperature_summary'/'apparent_temperature_summary' and 'price'

In [None]:
# Mean price per temperature bucket, highest first.
rides_analysis.groupby('temperature_summary')['price'].mean().sort_values(ascending=False)

In [None]:
# Boxes are ordered by mean price per temperature bucket, highest first.
mean_price_by_temperature = (
    rides_analysis.groupby('temperature_summary')['price']
    .mean()
    .sort_values(ascending=False)
)

temperature_box_grid = sns.catplot(
    data=rides_analysis,
    x='temperature_summary',
    y='price',
    kind='box',
    order=mean_price_by_temperature.index,
    height=6,
    aspect=10/6,
)
temperature_box_grid.set(title="Distribution of Prices by temperature_summary")


In [None]:
# Mean price per apparent-temperature bucket, highest first.
rides_analysis.groupby('apparent_temperature_summary')['price'].mean().sort_values(ascending=False)

In [None]:
# Boxes are ordered by mean price per apparent-temperature bucket, highest first.
mean_price_by_apparent = (
    rides_analysis.groupby('apparent_temperature_summary')['price']
    .mean()
    .sort_values(ascending=False)
)

apparent_box_grid = sns.catplot(
    data=rides_analysis,
    x='apparent_temperature_summary',
    y='price',
    kind='box',
    order=mean_price_by_apparent.index,
    height=6,
    aspect=10/6,
)
apparent_box_grid.set(title="Distribution of Prices by apparent_temperature_summary")


Slight differences between the two groups. Similarities between the two groups might suggest collinearity. Both distributions show that there is a slight increase in average price on hotter days. This might suggest that people simply book longer distance rides on hotter days.

## Relationship Between Distance and Apparent Temperature Summary.

In [None]:
# Mean ride distance per apparent-temperature bucket, highest first.
rides_analysis.groupby('apparent_temperature_summary')['distance'].mean().sort_values(ascending=False)

In [None]:
# Boxes are ordered by mean distance per apparent-temperature bucket, highest first.
mean_distance_by_apparent = (
    rides_analysis.groupby('apparent_temperature_summary')['distance']
    .mean()
    .sort_values(ascending=False)
)

distance_box_grid = sns.catplot(
    data=rides_analysis,
    x='apparent_temperature_summary',
    y='distance',
    kind='box',
    order=mean_distance_by_apparent.index,
    height=6,
    aspect=10/6,
)
distance_box_grid.set(title="Distribution of distances by apparent_temperature_summary")

## Relationship between 'source'/'destination' and 'price'

In [None]:
# Mean price per source, highest first; `order` is reused by the next plotting cell.
order = (
    rides_analysis.groupby('source')['price']
    .mean()
    .sort_values(ascending=False)
)
order

In [None]:
# Horizontal box plot of price per source, using the ordering computed in the previous cell.
source_price_grid = sns.catplot(
    data=rides_analysis,
    x='price',
    y='source',
    kind='box',
    order=order.index,
    height=6,
    aspect=10/6,
)
source_price_grid.set(title="Distribution of prices by source")

In [None]:
# Mean price per destination, highest first; `order` is reused by the next plotting cell.
order = (
    rides_analysis.groupby('destination')['price']
    .mean()
    .sort_values(ascending=False)
)
order

In [None]:
# Horizontal box plot of price per destination, using the ordering computed in the previous cell.
destination_price_grid = sns.catplot(
    data=rides_analysis,
    x='price',
    y='destination',
    kind='box',
    order=order.index,
    height=6,
    aspect=10/6,
)
destination_price_grid.set(title="Distribution of prices by destination")

## **Check_for_collinearity**

## Relationship between 'short_summary'/'long_summary' and 'price'

In [None]:
# Mean price per short_summary value, highest first.
order = (
    rides_analysis.groupby('short_summary')['price']
    .mean()
    .sort_values(ascending=False)
)
order

# Data Preprocessing

This process involves feature selection and engineering.


From the EDA performed,                        

# Finally,_build_models

# Notes

1. There is a weak correlation between distance and price.
2. Monday and Tuesday have significantly more rides than other days
3. Business is usually busiest at midnight. Typical work closing times like 2pm,4pm and 6pm also see significant traffic.      
4. December recorded significantly more rides than November.  
5. Apparent temperature seems to have more effect on whether or not people book rides.
6. No difference based on Cab Package       
7. There is a decent spread of pickup locations in the dataset.
8. The distribution of prices is greatly influenced by the cab_type (product_id) being used. This suggests that different cab_types have different base costs or rates.

Temperature summary and apparent temperature summary
1. Slight differences between the two groups. Similarities between the two groups might suggest collinearity.
2. Both distributions show that there is a slight increase in average price on hotter days.    