In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder

#### Load the dataset

In [2]:
df = pd.read_csv('UberDataset.csv')

#### Display the first few rows

In [3]:
df.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,01-06-2016 14:42,01-06-2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


#### Display dataset summary statistics

In [4]:
df.describe()

Unnamed: 0,MILES
count,1156.0
mean,21.115398
std,359.299007
min,0.5
25%,2.9
50%,6.0
75%,10.4
max,12204.7


#### Display dataset info

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1156 entries, 0 to 1155
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   START_DATE  1156 non-null   object 
 1   END_DATE    1155 non-null   object 
 2   CATEGORY    1155 non-null   object 
 3   START       1155 non-null   object 
 4   STOP        1155 non-null   object 
 5   MILES       1156 non-null   float64
 6   PURPOSE     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


# Data Preprocessing

In [6]:
df['START_DATE'] = pd.to_datetime(df['START_DATE'], errors='coerce')
df['END_DATE'] = pd.to_datetime(df['END_DATE'], errors='coerce')

missing_values = df.isnull().sum()
print(f"Missing values:\n{missing_values}")

df['PURPOSE'].fillna('Unknown', inplace=True)

df.dropna(subset=['START_DATE', 'END_DATE'], inplace=True)

df['DAY_OF_WEEK'] = df['START_DATE'].dt.weekday
df['WEEKEND'] = df['DAY_OF_WEEK'].isin([5, 6])  # 1 for weekends, 0 for weekdays
df['TRIP_DURATION'] = df['END_DATE'] - df['START_DATE']
df['TRIP_DURATION_MIN'] = df['TRIP_DURATION'].dt.total_seconds() / 60

label_encoder = LabelEncoder()
df['CATEGORY'] = label_encoder.fit_transform(df['CATEGORY'])
df['PURPOSE'] = label_encoder.fit_transform(df['PURPOSE'])

Missing values:
START_DATE    735
END_DATE      736
CATEGORY        1
START           1
STOP            1
MILES           0
PURPOSE       503
dtype: int64


# Data Visualization

In [7]:
sns.set(style="whitegrid")

In [8]:
plt.figure(figsize=(12, 6))
sns.countplot(y='PURPOSE', data=df, order=df['PURPOSE'].value_counts().index, palette='viridis')
plt.title('Most Common Purposes for Uber Trips')
plt.xlabel('Count')
plt.ylabel('Purpose')
plt.savefig('plot_common_purposes.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.boxplot(x='MILES', y='PURPOSE', data=df, palette='viridis')
plt.title('Uber Trip Mileage Trends by Purpose')
plt.xlabel('Miles')
plt.ylabel('Purpose')
plt.savefig('plot_mileage_purpose.png')
plt.close()

df['Hour'] = df['START_DATE'].dt.hour
plt.figure(figsize=(12, 6))
sns.boxplot(x='Hour', y='MILES', data=df, palette='viridis')
plt.title('Uber Trip Mileage Trends by Time of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Miles')
plt.savefig('plot_mileage_hour.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.boxplot(x='CATEGORY', y='MILES', data=df, palette='viridis')
plt.title('Comparative Overview of Trip Lengths for Business and Personal Categories')
plt.xlabel('Category')
plt.ylabel('Miles')
plt.savefig('plot_mileage_category.png')
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y='PURPOSE', data=df, order=df['PURPOSE'].value_counts().index, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='MILES', y='PURPOSE', data=df, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Hour', y='MILES', data=df, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='CATEGORY', y='MILES', data=df, palette='viridis')


# PDF Report Generation

In [11]:
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Uber Data Analysis Report', 0, 1, 'C')

    def chapter_title(self, title):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(10)

    def chapter_body(self, body):
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

    def add_image(self, image_path, x, y, w, h):
        self.image(image_path, x=x, y=y, w=w, h=h)

# Create instance of PDF
pdf = PDF()

# Add a page
pdf.add_page()

# Title Page
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, 'Uber Data Analysis Report', 0, 1, 'C')
pdf.ln(10)
pdf.set_font('Arial', 'I', 12)
pdf.cell(0, 10, 'An In-depth Exploratory Data Analysis', 0, 1, 'C')
pdf.ln(20)

# Summary Section
pdf.chapter_title('Summary')
summary_text = """
This report provides an in-depth analysis of Uber trip data. It explores the most common purposes for Uber trips, 
trip mileage trends by purpose and time of day, and provides a comparative overview of trip lengths for business 
and personal categories.
"""
pdf.chapter_body(summary_text)

# Detailed Analysis Section
pdf.add_page()
pdf.chapter_title('Detailed Analysis')

# Most Common Purposes for Uber Trips
pdf.chapter_title('Most Common Purposes for Uber Trips')
pdf.add_image('plot_common_purposes.png', x=10, y=50, w=180, h=100)
pdf.ln(110)

# Mileage Trends by Purpose
pdf.chapter_title('Mileage Trends by Purpose')
pdf.add_image('plot_mileage_purpose.png', x=10, y=160, w=180, h=100)
pdf.add_page()

# Mileage Trends by Time of Day
pdf.chapter_title('Mileage Trends by Time of Day')
pdf.add_image('plot_mileage_hour.png', x=10, y=50, w=180, h=100)
pdf.ln(110)

# Comparative Overview of Trip Lengths
pdf.chapter_title('Comparative Overview of Trip Lengths')
pdf.add_image('plot_mileage_category.png', x=10, y=160, w=180, h=100)
pdf.add_page()

# Recommendations Section
pdf.chapter_title('Recommendations')
recommendations_text = """
For a more comprehensive analysis, the dataset should include the following additional columns:
1. Trip Fare: The fare for each trip to analyze cost efficiency.
2. Trip Duration: Duration of each trip to study time efficiency.
3. Driver Rating: Driver ratings to correlate trip purpose and customer satisfaction.

By including these data points, more insightful analysis can be conducted, leading to better business decisions.
"""
pdf.chapter_body(recommendations_text)

# Conclusion Section
pdf.chapter_title('Conclusion')
conclusion_text = """
This analysis highlights key insights into Uber trip purposes, mileage trends, and trip lengths. Incorporating 
the recommended data points will further enhance the understanding and efficiency of Uber trips, providing valuable 
information for strategic business decisions.
"""
pdf.chapter_body(conclusion_text)

# Save the PDF
pdf.output('Enhanced_Uber_Data_Analysis_Report.pdf')

''