In [1]:
# Summary: the correlation matrix for ride_weather_data shows that as temperatures
# (Mean_Temp, Mean_Feels_Like, Min_Temp, Max_Temp) increase, the number of trips tends
# to decrease. Humidity (Mean_Humidity) is only slightly negatively correlated with trips,
# while cloudiness (Mean_Cloudiness) is positively correlated, suggesting more trips on
# cloudier days. Wind speed (Mean_Wind_Speed) shows minimal association with ride frequency.

# Dependencies and Setup
import scipy.stats as stats
from scipy.stats import shapiro
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting and data
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Uber Data Config

In [None]:
# Files to Load
# All paths are relative to the notebook's working directory. Judging by the
# file names, each list holds one 2014 file followed by one 2015 file.

# Raw Uber CSVs; read and concatenated into `uber_data`.
uber_data_paths = [
    "Data/uber_data_aprsep_2014.csv",
    "Data/uber_data_janjune_2015.csv"
]

# Rides-per-day CSVs; read and concatenated into `uber_ride_data_wb`.
uber_ride_data_paths = [
    "Data/rides_per_day_aprsep_2014.csv",
    "Data/rides_per_day_janjune_2015.csv"
]

# Summary CSVs; read and concatenated into `uber_summary_data`.
uber_summary_data_paths = [
    "Data/uber_summary_aprsep_2014.csv",
    "Data/uber_summary_janjune_2015.csv",
]


#### Uber Data with Times

In [None]:
# Load every CSV named in uber_data_paths into its own DataFrame
data_frames = [pd.read_csv(csv_path) for csv_path in uber_data_paths]

# Stack the per-file frames into one frame with a fresh 0..n-1 row index
uber_data = pd.concat(data_frames, ignore_index=True)
print("Uber Data:")
uber_data.head()



#### With Bases Listed

In [None]:
# Load every CSV named in uber_ride_data_paths into its own DataFrame
data_frames = [pd.read_csv(csv_path) for csv_path in uber_ride_data_paths]

# Stack the per-file frames into one frame with a fresh 0..n-1 row index
uber_ride_data_wb = pd.concat(data_frames, ignore_index=True)
uber_ride_data_wb.head()

#### Without Bases Listed

In [None]:
# Work on a copy so uber_ride_data_wb keeps its "Base" column untouched:
# drop "Base", then parse "Date" into datetimes (a no-op if already parsed).
uber_ride_data_c = (
    uber_ride_data_wb
    .copy()
    .drop(columns=['Base'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Sum "Number of trips" over all rows sharing a date -> one row per unique date
trips_per_date = uber_ride_data_c.groupby('Date')['Number of trips'].sum()
uber_ride_data = trips_per_date.reset_index()
uber_ride_data.head()

In [None]:
# Daily trip totals as a plain array
data = uber_ride_data['Number of trips'].to_numpy()

# Shapiro-Wilk test; H0 is that the sample comes from a normal distribution
stat, p = shapiro(data)

# Compare the p-value against the 5% significance level and report the verdict
alpha = 0.05
verdict = (
    'Data looks Gaussian (fail to reject H0)'
    if p > alpha
    else 'Data does not look Gaussian (reject H0)'
)
print(verdict)


In [None]:
# Set a default style.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so a hard-coded 'seaborn-darkgrid'
# raises on current installs. Use whichever name this install provides; if
# neither exists, keep the default style rather than failing.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Histogram of the daily trip totals (50 bins), saved as SVG and shown inline
plt.figure(figsize=(10,6))
plt.hist(uber_ride_data['Number of trips'], bins=50)
plt.title('Histogram of Number of Trips')
plt.xlabel('Number of trips')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig("histogram_of_number_of_trips.svg", format='svg', dpi=300)
plt.show()
plt.close()

# Q-Q Plot: probplot draws the sample quantiles against a normal distribution
# (its default) directly onto the current pyplot figure
plt.figure(figsize=(10,6))
stats.probplot(uber_ride_data['Number of trips'], plot=plt)
plt.title('Q-Q Plot')
plt.tight_layout()
plt.savefig("qq_plot.svg", format='svg', dpi=300)
plt.show()
plt.close()

In [None]:
# Box-Cox transform of the daily trip totals. The second return value (the
# fitted lambda) is kept in lambda_value but not used further in this cell.
transformed_data, lambda_value = stats.boxcox(uber_ride_data['Number of trips'])

# Histogram of the transformed values (50 bins), saved as SVG and shown inline
fig, ax = plt.subplots(figsize=(10,6))
ax.hist(transformed_data, bins=50)
ax.set_title('Histogram of Transformed Number of Trips')
ax.set_xlabel('Transformed Number of trips')
ax.set_ylabel('Frequency')
fig.tight_layout()
fig.savefig("histogram_of_transformed_number_of_trips.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)


#### Uber Summary Data

In [None]:
# Load every CSV named in uber_summary_data_paths into its own DataFrame
data_frames = [pd.read_csv(csv_path) for csv_path in uber_summary_data_paths]

# Stack the per-file frames; ignore_index renumbers the rows 0..n-1
uber_summary_data = pd.concat(data_frames, ignore_index=True)

# Discard the rows labelled 10, 11 and 12 in the renumbered frame.
# NOTE(review): the reason these particular rows are removed is not visible
# here — confirm against the summary CSVs.
rows_to_drop = [10, 11, 12]
uber_summary_data = uber_summary_data.drop(rows_to_drop)

print("Uber Summary Data:")
uber_summary_data

# Weather Integration

In [None]:
# Weather CSV locations, relative to the notebook's working directory.
# weather_path is read into `weather_data`; weather_daily_path into `weather_daily`.
weather_path = "Data/weather_data_2014-2015.csv"
weather_daily_path = "Data/weather_daily_summary_2014-2015.csv"

#### Weather Data with Daytimes

In [None]:
# Read the single CSV at weather_path into a DataFrame and preview the first rows
weather_data = pd.read_csv(weather_path)
weather_data.head()

#### Weather Data by Days

In [None]:
# Read the CSV at weather_daily_path into a DataFrame and preview the first rows
weather_daily = pd.read_csv(weather_daily_path)
weather_daily.head()

# Weather and Ride Data

In [None]:
# Make sure both frames carry real datetimes in 'Date' so the join compares
# like with like (a no-op when the column is already datetime).
uber_ride_data['Date'] = pd.to_datetime(uber_ride_data['Date'])
weather_daily['Date'] = pd.to_datetime(weather_daily['Date'])

# Inner-join the daily trip totals with the daily weather rows on 'Date'.
# A single vectorised merge keeps only dates present in both frames and
# preserves the row order of uber_ride_data, so no per-date filtering or
# repeated pd.concat (which re-copies the growing frame each pass) is needed.
ride_weather_data = pd.merge(uber_ride_data, weather_daily, on='Date', how='inner')

# Saves ride_weather_data to a .csv
ride_weather_data.to_csv('ride_weather_data.csv', index=False)

ride_weather_data

# Visualization

#### Temp vs. Number of Trips

In [None]:
# Temp vs. Number of Trips: trips on the x-axis, Mean_Temp on the y-axis
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    ride_weather_data["Number of trips"],
    ride_weather_data["Mean_Temp"],
    marker="o",
    edgecolors="black",
    alpha=0.8,
)
ax.set_title("Number of trips vs. Temperature")
ax.set_xlabel("Number of trips")
ax.set_ylabel("Mean Temperature (F)")
ax.grid(True)
fig.tight_layout()
fig.savefig("number_of_trips_vs_temperature.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)

#### Humidity Vs. Number of Trips

In [None]:
# Humidity vs. Number of Trips: trips on the x-axis, Mean_Humidity on the y-axis
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    ride_weather_data["Number of trips"],
    ride_weather_data["Mean_Humidity"],
    marker="o",
    edgecolors="black",
    alpha=0.8,
)
ax.set_title("Number of trips vs. Humidity")
ax.set_xlabel("Number of trips")
ax.set_ylabel("Mean Humidity (%)")
ax.grid(True)
fig.tight_layout()
fig.savefig("number_of_trips_vs_humidity.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)


#### Max Temp Vs. Number of Trips

In [None]:
# Max Temp vs. Number of Trips: trips on the x-axis, Max_Temp on the y-axis
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    ride_weather_data["Number of trips"],
    ride_weather_data["Max_Temp"],
    marker="o",
    edgecolors="black",
    alpha=0.8,
)
ax.set_title("Number of trips vs. Max Temperature")
ax.set_xlabel("Number of trips")
ax.set_ylabel("Max Temperature (F)")
ax.grid(True)
fig.tight_layout()
fig.savefig("number_of_trips_vs_max_temperature.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)

#### Min Temp Vs. Number of Trips

In [None]:
# Min Temp vs. Number of Trips: trips on the x-axis, Min_Temp on the y-axis
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    ride_weather_data["Number of trips"],
    ride_weather_data["Min_Temp"],
    marker="o",
    edgecolors="black",
    alpha=0.8,
)
ax.set_title("Number of trips vs. Min Temperature")
ax.set_xlabel("Number of trips")
ax.set_ylabel("Min Temperature (F)")
ax.grid(True)
fig.tight_layout()
fig.savefig("number_of_trips_vs_min_temperature.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)

#### Cloudiness Vs. Number of Trips

In [None]:
# Cloudiness vs. Number of Trips: trips on the x-axis, Mean_Cloudiness on the y-axis
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    ride_weather_data["Number of trips"],
    ride_weather_data["Mean_Cloudiness"],
    marker="o",
    edgecolors="black",
    alpha=0.8,
)
ax.set_title("Number of trips vs. Cloudiness")
ax.set_xlabel("Number of trips")
ax.set_ylabel("Mean Cloudiness (%)")
ax.grid(True)
fig.tight_layout()
fig.savefig("number_of_trips_vs_cloudiness.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)

#### Wind Speed Vs. Number of Trips

In [None]:
# Wind Speed vs. Number of Trips: trips on the x-axis, Mean_Wind_Speed on the y-axis
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(
    ride_weather_data["Number of trips"],
    ride_weather_data["Mean_Wind_Speed"],
    marker="o",
    edgecolors="black",
    alpha=0.8,
)
ax.set_title("Number of trips vs. Wind Speed")
ax.set_xlabel("Number of trips")
ax.set_ylabel("Mean Wind Speed (mph)")
ax.grid(True)
fig.tight_layout()
fig.savefig("number_of_trips_vs_wind_speed.svg", format='svg', dpi=300)
plt.show()
plt.close(fig)

# Statistics 

In [None]:
# Display label -> weather column whose correlation with the trip count is wanted
metric_columns = {
    "Mean Humidity": "Mean_Humidity",
    "Max Temperature": "Max_Temp",
    "Min Temperature": "Min_Temp",
    "Mean Cloudiness": "Mean_Cloudiness",
    "Mean Wind Speed": "Mean_Wind_Speed",
    "Mean Temperature": "Mean_Temp",
}

# Correlate the trip count with each weather column (Series.corr defaults)
trips = ride_weather_data["Number of trips"]
correlations = {
    label: trips.corr(ride_weather_data[column])
    for label, column in metric_columns.items()
}

# Show each coefficient scaled by 100 with two decimals
print("Correlations (as percentages) between Number of Trips and Weather Metrics:\n")
for metric, corr in correlations.items():
    print(f"{metric}: {corr*100:.2f}%")


# Weather and Ride Data  (w/Bases)

In [None]:
# Make sure both frames carry real datetimes in 'Date' so the join compares
# like with like (a no-op when the column is already datetime).
uber_ride_data_wb['Date'] = pd.to_datetime(uber_ride_data_wb['Date'])
weather_daily['Date'] = pd.to_datetime(weather_daily['Date'])

# Inner-join every ride row (the frame that still has its 'Base' column) with
# the daily weather rows on 'Date'. One vectorised merge avoids filtering both
# frames per date and re-copying a growing frame with pd.concat on every pass.
merged_wb = pd.merge(uber_ride_data_wb, weather_daily, on='Date', how='inner')

# Keep rows grouped by date, in the order each date first appears in
# uber_ride_data_wb: factorize numbers dates by first appearance, and the
# stable sort leaves the within-date row order untouched.
date_codes = pd.factorize(merged_wb['Date'])[0]
ride_weather_data_wb = (
    merged_wb
    .iloc[np.argsort(date_codes, kind='stable')]
    .reset_index(drop=True)
)


In [None]:
# Weather columns to plot against the trip count
weather_metrics = ["Mean_Temp", "Mean_Humidity", "Max_Temp", "Min_Temp", "Mean_Cloudiness", "Mean_Wind_Speed"]

# NOTE(review): this cell plots ride_weather_data, not ride_weather_data_wb,
# although it sits in the "(w/Bases)" section — confirm which frame is intended.
for metric in weather_metrics:
    # Column name with underscores turned into spaces, used for title and y-label
    metric_label = metric.replace('_', ' ')

    # One scatter per metric: trips on the x-axis, the weather column on the y-axis
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(
        ride_weather_data["Number of trips"],
        ride_weather_data[metric],
        marker="o",
        edgecolors="black",
        alpha=0.8,
    )
    ax.set_title(f"Number of trips vs. {metric_label}")
    ax.set_xlabel("Number of trips")
    ax.set_ylabel(metric_label)
    ax.grid(True)

    # Write the figure to "<column>_vs_trips.svg", then release it without showing it
    fig.savefig(f"{metric}_vs_trips.svg", format="svg")
    plt.close(fig)


In [None]:
# Columns of ride_weather_data that take part in the correlation matrix
relevant_columns = [
    'Number of trips',
    'Mean_Temp',
    'Mean_Feels_Like',
    'Min_Temp',
    'Max_Temp',
    'Mean_Humidity',
    'Mean_Wind_Speed',
    'Mean_Cloudiness',
]
filtered_data = ride_weather_data.loc[:, relevant_columns]

# Pairwise correlations between all selected columns (DataFrame.corr defaults)
correlation_matrix = filtered_data.corr()

# Print the matrix as plain text
print(correlation_matrix)


In [None]:
# seaborn is imported here because the notebook's setup cell does not import it;
# the pyplot re-import is harmless and keeps this cell runnable on its own.
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting the heatmap of correlation_matrix: each cell is annotated with its
# value and the colour scale is pinned to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix")
fig.tight_layout()
fig.savefig("Correlation_Matrix.svg", format='svg', dpi=300)

# Show once, then release the figure; a second show() after close() would have
# nothing left to display.
plt.show()
plt.close(fig)
