<a href="https://colab.research.google.com/github/AlexPazCodesUCSD/CSE151AGroupProject/blob/main/Milestone2_CSE151A_Group_Project_Amazon_Agent_Rating.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run to install all the necessary packages for the project

%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn

# Install more packages using %pip as needed



In [None]:
# Loading the necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.distance import geodesic

# Add more packages as needed

## Data Link
- [Kaggle](https://www.kaggle.com/datasets/sujalsuthar/amazon-delivery-dataset)
- [GitHub](https://github.com/AlexPazCodesUCSD/CSE151AGroupProject/blob/main/amazon_delivery.csv)

In [None]:
# Data Download into Google Colab

!wget https://raw.githubusercontent.com/AlexPazCodesUCSD/CSE151AGroupProject/main/amazon_delivery.csv

# Run only once, you can see the file by clicking the file icon on the leftside of the colab notebook

In [None]:
# Loading Data
# Read the CSV fetched by the download cell into a DataFrame and preview its first rows

csv_path = 'amazon_delivery.csv'
data = pd.read_csv(csv_path)

data.head()

# Data Clean Up

In [None]:
# The initial dimensions of the dataset
initial_dimensions = data.shape

# Count missing values per column and keep only the columns that actually have some
missing_values = data.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]

# Print out the columns with missing values and the count of missing values in each
print("Columns with missing values and the count of missing values:")
print(columns_with_missing_values)

# Clean the data by dropping every row that has a missing value.
# .copy() makes the result an independent frame: without it pandas may still treat
# the dropna() result as derived from `data`, and the column assignments done in
# later cells then raise SettingWithCopyWarning.
cleaned_data = data.dropna().copy()

# The dimensions of the cleaned dataset
cleaned_dimensions = cleaned_data.shape

# Print out the dimensions
print(f"Initial Data Dimensions: # of observations = {initial_dimensions[0]}")
print(f"Cleaned Data Dimensions: # of observations = {cleaned_dimensions[0]}")

# From here on `data` refers to the cleaned frame
data = cleaned_data

## Creating More Features \(Data Preprocessing\)

In [None]:
# Function to calculate distance in miles between store and drop-off locations
def calculate_distance(row):
    """Return the geodesic distance, in miles, between a row's store and drop-off points.

    ``row`` must provide 'Store_Latitude', 'Store_Longitude', 'Drop_Latitude'
    and 'Drop_Longitude'; each point is handed to ``geodesic`` as a
    (latitude, longitude) pair.
    """
    store_lat, store_lon = row['Store_Latitude'], row['Store_Longitude']
    drop_lat, drop_lon = row['Drop_Latitude'], row['Drop_Longitude']
    return geodesic((store_lat, store_lon), (drop_lat, drop_lon)).miles

# Function to determine the delivery season based on the month
def get_season(date):
    """Map ``date.month`` to a season name.

    December-February -> 'Winter', March-May -> 'Spring',
    June-August -> 'Summer', September-November -> 'Fall'.
    Any other month value yields None.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }
    return season_by_month.get(date.month)

# Convert Order_Date, Order_Time, and Pickup_Time to appropriate datetime formats.
# assign() builds a new DataFrame instead of writing columns into the existing one,
# which avoids SettingWithCopyWarning on a frame produced by dropna() and leaves any
# other reference to the pre-feature frame untouched.
data = data.assign(
    Order_Date=pd.to_datetime(data['Order_Date'], format='%Y-%m-%d'),
    Order_Time=pd.to_datetime(data['Order_Time'], format='%H:%M:%S').dt.time,
    Pickup_Time=pd.to_datetime(data['Pickup_Time'], format='%H:%M:%S').dt.time,
)

# Derived features: store-to-drop distance per row, and the season of the
# (now parsed) order date
data = data.assign(
    Distance_Miles=data.apply(calculate_distance, axis=1),
    Delivery_Season=data['Order_Date'].apply(get_season),
)

# Add more if desired

data.head()

# Dataset Column Descriptions and Statistics

1. **Order_ID**: Unique identifier for each order.
2. **Agent_Age**: Age of the delivery agent.
3. **Agent_Rating**: Rating of the delivery agent.
4. **Store_Latitude**: Latitude of the store location.
5. **Store_Longitude**: Longitude of the store location.
6. **Drop_Latitude**: Latitude of the drop-off location.
7. **Drop_Longitude**: Longitude of the drop-off location.
8. **Order_Date**: Date of the order.
9. **Order_Time**: Time of the order.
10. **Pickup_Time**: Time the order was picked up.
11. **Weather**: Weather conditions during delivery.
12. **Traffic**: Traffic conditions during delivery.
13. **Vehicle**: Type of vehicle used for delivery.
14. **Area**: Area type (e.g., Urban, Metropolitan).
15. **Delivery_Time**: Time taken for delivery (in minutes).
16. **Category**: Category of the delivered item.
17. **Distance_Miles**: Distance in miles between the store and the drop-off location, calculated using latitude and longitude.
18. **Delivery_Season**: The season during which the delivery was made, derived from the order date.


In [None]:
# Function to analyze and provide information on each column
def analyze_column(data, column_name):
    """Describe one column of ``data`` as a small dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Name of the column to summarize.

    Returns
    -------
    dict
        Always contains 'type'. Numerical columns add 'statistics'
        (``describe()`` as a dict); categorical columns add 'unique_values'
        and 'categories' (value counts as a dict); every other kind adds a
        'description' string.
    """
    col_data = data[column_name]
    lowered_name = column_name.lower()
    col_info = {}

    # Time-of-day columns are stored as object dtype. Some pandas versions report
    # every object column as "string", which would send these columns down the
    # categorical branch, so detect them up front from the actual values.
    holds_time_values = (
        pd.api.types.is_object_dtype(col_data)
        and lowered_name.endswith('_time')
        and pd.api.types.infer_dtype(col_data, skipna=True) == 'time'
    )

    # Check if the column is numerical
    if pd.api.types.is_numeric_dtype(col_data):
        if "latitude" in lowered_name or "longitude" in lowered_name:
            col_info['type'] = 'Geographical coordinates'
            col_info['description'] = 'Latitude and longitude values should not be analyzed statistically.'
        else:
            col_info['type'] = 'Numerical'
            col_info['statistics'] = col_data.describe().to_dict()

    # Check if the column holds time-of-day values (must precede the string check)
    elif holds_time_values:
        col_info['type'] = 'Time'
        col_info['description'] = 'Time values for orders and pickups.'

    # Check if the column is categorical
    elif pd.api.types.is_string_dtype(col_data):
        if lowered_name == 'order_id':
            col_info['type'] = 'Unique identifier'
            col_info['description'] = 'Unique ID for each order.'
        else:
            col_info['type'] = 'Categorical'
            col_info['unique_values'] = col_data.nunique()
            col_info['categories'] = col_data.value_counts().to_dict()

    # Check if the column is datetime
    elif pd.api.types.is_datetime64_any_dtype(col_data):
        col_info['type'] = 'Datetime'
        col_info['description'] = 'Datetime values for orders and pickups.'

    # Fallback: any remaining object column named *_time is still reported as time
    elif pd.api.types.is_object_dtype(col_data) and lowered_name.endswith('_time'):
        col_info['type'] = 'Time'
        col_info['description'] = 'Time values for orders and pickups.'

    else:
        col_info['type'] = 'Other'
        col_info['description'] = 'Other type of data.'

    return col_info

# Summarize every column of the dataset, keyed by column name
column_analysis = {name: analyze_column(data, name) for name in data.columns}

# Print each summary as an indented key/value listing, one blank line between columns
for name, summary in column_analysis.items():
    print(f"Column: {name}")
    for field, detail in summary.items():
        print(f"  {field}: {detail}")
    print()


print("First few rows of the updated dataset:")
data.head()


In [None]:
# Perform data exploration steps on the data. With this data we eventually plan on predicting the agent ratings
# (plt and sns are provided by the notebook's import cell, so they are not re-imported here)

# Distribution of Delivery Time
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Delivery_Time'], kde=True, ax=ax)
ax.set_title('Distribution of Delivery Time')
ax.set_xlabel('Delivery Time (minutes)')
ax.set_ylabel('Frequency')
plt.show()

# Relationship between Delivery Time and Agent Rating
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Agent_Rating', y='Delivery_Time', data=data, ax=ax)
ax.set_title('Delivery Time vs. Agent Rating')
ax.set_xlabel('Agent Rating')
ax.set_ylabel('Delivery Time (minutes)')
plt.show()

# Impact of Weather on Delivery Time
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Weather', y='Delivery_Time', data=data, ax=ax)
ax.set_title('Impact of Weather on Delivery Time')
ax.set_xlabel('Weather')
ax.set_ylabel('Delivery Time (minutes)')
plt.show()

# Distribution of Agent Ratings
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Agent_Rating', data=data, ax=ax)
ax.set_title('Distribution of Agent Ratings')
ax.set_xlabel('Agent Rating')
ax.set_ylabel('Count')
plt.show()

# Select only numerical columns for correlation.
# 'number' matches every numeric dtype regardless of bit width, whereas
# ['float', 'int'] only matches the default float/int widths and can silently
# drop columns stored as e.g. int32 or float32.
numerical_data = data.select_dtypes(include='number')

# Correlation Heatmap
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = numerical_data.corr()  # pairwise correlation of the numeric columns
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Further Data Exploration and Preprocessing
# (pd, plt and sns are provided by the notebook's import cell)
from sklearn.preprocessing import StandardScaler

MINUTES_PER_DAY = 24 * 60

# 1. Time-based Features:

# Parse the time-of-day columns once and reuse the parsed values below
order_timestamps = pd.to_datetime(data['Order_Time'], format='%H:%M:%S')
pickup_timestamps = pd.to_datetime(data['Pickup_Time'], format='%H:%M:%S')

# Extract hour of the day from Order_Time and Pickup_Time
data['Order_Hour'] = order_timestamps.dt.hour
data['Pickup_Hour'] = pickup_timestamps.dt.hour

# Minutes elapsed between order and pickup. Both values are times of day with no
# date attached, so a pickup just after midnight for an order placed just before
# it would come out as roughly -1430 minutes; the modulo wraps it to the real gap.
data['Order_Pickup_Time_Diff'] = (
    (pickup_timestamps - order_timestamps).dt.total_seconds() / 60
) % MINUTES_PER_DAY

# 2. Categorical Feature Encoding:

# Check if the columns exist before one-hot encoding (they are consumed by
# get_dummies, so on a second run of this cell the encoding is skipped)
if all(col in data.columns for col in ['Weather', 'Traffic', 'Vehicle', 'Area', 'Category']):
    data = pd.get_dummies(data, columns=['Weather', 'Traffic', 'Vehicle', 'Area', 'Category'])
else:
    print("Warning: One or more categorical columns not found. Skipping one-hot encoding.")

# 3. Outlier Detection and Handling:

# Visualize the distribution of Delivery_Time to identify potential outliers.
# The series is passed by keyword because the meaning of the first positional
# argument of sns.boxplot differs between seaborn versions.
plt.figure(figsize=(10, 6))
sns.boxplot(y=data['Delivery_Time'])
plt.title('Boxplot of Delivery Time')
plt.show()

# IQR fences for Delivery_Time. Only the bounds are computed; the row filter
# below is left disabled, so no rows are removed.
Q1 = data['Delivery_Time'].quantile(0.25)
Q3 = data['Delivery_Time'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# data = data[(data['Delivery_Time'] >= lower_bound) & (data['Delivery_Time'] <= upper_bound)]

# 4. Additional Visualizations:
# These run BEFORE feature scaling so that the axes show real units
# (miles, years, rating points) rather than standardized z-scores.

# Explore relationships between other features and Delivery_Time:

# - Scatter plots for numerical features
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Distance_Miles', y='Delivery_Time', data=data)
plt.title('Distance vs Delivery Time')
plt.xlabel('Distance (Miles)')
plt.ylabel('Delivery Time (minutes)')
plt.show()

# - Boxplots or violin plots for categorical features
plt.figure(figsize=(10, 6))
sns.boxplot(x='Delivery_Season', y='Delivery_Time', data=data)
plt.title('Delivery Season vs Delivery Time')
plt.xlabel('Delivery Season')
plt.ylabel('Delivery Time (minutes)')
plt.show()

# - Pair plots to visualize relationships between multiple variables
sns.pairplot(data[['Agent_Age', 'Agent_Rating', 'Distance_Miles', 'Delivery_Time']])
plt.show()

# 5. Feature Scaling:

# Standardize numerical features (Agent_Age, Agent_Rating, Distance_Miles, etc.)
# to ensure they have a similar scale, which can improve model performance.
# NOTE(review): Agent_Rating is the planned prediction target and the scaler is fit
# on the full dataset; consider scaling only the inputs, fit on the training split.
scaler = StandardScaler()
numerical_features = ['Agent_Age', 'Agent_Rating', 'Distance_Miles', 'Order_Hour', 'Pickup_Hour', 'Order_Pickup_Time_Diff']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# 6. Feature Engineering:

# Day of the week and weekend indicator
data['Day_of_Week'] = data['Order_Date'].dt.dayofweek # Monday=0, Sunday=6
# Saturday (5) and Sunday (6) count as weekend; stored as 1/0
data['Is_Weekend'] = (data['Day_of_Week'] >= 5).astype('int64')



