# Step.3 Feature Engineering:

**3.1 Import necessary libraries**

In [3]:
import pandas as pd
from geopy.distance import geodesic

**3.2 Read the dataset:**  

The dataset here will be the same one cleaned in the previous data cleaning and understanding step, where errors were addressed, missing values were handled, and inconsistencies were resolved.

In [5]:
# Load the dataset file into the working DataFrame used by every cell below
df = pd.read_csv(filepath_or_buffer="datasetfinal2.0.csv")

**3.3 Date Conversion**

In [7]:
# Convert every timestamp column to datetime.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
date_columns = [
    'order_purchase_timestamp', 'order_approved_at',
    'order_delivered_carrier_date', 'order_delivered_customer_date',
    'order_estimated_delivery_date', 'shipping_limit_date',
]
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

**3.4 Addition of features**

**1. Approved Time:**  

Approved time is the difference between the approval date and the purchase date.

In [10]:
# approve_time: days component of (order_approved_at - order_purchase_timestamp)
approval_delay = df['order_approved_at'] - df['order_purchase_timestamp']
df['approve_time'] = approval_delay.dt.days

**2. Seller's shipping time**

Seller's shipping time is the difference between the shipping limit date and the order approval date

In [12]:
# seller_shipping_time: days component of (shipping_limit_date - order_approved_at)
df['seller_shipping_time'] = df['shipping_limit_date'].sub(df['order_approved_at']).dt.days

**3. Logistics Time**

Logistics time is the difference between the order delivery date and the shipping limit date.

In [14]:
# logistics_time: days component of (order_delivered_customer_date - shipping_limit_date)
logistics_delta = df['order_delivered_customer_date'] - df['shipping_limit_date']
df['logistics_time'] = logistics_delta.dt.days

**4. Waiting time**

Waiting time is the difference between the customer delivery date and the purchase date.

In [16]:
# total_waiting_time: days component of (order_delivered_customer_date - order_purchase_timestamp)
df['total_waiting_time'] = (
    df['order_delivered_customer_date']
    .sub(df['order_purchase_timestamp'])
    .dt.days
)

**5. Late or early arrival time**

Late or early arrival time is the difference between the estimated delivery date and the actual delivery date. A positive value (>= 0) indicates an early arrival, while a negative value (< 0) indicates a late arrival.

In [18]:
# late_early_arrival_time: days component of (estimated delivery date - actual delivery date).
# Sign convention: >= 0 means the order arrived early, < 0 means it arrived late.
arrival_margin = df['order_estimated_delivery_date'] - df['order_delivered_customer_date']
df['late_early_arrival_time'] = arrival_margin.dt.days

**6. Estimated delivery time**

Estimated delivery time is the difference between the estimated delivery date and the order approval date.

In [20]:
# estimated_delivery_time: days component of (order_estimated_delivery_date - order_approved_at)
df['estimated_delivery_time'] = (
    df['order_estimated_delivery_date']
    .sub(df['order_approved_at'])
    .dt.days
)

**7.Actual delivery time:**

Actual delivery time is the difference between the customer delivery date and the order approval date.

In [22]:
# actual_delivery_time: days component of (order_delivered_customer_date - order_approved_at)
delivery_delta = df['order_delivered_customer_date'] - df['order_approved_at']
df['actual_delivery_time'] = delivery_delta.dt.days

**8. Delivery accuracy:**

The value is calculated as the ratio of 'actual_delivery_time' to 'estimated_delivery_time'.

In [24]:
# delivery_accuracy = actual_delivery_time / estimated_delivery_time.
# Computed column-wise rather than with a row-by-row apply: same values, no Python-level
# loop, and it also works on an empty frame.
# A zero estimate is masked to NaN before dividing, so those rows yield NaN (not inf).
estimated_days = df['estimated_delivery_time']
df['delivery_accuracy'] = df['actual_delivery_time'] / estimated_days.where(estimated_days != 0)

In [25]:
# Remove the two duration columns that were used to build 'delivery_accuracy'
df = df.drop(columns=['actual_delivery_time', 'estimated_delivery_time'])

**9. Product Seller Count:**

The number of sellers for each unique product

In [27]:
# Number of distinct sellers for each product_id.
# nunique() counts every seller once per product. The previous count() returned the number
# of rows per product, which is a row count and not a seller count.
product_seller_count = (
    df.groupby('product_id')['seller_id']
    .nunique()
    .rename('sellers_count')
    .reset_index()
)
# Inner merge on product_id attaches the per-product value to every matching row
df = pd.merge(df, product_seller_count, on='product_id')

**10. Order Items Count:**

The number of products that are ordered as a part of one single order.

In [29]:
# Per-order item count: number of non-null product_id rows sharing the same order_id
items_per_order = df.groupby('order_id')['product_id'].count()
order_items_count = items_per_order.rename('products_in_order_count').reset_index()
# Inner merge on order_id attaches the count to every row of that order
df = pd.merge(df, order_items_count, on='order_id')

**11. Total Orders:**

The total number of orders per customer and per seller

In [31]:
# Total orders per customer: number of DISTINCT order_id values per customer_unique_id.
# 'nunique' is used instead of 'count' because 'count' tallies rows, so an order that
# occupies several rows of this frame would be counted once per row.
df['total_orders_per_customer'] = df.groupby('customer_unique_id')['order_id'].transform('nunique')
# Total orders per seller: distinct order_id values per seller_id, for the same reason
df['total_orders_per_seller'] = df.groupby('seller_id')['order_id'].transform('nunique')

**12. Purchase day:**

The month in which the product was purchased and whether the purchase was made on a weekend, which helps us check whether the day of purchase affects the review score.

In [33]:
# Purchase day features
purchase_ts = df['order_purchase_timestamp']
# Calendar month of the purchase (1-12)
df['purchase_month'] = purchase_ts.dt.month
# Vectorized weekend flag: dayofweek is Monday=0 ... Sunday=6, so >= 5 means Saturday/Sunday.
# A missing timestamp compares as False and therefore maps to 0, as the per-row lambda did.
df['is_weekend'] = (purchase_ts.dt.dayofweek >= 5).astype('int64')

**13. Order-Freight ratio**

To find out whether customers paying a higher freight value relative to a small order has an impact on the review score.

In [35]:
# order_freight_ratio = freight_value / payment_value.
# A payment_value of 0 would make the division return inf, which dropna() does not remove.
# Masking zero denominators to NaN makes those rows NaN instead, the same treatment the
# delivery_accuracy ratio gives to a zero denominator.
payment_value = df['payment_value']
df['order_freight_ratio'] = df['freight_value'] / payment_value.where(payment_value != 0)

**14. On time Delivery:**

Whether the order was delivered on time or not.

In [37]:
# on_time_delivery: 1 when the customer delivery date is strictly earlier than the
# estimated delivery date, otherwise 0 (a comparison involving NaT is False, hence 0)
delivered_before_estimate = df['order_delivered_customer_date'].lt(df['order_estimated_delivery_date'])
df['on_time_delivery'] = delivered_before_estimate.astype('int')

**15. Geolocation**

Three features: the distance between the customer and the seller, whether they are in the same state, and whether they are in the same city. Proximity might lead to faster delivery, which may affect the review score in a positive way.

In [39]:
# Geolocation features
def _customer_seller_distance_km(row):
    """Return the geopy geodesic distance in kilometers between the row's customer and seller coordinates."""
    customer_point = (row['customer_lat'], row['customer_lng'])
    seller_point = (row['seller_lat'], row['seller_lng'])
    return geodesic(customer_point, seller_point).kilometers


# Distance is computed row by row because geodesic() takes one pair of points at a time
df['distance_km'] = df.apply(_customer_seller_distance_km, axis=1)
# 1 when customer and seller share the same state / city, otherwise 0
df['same_state'] = df['customer_state'].eq(df['seller_state']).astype(int)
df['same_city'] = df['customer_city'].eq(df['seller_city']).astype(int)

## Finalising the features

In [41]:
# Columns that will not be fed to the model: identifiers, raw timestamps and the raw
# location fields that the engineered features above were built from
columns_to_drop = [
    'review_id', 'order_id',
    'order_purchase_timestamp', 'order_approved_at',
    'order_delivered_carrier_date', 'order_delivered_customer_date',
    'order_estimated_delivery_date',
    'customer_unique_id', 'customer_state', 'customer_city',
    'customer_lat', 'customer_lng',
    'product_id', 'seller_id', 'shipping_limit_date',
    'seller_city', 'seller_state', 'seller_lat', 'seller_lng',
]

# Remove those columns, then discard every row that still contains a missing value
df_featuring = df.drop(columns=columns_to_drop).dropna()

In [42]:
# Write the final feature table to CSV (without the index) for the modelling step
df_featuring.to_csv(path_or_buf='Feature.csv', index=False)