In [1]:
import os

import numpy as np
import pandas as pd

**Load the dataset**

In [4]:
# Relative folder that holds the raw CSV files.
root_path = "../DataSet/"

# Read the order-reviews CSV into a DataFrame.
reviews_csv = f"{root_path}olist_order_reviews_dataset.csv"
order_reviews = pd.read_csv(reviews_csv)

**Inspect the data**

In [5]:
# Summarise the freshly loaded frame: column dtypes / non-null counts first,
# then the number of missing values per column.
print("Initial Order Reviews Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
order_reviews.info()
print("\nMissing Values:")
print(order_reviews.isnull().sum())

Initial Order Reviews Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None

Missing Values:
review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64


**Drop rows with missing critical IDs**

In [6]:
# Discard any row that lacks either identifier column.
id_columns = ['review_id', 'order_id']
order_reviews = order_reviews.dropna(subset=id_columns)

**Fill missing comments**

In [7]:
# Replace missing free-text fields with the placeholder 'No Comment'
# so neither comment column contains nulls afterwards.
for comment_col in ('review_comment_title', 'review_comment_message'):
    order_reviews[comment_col] = order_reviews[comment_col].fillna('No Comment')

**Convert data types**

In [8]:
# Identifier columns are cast to plain strings.
for id_col in ('review_id', 'order_id'):
    order_reviews[id_col] = order_reviews[id_col].astype(str)

# The score is cast to an integer.
order_reviews['review_score'] = order_reviews['review_score'].astype(int)

# Both timestamp columns are parsed to datetimes; errors='coerce' turns any
# value that cannot be parsed into NaT instead of raising.
for ts_col in ('review_creation_date', 'review_answer_timestamp'):
    order_reviews[ts_col] = pd.to_datetime(order_reviews[ts_col], errors='coerce')

**Validate review scores**

In [9]:
# Keep only rows whose score lies in the range 1..5 (both ends included).
score_in_range = order_reviews['review_score'].between(1, 5)
order_reviews = order_reviews.loc[score_in_range]

**Derived feature: calculate response time in days (review answer timestamp − review creation date)**

In [10]:
# Elapsed time between the review being created and it being answered.
response_delta = order_reviews['review_answer_timestamp'] - order_reviews['review_creation_date']

# Store only the days component of that difference as a new column.
order_reviews['response_time_days'] = response_delta.dt.days

**Save the cleaned file**

In [11]:
# Define the destination once so the file that is written and the path that is
# reported can never drift apart.
cleaned_path = './Data_Cleaned/cleaned_olist_order_reviews_dataset.csv'

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so a fresh checkout can run this cell without an OSError.
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)

# Write the cleaned frame without the index column, then confirm the location.
order_reviews.to_csv(cleaned_path, index=False)
print(f"Saved cleaned order reviews dataset as '{cleaned_path}'")

Saved cleaned order reviews dataset as './Data_Cleaned/cleaned_olist_order_reviews_dataset.csv'
