In [None]:
!pip install kaggle
!mkdir ~/.kaggle
from google.colab import files
files.upload()  # upload your kaggle.json here
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d olistbr/brazilian-ecommerce -p data --unzip


In [None]:
import pandas as pd
import os

# os.listdir returns entries in arbitrary order; sort so the listing is stable across runs.
# NOTE(review): this name shadows the `google.colab.files` module imported in the setup cell.
files = sorted(os.listdir("data"))
files


In [None]:
import pandas as pd
import os

path = "data"


def _load_table(filename):
    """Read one CSV file from the dataset directory into a DataFrame."""
    return pd.read_csv(f"{path}/{filename}")


# One DataFrame per raw dataset file.
orders = _load_table("olist_orders_dataset.csv")
customers = _load_table("olist_customers_dataset.csv")
reviews = _load_table("olist_order_reviews_dataset.csv")
items = _load_table("olist_order_items_dataset.csv")
payments = _load_table("olist_order_payments_dataset.csv")
products = _load_table("olist_products_dataset.csv")
sellers = _load_table("olist_sellers_dataset.csv")


In [None]:
# A bare tuple of DataFrames is shown as one plain-text repr; display() renders each
# frame as its own rich table.
display(orders.head(), customers.head(), reviews.head())


In [None]:
# Binary label: 1 when the review score is 3 or lower, 0 otherwise.
is_low_score = reviews["review_score"].le(3)
reviews["target"] = is_low_score.astype(int)

# Class balance as fractions of all reviews.
reviews["target"].value_counts(normalize=True)


In [None]:
# Compare the number of distinct join keys on each side of the upcoming merges.
# Collected into one Series because a cell only shows its last expression; the
# original first line was evaluated and then silently discarded.
pd.Series({
    "orders.customer_id": orders["customer_id"].nunique(),
    "customers.customer_id": customers["customer_id"].nunique(),
    "reviews.order_id": reviews["order_id"].nunique(),
    "orders.order_id": orders["order_id"].nunique(),
}, name="n_unique")


In [None]:
# Keep every order whose status is anything other than "canceled".
orders = orders.loc[orders["order_status"].ne("canceled")]


In [None]:
# Merge orders + customers
df = orders.merge(customers, on="customer_id", how="left")

# Add reviews.
# Keep a single label per order before joining: if an order_id appears more than
# once in `reviews`, an unguarded merge would duplicate that order's row, and the
# copies (identical features) could later end up in different train/valid/test splits.
# NOTE(review): keep="last" keeps the last row in file order — confirm that is the
# review you want when an order has several.
reviews_per_order = (
    reviews[["order_id", "target"]]
    .drop_duplicates(subset="order_id", keep="last")
)
df = df.merge(reviews_per_order, on="order_id", how="inner", validate="many_to_one")

# Add payments (one row per order after aggregation)
payments_agg = (
    payments.groupby("order_id")
    .agg({"payment_sequential": "max",
          "payment_type": "first",
          "payment_installments": "max",
          "payment_value": "sum"})
    .reset_index()
)
df = df.merge(payments_agg, on="order_id", how="left", validate="many_to_one")

# Add items (total price, freight, item count)
items_agg = items.groupby("order_id").agg({
    "price": "sum",
    "freight_value": "sum",
    "order_item_id": "count"
}).rename(columns={"order_item_id": "num_items"}).reset_index()

# validate= makes pandas raise if the right-hand side ever stops being unique per order.
df = df.merge(items_agg, on="order_id", how="left", validate="many_to_one")


In [None]:
# Every column whose name mentions "date" or "timestamp" is parsed as a datetime.
date_cols = [name for name in df.columns if ("date" in name) or ("timestamp" in name)]
df = df.assign(**{name: pd.to_datetime(df[name]) for name in date_cols})


In [None]:
# Keep only rows that have a label.
df = df[df["target"].notna()]


In [None]:
# Only the last expression of a cell is shown, so display the first two checks explicitly.
display(df.head())
print("shape:", df.shape)
df["target"].value_counts(normalize=True)


In [None]:
purchased_at = df["order_purchase_timestamp"]

# Whole days from purchase to actual delivery
df["delivery_time"] = (df["order_delivered_customer_date"] - purchased_at).dt.days

# Whole days from purchase to the estimated delivery date
df["estimated_delivery_time"] = (df["order_estimated_delivery_date"] - purchased_at).dt.days

# Positive delay means the delivery took more days than estimated
df["delay"] = df["delivery_time"].sub(df["estimated_delivery_time"])

# 1 when the delay is strictly positive, else 0
df["late_delivery"] = df["delay"].gt(0).astype(int)


In [None]:
# Calendar parts of the purchase timestamp.
purchase_parts = df["order_purchase_timestamp"].dt
df["purchase_month"] = purchase_parts.month
df["purchase_day"] = purchase_parts.day
df["purchase_weekday"] = purchase_parts.weekday


In [None]:
# Store the customer location columns with pandas' categorical dtype.
df = df.astype({"customer_state": "category", "customer_city": "category"})


In [None]:
# Keep rows where both delivery durations could be computed.
has_delivery_info = df[["delivery_time", "estimated_delivery_time"]].notna().all(axis=1)
df = df[has_delivery_info]


In [None]:
# Feature list assembled from thematic groups (the resulting order is significant).
_order_value = ["price", "freight_value", "num_items"]
_payment = ["payment_installments", "payment_value"]
_delivery = ["delivery_time", "estimated_delivery_time", "delay", "late_delivery"]
_calendar = ["purchase_month", "purchase_day", "purchase_weekday"]
_location = ["customer_city", "customer_state"]

features = _order_value + _payment + _delivery + _calendar + _location

target = "target"

# Independent copy restricted to the model inputs plus the label.
df_model = df.loc[:, features + [target]].copy()


In [None]:
# Columns routed to the numeric branch of the preprocessor.
# "late_delivery" is engineered and listed in `features`, but it was missing from both
# column lists, so X (built from num_cols + cat_cols) silently dropped it. It is a 0/1
# flag, so it is handled together with the numeric columns.
num_cols = [
    "price", "freight_value", "num_items",
    "payment_installments", "payment_value",
    "delivery_time", "estimated_delivery_time",
    "delay", "late_delivery",
    "purchase_month", "purchase_day", "purchase_weekday"
]

# Columns routed to the categorical (one-hot) branch of the preprocessor.
cat_cols = ["customer_city", "customer_state"]


In [None]:
from sklearn.model_selection import train_test_split

X, y = df_model[num_cols + cat_cols], df_model[target]

# First cut: 70% train, 30% held out (stratified on the label).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Second cut: split the held-out part evenly into validation and test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)


In [None]:
# Row counts of the train / validation / test splits.
tuple(len(part) for part in (X_train, X_valid, X_test))


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns are standardized; categorical columns are one-hot encoded, with
# categories unseen during fit ignored at transform time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [None]:
from xgboost import XGBClassifier

# Hyperparameters collected in one place so they are easy to scan and tweak.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": 'logloss',
    "random_state": 42,
}

xgb = XGBClassifier(**xgb_params)


In [None]:
from sklearn.pipeline import Pipeline

# Preprocessing followed by the classifier, fitted and applied as one estimator.
pipeline_steps = [("preprocessor", preprocessor), ("classifier", xgb)]
model = Pipeline(pipeline_steps)


In [None]:
# Fit the preprocessing + classifier pipeline on the training split only.
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Class-1 probabilities feed the AUC; hard predictions feed the classification report.
y_proba = model.predict_proba(X_valid)[:, 1]
y_pred = model.predict(X_valid)

validation_auc = roc_auc_score(y_valid, y_proba)
print("AUC:", validation_auc)
print(classification_report(y_valid, y_pred))


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

y_proba = model.predict_proba(X_valid)[:, 1]
fpr, tpr, _ = roc_curve(y_valid, y_proba)

# ROC curve with the diagonal chance line for reference.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], "--")
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
plt.show()

print("AUC:", auc(fpr, tpr))


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Uses the validation probabilities (y_proba) computed in the ROC cell.
precision, recall, _ = precision_recall_curve(y_valid, y_proba)
ap = average_precision_score(y_valid, y_proba)

fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_title("Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
plt.show()

print("Average Precision Score:", ap)


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

y_pred = model.predict(X_valid)
cm = confusion_matrix(y_valid, y_pred)

# Rows are the true classes, columns the predicted classes.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
ax.set_title("Confusion Matrix")
ax.set_ylabel("True")
ax.set_xlabel("Predicted")
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
import joblib


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
import shap


In [None]:
# List everything (including hidden files) in the dataset directory with human-readable sizes.
!ls -lah /content/data


In [None]:
import pandas as pd

# Reload the raw reviews table for the text-model part of the notebook.
# This replaces the earlier `reviews` frame (including the `target` column added to it).
reviews = pd.read_csv("/content/data/olist_order_reviews_dataset.csv")

# Only the last expression of a cell is shown, so display the preview explicitly.
display(reviews.head())
reviews.shape


In [None]:
final_df = reviews.copy()

# Create binary target column: 1 = negative review (score <= 3), 0 = positive review
final_df["target"] = (final_df["review_score"] <= 3).astype(int)

# Only the last expression of a cell is shown, so display the preview explicitly.
display(final_df[["review_score", "target"]].head())
final_df.shape


In [None]:
# Persist the labelled reviews (without the index) for the following cells.
preprocessed_csv = "/content/data/final_preprocessed_reviews.csv"
final_df.to_csv(preprocessed_csv, index=False)
print("✅ File saved successfully!")


In [None]:
# List the dataset directory with human-readable sizes (to check the file written above).
!ls -lh /content/data


In [None]:
import pandas as pd

df = pd.read_csv("/content/data/final_preprocessed_reviews.csv")

# Random 15% subset of the rows (fixed seed), re-indexed from 0.
df_sample = df.sample(frac=0.15, random_state=42)
df_sample = df_sample.reset_index(drop=True)

print(df_sample.shape)
df_sample.head()


In [None]:
from sklearn.model_selection import train_test_split

# Text input (missing comments become empty strings) and binary label.
X = df_sample["review_comment_message"].fillna("")
y = df_sample["target"]

# Stratified 80/20 split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# TF-IDF features over the review text, followed by logistic regression.
# The original passed stop_words="english". The sample reviews used later in this
# notebook are Portuguese, and the built-in English list does not fit such text: it
# removes tokens that are ordinary Portuguese words (e.g. "no", "do") while leaving
# real Portuguese stop words in. No stop-word list is applied here.
# NOTE(review): supply a Portuguese list if stop-word removal is wanted.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train, y_train)

print("✅ Model trained")


In [None]:
import pandas as pd
import numpy as np

def predict_proba_text(x):
    """Return class probabilities from `pipeline` for a batch of review texts.

    Parameters
    ----------
    x : pandas.DataFrame, numpy.ndarray, list or str
        Review texts. A DataFrame is read from its first column; any other input
        is flattened to one dimension (one text per element).

    Returns
    -------
    numpy.ndarray
        Output of ``pipeline.predict_proba`` with one row per input text.
    """
    # The text pipeline must receive a 1-D iterable of strings. Handing it a DataFrame
    # makes the vectorizer iterate over the column labels instead of the rows, which
    # yields a single prediction no matter how many rows were passed.
    if isinstance(x, pd.DataFrame):
        texts = x.iloc[:, 0]
    else:
        texts = np.asarray(x, dtype=object).reshape(-1)

    # Missing values are not valid documents; treat them as empty text.
    texts = pd.Series(texts, dtype=object).fillna("").astype(str)
    return pipeline.predict_proba(texts)


In [None]:
# Container type and shape of each split.
for split in (X_train, X_val):
    print(type(split))

for message, split in [("X_train shape:", X_train), ("X_val shape:", X_val)]:
    print(message, split.shape)


In [None]:
from sklearn.model_selection import train_test_split

# The text pipeline was trained on the binary `target` column (1 = negative review).
# Splitting on `review_score` made y_val hold 1-5 scores, so every later metric
# compared 0/1 predictions against scores; it also left `target` inside X.
target_col = "target"   # change if your target name is different

# `target` is derived from `review_score`, so drop both from the feature frame.
X = final_df.drop(columns=[target_col, "review_score"])
y = final_df[target_col]

# NOTE(review): `pipeline` was fitted on a 15% sample of this same data, so this
# validation split can contain rows the model has already seen — scores computed
# on it may be optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Re-split done. Shapes:")
print(X_train.shape, X_val.shape)


In [None]:
possible_cols = ["review_comment_message", "review_comment_title", "review_text", "text"]

# First candidate (in priority order) that exists in the training frame, else None.
text_col = next((name for name in possible_cols if name in X_train.columns), None)

if text_col is None:
    raise ValueError("❌ Could not find text column — check final_df.columns()")

print("✅ Using text column:", text_col)


In [None]:
# Double brackets keep a one-column DataFrame. Missing comments are replaced with
# empty strings here because these frames reach the text vectorizer in the SHAP
# cell, which runs before the later fillna cell; NaN is not a valid document.
X_train_text = X_train[[text_col]].fillna("")
X_val_text   = X_val[[text_col]].fillna("")


In [None]:
# Confirm both text inputs are DataFrames, then preview contents and shapes.
for frame in (X_train_text, X_val_text):
    print(type(frame))

print(X_train_text.head())
print(X_train_text.shape, X_val_text.shape)


In [None]:
import shap
import numpy as np

# SHAP sampling
# Seeded generator: the unseeded np.random.choice picked different rows on every run,
# so the saved SHAP values and plots were not reproducible.
rng = np.random.default_rng(42)

# The vectorizer cannot process NaN, so make sure the texts are strings.
X_val_clean = X_val_text.fillna("")
sample_size = int(len(X_val_clean) * 0.10)  # 10% text for SHAP
idx = rng.choice(len(X_val_clean), size=sample_size, replace=False)

X_shap = X_val_clean.iloc[idx]

# background sample (capped at the number of available rows so .sample cannot fail)
X_train_clean = X_train_text.fillna("")
X_bg = X_train_clean.sample(min(300, len(X_train_clean)), random_state=42)  # small background set

# SHAP kernel explainer
explainer = shap.KernelExplainer(predict_proba_text, X_bg)
shap_values = explainer.shap_values(X_shap, nsamples=100)

print("✅ SHAP values computed successfully!")


In [None]:
# Extract the SHAP values for class 1.
# KernelExplainer returns one value per input feature and no bias column (the baseline
# lives in explainer.expected_value). The previous `[:, :-1]` slice therefore removed
# the only feature column and left an empty array.
# Depending on the shap version the multi-class result is either a list with one array
# per class, or a single array whose last axis indexes the classes.
if isinstance(shap_values, list):
    sv = np.asarray(shap_values[1])
else:
    sv = np.asarray(shap_values)[..., 1]

shap.summary_plot(sv, X_shap, feature_names=[text_col])


In [None]:
# Force plot for one explained row of X_shap
i = 0  # position of the row to plot (0 = first sampled row)

shap.force_plot(
    explainer.expected_value[1],   # explainer baseline at index 1 (class 1)
    sv[i],                         # SHAP values for that row
    X_shap.iloc[i],                # feature value(s) of that row
    matplotlib=True
)


In [None]:
import joblib

# Serialize the fitted text pipeline to disk.
model_path = "final_review_model.pkl"
joblib.dump(pipeline, model_path)
print("✅ Model saved as final_review_model.pkl")


In [None]:
# Load saved model
import joblib
loaded_model = joblib.load("final_review_model.pkl")

# Inference function
def predict_sentiment(text):
    """Return the predicted class for a single review text.

    The model was trained on `target`, where 1 marks a negative review
    (review_score <= 3) and 0 a positive one.
    """
    return loaded_model.predict([text])[0]

# Test on sample reviews
test_reviews = [
    "The product was excellent and delivery was fast",
    "Terrible quality, I want a refund!",
    "Average experience, nothing special"
]

for review in test_reviews:
    print(f"Review: {review}")
    # Class 1 is the NEGATIVE class; the previous mapping printed it as "Positive".
    print("Prediction:", "Negative ❌" if predict_sentiment(review)==1 else "Positive ✅")
    print("-----")


In [None]:
import numpy as np
# Write the computed SHAP values to shap_values.npy in the working directory.
np.save("shap_values.npy", shap_values)
print("✅ SHAP values saved")


In [None]:
# ✅ Replace NaN values with empty string so every validation text is a string
X_val_text = X_val_text.fillna("")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on the cleaned review text column, passed as a 1-D Series of strings.
val_texts = X_val_text["review_comment_message"]
y_pred = pipeline.predict(val_texts)

print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))

# Kept in `cm` for the heatmap cells that follow.
cm = confusion_matrix(y_val, y_pred)
cm


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix computed in the evaluation cell.
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()


In [None]:
import joblib
# Serialize the fitted text pipeline under the classifier file name.
classifier_path = "kdd_review_classifier.pkl"
joblib.dump(pipeline, classifier_path)
print("✅ Model saved for deployment!")


In [None]:
test_reviews = [
    "Produto chegou antes do prazo e tudo certo", # positive
    "Péssima qualidade, chegou quebrado",        # negative
]

test_df = pd.DataFrame(test_reviews, columns=["review_comment_message"])

# Pass the text COLUMN (a 1-D Series), not the DataFrame: the text vectorizer iterates
# over whatever it receives, and iterating a DataFrame yields its column labels, so the
# old call produced one prediction for the string "review_comment_message".
preds = pipeline.predict(test_df["review_comment_message"])

# Predicted class: 1 = negative review, 0 = positive review.
for txt, pred in zip(test_reviews, preds):
    print(f"Review: {txt} → Prediction: {pred}")


In [None]:
import joblib

# Serialize the fitted text pipeline; this file is loaded again in later cells.
review_model_path = "kdd_review_model.pkl"
joblib.dump(pipeline, review_model_path)
print("✅ Model saved!")


In [None]:
# Round-trip check: reload the saved pipeline and predict one review.
model_loaded = joblib.load("kdd_review_model.pkl")

# The pipeline expects a 1-D iterable of strings. A one-column DataFrame would be
# iterated by column label, so the model would score the column name instead of the text.
model_loaded.predict(["Produto ótimo, entrega rápida"])


In [None]:
sample_reviews = [
    "Produto excelente, recomendo",
    "Horrível, veio quebrado",
    "Entrega rápida, mas qualidade ruim",
    "Muito bom, chegou antes do prazo"
]

df_test = pd.DataFrame(sample_reviews, columns=["review_comment_message"])

# Predict on the text column (1-D), not on the DataFrame: iterating a DataFrame yields
# column labels, which gave a single prediction and left zip() printing only one review.
y_pred = pipeline.predict(df_test["review_comment_message"])

# Predicted class: 1 = negative review, 0 = positive review.
for txt, pred in zip(sample_reviews, y_pred):
    print(f"{txt} => {pred}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix held in `cm`.
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", ax=ax)
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# ✅ Final inference function for deployment simulation
import joblib
import pandas as pd

# Load saved pipeline
pipeline = joblib.load("kdd_review_model.pkl")

def predict_review(text):
    """Classify one review text and return a human-readable label.

    Parameters
    ----------
    text : str
        The review comment to classify.

    Returns
    -------
    str
        "Negative" when the model predicts class 1 (the training target marks
        review_score <= 3 as 1), otherwise "Positive".
    """
    # Pass a list of strings: wrapping the text in a DataFrame makes the vectorizer
    # iterate over column labels, so the model scored the column name, not the text.
    pred = pipeline.predict([text])[0]
    # Class 1 is the negative class; the previous mapping was inverted.
    return "Negative" if pred == 1 else "Positive"

# Test on examples
samples = [
    "Produto excelente, recomendo!",
    "Muito ruim, chegou quebrado e atrasado",
    "Entrega rápida, mas qualidade baixa",
    "Ótimo, compraria novamente!"
]

for s in samples:
    print(f"Review: {s} → Prediction: {predict_review(s)}")


In [None]:
# Nothing is written here: the pipeline and SHAP values were saved in earlier cells;
# this cell only prints a status message.
print("✅ Model + explainability artifacts ready for deployment")


In [None]:
# Export the labelled reviews (without the index) to the working directory.
export_path = "processed_reviews_kdd.csv"
final_df.to_csv(export_path, index=False)
print("✅ Processed data exported")
