In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Load your dataset
aisles = pd.read_csv('/content/aisles.csv')
departments = pd.read_csv('/content/departments.csv')
order_products_prior = pd.read_csv('/content/order_products__prior.csv')
order_products_train = pd.read_csv('/content/order_products__train.csv')
orders = pd.read_csv('/content/orders.csv')
products = pd.read_csv('/content/products.csv')


In [4]:
# Merge relevant dataframes (example: order history, product info)
order_history = pd.merge(order_products_prior, orders, on='order_id', how='inner')
order_history = pd.merge(order_history, products, on='product_id', how='inner')



In [5]:
# Define the problem statement: Predict whether a product will be reordered in the next order
order_history['reordered_previous'] = order_history.groupby(['user_id', 'product_id'])['reordered'].shift().fillna(0).astype(int)


In [6]:
# Feature Engineering
# User-specific features
user_features = order_history.groupby('user_id').agg({
    'order_id': 'nunique',  # Number of orders per user
    'product_id': 'nunique',  # Number of unique products ordered by the user
    'reordered': 'mean',  # Reorder rate for the user
}).reset_index()
user_features.rename(columns={
    'order_id': 'user_orders',
    'product_id': 'user_unique_products',
    'reordered': 'user_reorder_rate'
}, inplace=True)

In [7]:
# Product-specific features
product_features = order_history.groupby('product_id').agg({
    'reordered_previous': 'mean',  # Reorder rate for the product in previous orders
    'add_to_cart_order': 'mean',  # Average position in cart for the product
}).reset_index()
product_features.rename(columns={
    'reordered_previous': 'product_reorder_rate',
    'add_to_cart_order': 'product_avg_cart_position'
}, inplace=True)


In [8]:
# Merge user and product features back to the dataset
order_history = pd.merge(order_history, user_features, on='user_id', how='left')
order_history = pd.merge(order_history, product_features, on='product_id', how='left')


In [9]:
# Features and target variable
X = order_history[['user_orders', 'user_unique_products', 'user_reorder_rate',
                   'product_reorder_rate', 'product_avg_cart_position']]
y = order_history['reordered']


In [10]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost classifier
model = XGBClassifier()








In [11]:
# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)


In [12]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 69.06%


NEW

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

# Load data
aisles = pd.read_csv('/content/aisles.csv')
departments = pd.read_csv('/content/departments.csv')
order_products_prior = pd.read_csv('/content/order_products__prior.csv')
order_products_train = pd.read_csv('/content/order_products__train.csv')
orders = pd.read_csv('/content/orders.csv')
products = pd.read_csv('/content/products.csv')

 # Merge relevant dataframes
order_history = pd.merge(order_products_prior, orders, on='order_id', how='inner')
order_history = pd.merge(order_history, products, on='product_id', how='inner')

# Define problem statement: Predict whether a product will be reordered in the next order
order_history['reordered_previous'] = order_history.groupby(['user_id', 'product_id'])['reordered'].shift().fillna(0).astype(int)

# Feature Engineering
# User-specific features
user_features = order_history.groupby('user_id').agg({
    'order_id': 'nunique',  # Number of orders per user
    'product_id': 'nunique',  # Number of unique products ordered by the user
    'reordered': 'mean',  # Reorder rate for the user
}).reset_index()
user_features.rename(columns={
    'order_id': 'user_orders',
    'product_id': 'user_unique_products',
    'reordered': 'user_reorder_rate'
}, inplace=True)

# Product-specific features
product_features = order_history.groupby('product_id').agg({
    'reordered_previous': 'mean',  # Reorder rate for the product in previous orders
    'add_to_cart_order': 'mean',  # Average position in cart for the product
}).reset_index()
product_features.rename(columns={
    'reordered_previous': 'product_reorder_rate',
    'add_to_cart_order': 'product_avg_cart_position'
}, inplace=True)

# Merge user and product features back to the dataset
order_history = pd.merge(order_history, user_features, on='user_id', how='left')
order_history = pd.merge(order_history, product_features, on='product_id', how='left')

# Features and target variable
X = order_history[['user_orders', 'user_unique_products', 'user_reorder_rate',
                   'product_reorder_rate', 'product_avg_cart_position']]
y = order_history['reordered']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize XGBoost and Random Forest classifiers
xgb_model = XGBClassifier()
rf_model = RandomForestClassifier()

# Hyperparameter tuning using GridSearchCV for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1],
}

xgb_grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, scoring='accuracy', cv=5, verbose=1)
xgb_grid_search.fit(X_train_resampled, y_train_resampled)

best_xgb_model = xgb_grid_search.best_estimator_

# Hyperparameter tuning using GridSearchCV for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4, 5],
}

rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, scoring='accuracy', cv=5, verbose=1)
rf_grid_search.fit(X_train_resampled, y_train_resampled)

best_rf_model = rf_grid_search.best_estimator_

# Make predictions on the test set using both models
y_pred_xgb = best_xgb_model.predict(X_test)
y_pred_rf = best_rf_model.predict(X_test)

# Calculate and compare accuracy, precision, recall, F1-score, and ROC-AUC for both models
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    return accuracy, precision, recall, f1, roc_auc

accuracy_xgb, precision_xgb, recall_xgb, f1_xgb, roc_auc_xgb = evaluate_model(y_test, y_pred_xgb)
accuracy_rf, precision_rf, recall_rf, f1_rf, roc_auc_rf = evaluate_model(y_test, y_pred_rf)

print("XGBoost Metrics:")
print(f"Accuracy: {accuracy_xgb * 100:.2f}%")
print(f"Precision: {precision_xgb:.2f}")
print(f"Recall: {recall_xgb:.2f}")
print(f"F1 Score: {f1_xgb:.2f}")
print(f"ROC-AUC Score: {roc_auc_xgb:.2f}")
print("\nRandom Forest Metrics:")
print(f"Accuracy: {accuracy_rf * 100:.2f}%")
print(f"Precision: {precision_rf:.2f}")
print(f"Recall: {recall_rf:.2f}")
print(f"F1 Score: {f1_rf:.2f}")
print(f"ROC-AUC Score: {roc_auc_rf:.2f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
XGBoost Metrics:
Accuracy: 70.94%
Precision: 0.78
Recall: 0.72
F1 Score: 0.75
ROC-AUC Score: 0.71

Random Forest Metrics:
Accuracy: 71.70%
Precision: 0.78
Recall: 0.74
F1 Score: 0.76
ROC-AUC Score: 0.71
