# In [None]:
import pandas as pd

# 1. Load the dataset
df = pd.read_csv("/content/data.csv")

# 2. Remove leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# 3. Drop duplicated columns (keep the first occurrence).
#    .copy() gives an independent frame, so the per-column assignments in
#    step 6 never write through a selection of the raw frame.
df = df.loc[:, ~df.columns.duplicated()].copy()

# 4. Drop 'Timestamp' if it exists
df = df.drop(columns=["Timestamp"], errors="ignore")

# 5. Display missing values
# print("Missing Values Before Cleaning:")
# print(df.isnull().sum())

# 6. Clean columns individually
for col in df.columns:
    series = df[col]

    # Branch on "is this numeric?" rather than "is this dtype object?":
    # text columns stored with a string extension dtype are not `object`,
    # and coercing them to numbers would turn every value into NaN.
    if pd.api.types.is_numeric_dtype(series):
        # Convert to numeric safely, then fill NaNs with the median
        numeric = pd.to_numeric(series, errors="coerce")
        df[col] = numeric.fillna(numeric.median())
        continue

    # Categorical / text columns: impute with the most frequent value.
    modes = series.mode()
    if modes.empty:
        # No observed value to impute from (column is entirely missing);
        # leave it untouched instead of failing on modes[0].
        continue

    # Fill NaNs with the mode and normalise to stripped, lower-case strings
    df[col] = series.fillna(modes.iloc[0]).astype(str).str.strip().str.lower()

# 7. Verify missing values are handled
# print("\nMissing Values After Cleaning:")
# print(df.isnull().sum())

# 8. Dataset overview after cleaning
# print("\nDataset Overview:")
# print(df.info())

# # 9. Display first few rows
# print("\nFirst few rows:")
# print(df.head())

from sklearn.preprocessing import StandardScaler

# ------------------------------------------
# (1) Binary target: 1 = cart abandoned, 0 = cart completed
# ------------------------------------------
# Labels are grouped by the class they map to; the resulting dict has the
# same keys and values as spelling each pair out by hand.
abandoned_labels = ("never", "rarely", "sometimes")
completed_labels = ("often", "always")
cart_completion_map = {
    **dict.fromkeys(abandoned_labels, 1),
    **dict.fromkeys(completed_labels, 0),
}

# Labels absent from the mapping come out of Series.map as NaN.
completion_frequency = df["Cart_Completion_Frequency"]
df["Cart_Abandoned"] = completion_frequency.map(cart_completion_map)

# ------------------------------------------
# (2) One-hot encode the nominal features
# ------------------------------------------
nominal_cols = [
    "Gender",
    "Product_Search_Method",
    "Browsing_Frequency",
    "Purchase_Frequency",
    "Search_Result_Exploration",
    "Add_to_Cart_Browsing",
    "Saveforlater_Frequency",
    "Review_Left",
    "Review_Helpfulness",
    "Recommendation_Helpfulness",
    "Service_Appreciation",
]

# Only the listed columns are expanded; every other column of df is carried
# over unchanged. drop_first=True omits one indicator per feature.
df_encoded = pd.get_dummies(data=df, columns=nominal_cols, drop_first=True)

# ------------------------------------------
# (3) Standardise the numeric features
# ------------------------------------------
numeric_cols = [
    "age",
    "Customer_Reviews_Importance",
    "Rating_Accuracy",
    "Shopping_Satisfaction",
]

# The scaler is fitted on every row of df_encoded and kept in `scaler`.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded[numeric_cols] = scaled_numeric

# Preview
# print("Processed Data:\n", df_encoded.head())
# print("\nColumns:", df_encoded.columns)

import pandas as pd
from sklearn.model_selection import train_test_split

# Map categories to binary (1: Abandoned, 0: Completed)
completion_mapping = {
    "never": 1,
    "rarely": 1,
    "sometimes": 1,
    "often": 0,
    "always": 0
}

df["Cart_Abandoned"] = df["Cart_Completion_Frequency"].map(completion_mapping)

# Series.map yields NaN for any label missing from the mapping. A NaN target
# would otherwise flow into the stratified split and the model fit, so fail
# here with a message that names the offending labels.
unmapped_rows = df["Cart_Abandoned"].isna()
if unmapped_rows.any():
    unknown_labels = sorted(
        df.loc[unmapped_rows, "Cart_Completion_Frequency"].astype(str).unique()
    )
    raise ValueError(
        f"Unmapped Cart_Completion_Frequency values: {unknown_labels}"
    )

# With every row mapped, the target is a clean integer label column.
df["Cart_Abandoned"] = df["Cart_Abandoned"].astype("int64")

# Verify mapping
print(df["Cart_Abandoned"].value_counts())

# Prepare features (X) and target (y).
# The source column of the target is dropped so it cannot leak into X; the
# two remaining dropped columns are excluded from the feature set as well.
X = df.drop(columns=["Cart_Completion_Frequency", "Cart_Abandoned", "Cart_Abandonment_Factors", "Improvement_Areas"])
y = df["Cart_Abandoned"]

# Perform train-test split (80/20, class proportions preserved via stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

# Step 1: Encoding categorical features using One-Hot encoding
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
X_train_encoded = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)

# Encode the test split with ALL levels kept. drop_first=True here would drop
# whichever level sorts first *within the test split*; if that differs from
# the level dropped for train, the reindex below would zero-fill it and those
# rows would be read as the train baseline category.
X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=False)

# Align the test set to the training columns: this removes the train baseline
# indicators and any levels unseen in training, and adds (as 0) indicators
# for levels that do not occur in the test split.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Step 2: Scaling numeric features (fit on train only, then applied to test)
scaler = StandardScaler()
numeric_cols = X_train_encoded.select_dtypes(include=['int64', 'float64']).columns

# Skip scaling when there is nothing to scale; the scaler rejects an input
# with zero feature columns.
if len(numeric_cols) > 0:
    X_train_encoded[numeric_cols] = scaler.fit_transform(X_train_encoded[numeric_cols])
    X_test_encoded[numeric_cols] = scaler.transform(X_test_encoded[numeric_cols])

# Step 3: Training the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_encoded, y_train)

# Step 4: Making predictions
y_pred = model.predict(X_test_encoded)

# Step 5: Model evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

from sklearn.model_selection import GridSearchCV

# Hyperparameter search space for the random forest.
# 3 * 3 * 3 * 3 * 2 = 162 candidate settings, each cross-validated below.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training data; n_jobs=-1 uses all available cores and verbose=2 logs each
# fit. fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train_encoded, y_train)

# Report the winning configuration
print("Best Hyperparameters:", grid_search.best_params_)

# Score the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
y_pred_optimized = best_model.predict(X_test_encoded)

optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_report = classification_report(y_test, y_pred_optimized)

print("\nOptimized Model Accuracy:", optimized_accuracy)
print("\nOptimized Classification Report:\n", optimized_report)



