In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings('ignore')

# Read the raw CSV; header names are stripped of surrounding whitespace so
# later column lookups by name are not defeated by stray spaces.
df = pd.read_csv("data.csv").rename(columns=str.strip)

print(f"Initial dataset shape: {df.shape}")
print(f"Column names: {df.columns.tolist()}")

# 1. Count fully duplicated rows and drop them when any are present
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")
if duplicate_count:
    df = df.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df.shape}")

# 2. Report missing values as they appear in the raw (de-duplicated) data
missing_counts = df.isnull().sum()
print(f"\nMissing values per column before any processing:")
for col, count in missing_counts.items():
    if count > 0:
        print(f"  {col}: {count}")
if missing_counts.sum() == 0:
    print("  No missing values found!")

# 3. Fill missing values (if any exist)

# Column roles used for imputation here and by later steps of the script
numeric_cols = ['recency', 'history']
categorical_cols = ['zip_code', 'channel', 'offer']

# Numeric columns: coerce to a numeric dtype FIRST, then impute.
# Coercion turns unparseable entries into NaN; doing it before the fill means
# those NaNs are imputed as well, and the median is computed on numeric data.
for col in numeric_cols:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())
        print(f"Filled missing values in {col} with median")

# Categorical columns: impute with the most frequent value
for col in categorical_cols:
    if col in df.columns and df[col].isnull().any():
        modes = df[col].mode()
        if modes.empty:
            # Column has no observed values at all, so there is no mode to use
            print(f"Could not fill missing values in {col}: column has no observed values")
            continue
        df[col] = df[col].fillna(modes.iloc[0])
        print(f"Filled missing values in {col} with mode")

# Report whatever is still missing after imputation
print(f"\nMissing values per column after filling:")
missing_after_fill = df.isnull().sum()
for col, count in missing_after_fill.items():
    if count > 0:
        print(f"  {col}: {count}")
if missing_after_fill.sum() == 0:
    print("  No missing values!")

# 4. Create binary treatment variable
# A row is untreated (0) only when its offer text, trimmed and lower-cased,
# equals 'no offer'; every other value counts as treated (1).
if 'offer' in df.columns:
    offer_text = df['offer'].astype(str).str.strip().str.lower()
    df['treatment'] = (offer_text != 'no offer').astype('int64')

    control_rows = (df['treatment'] == 0).sum()
    treated_rows = (df['treatment'] == 1).sum()
    print(f"\nTreatment variable created:")
    print(f"  Treatment=0 (No offer): {control_rows}")
    print(f"  Treatment=1 (Offer): {treated_rows}")


# 5. One-hot encode multi-category variables
# Only the candidate columns actually present in the data are encoded.
multi_cat_cols = [col for col in ('zip_code', 'channel') if col in df.columns]

if not multi_cat_cols:
    # Nothing to encode: the modelling frame is just a copy of the cleaned data
    df_final = df.copy()
    print("\nNo multi-category columns found for one-hot encoding")
else:
    print(f"\nOne-hot encoding columns: {multi_cat_cols}")

    # Give df a fresh 0..n-1 index so the encoded frame built below lines up
    # row-for-row when the two are concatenated.
    df = df.reset_index(drop=True)

    # drop='first' leaves out one indicator per encoded column
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_array = encoder.fit_transform(df[multi_cat_cols])

    encoded_df = pd.DataFrame(
        encoded_array,
        index=df.index,
        columns=encoder.get_feature_names_out(multi_cat_cols),
    )

    print(f"  Encoded features shape: {encoded_df.shape}")
    print(f"  Encoded feature names: {encoded_df.columns.tolist()}")

    # Swap the raw categorical columns for their indicator columns
    without_raw_categories = df.drop(columns=multi_cat_cols)
    df_final = pd.concat([without_raw_categories, encoded_df], axis=1)

    print(f"  Final dataset shape after encoding: {df_final.shape}")

# Re-check for missing values now that the encoded columns have been joined in
missing_after_encoding = df_final.isnull().sum()
print(f"\nMissing values per column after one-hot encoding:")

# Restrict the report to columns that actually contain gaps
columns_with_gaps = missing_after_encoding[missing_after_encoding > 0]
for col, count in columns_with_gaps.items():
    print(f"  {col}: {count}")
total_missing = columns_with_gaps.sum()

if total_missing:
    print(f"\nTotal missing values: {total_missing}")

    # Remove incomplete rows and report how many were lost
    initial_rows = len(df_final)
    df_final = df_final.dropna()
    rows_dropped = initial_rows - len(df_final)
    print(f"Dropped {rows_dropped} rows with missing values after one-hot encoding.")
else:
    print("  No missing values after one-hot encoding!")

print(f"\nFinal dataset shape: {df_final.shape}")

# Logistic Regression Assumption Checks
print("\n" + "="*50)
print("LOGISTIC REGRESSION ASSUMPTION CHECKS")
print("="*50)

# Prepare X and y
y = df_final['conversion']  # Outcome variable

# Predictors: everything except the outcome and the raw 'offer' text.
# errors='ignore' keeps this step working when 'offer' is absent, matching the
# earlier treatment step, which also treats that column as optional.
X = df_final.drop(columns=['conversion', 'offer'], errors='ignore')
X_const = sm.add_constant(X)  # Add intercept column


# 1. Multicollinearity (VIF)
#
# The VIF of every design-matrix column is computed with the intercept included
# in the matrix. The intercept's own value is printed for reference but is not
# graded against the threshold, since it is not a predictor.

print(f"\n1. MULTICOLLINEARITY CHECK (VIF)")
print("-" * 40)

try:
    # Build the float matrix once instead of once per column; the explicit
    # dtype also covers integer/boolean predictor columns.
    design_matrix = X_const.to_numpy(dtype=float)

    vif_data = pd.DataFrame()
    vif_data["feature"] = X_const.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design_matrix, i)
        for i in range(design_matrix.shape[1])
    ]

    print("Variance Inflation Factors (VIF):")
    for feature, vif in zip(vif_data["feature"], vif_data["VIF"]):
        if feature == 'const':
            print(f"  {feature}: {vif:.2f} (intercept - not assessed)")
            continue
        status = "⚠ High" if vif > 10 else "✅ OK"
        print(f"  {feature}: {vif:.2f} {status}")

except Exception as e:
    # Diagnostic step only: report the problem and let the script continue
    print(f"Error calculating VIF: {e}")


# 2. Linearity of logit for numeric predictors (Box-Tidwell)
#
# Each numeric predictor x gets a companion term x*ln(x); a small p-value on
# that term is reported as evidence of non-linearity in the logit.
# NOTE: the test model below contains only the numeric predictors and their
# x*ln(x) terms; the remaining columns of X are not included.

print(f"\n2. LINEARITY CHECK (Box-Tidwell)")
print("-" * 40)

# Only test numeric columns that actually exist
existing_numeric_cols = [col for col in numeric_cols if col in X.columns]

if existing_numeric_cols:
    try:
        df_bt = df_final.copy()
        log_terms = []
        term_source = {}  # x*ln(x) column name -> predictor it was built from

        for col in existing_numeric_cols:
            col_log = col + '_log'

            # ln(x) needs x > 0. Shift only when the column reaches zero or
            # below, and use the same shifted value in both factors so the
            # term really is x*ln(x).
            col_min = df_bt[col].min()
            shift = 1 - col_min if col_min <= 0 else 0
            positive_x = df_bt[col] + shift

            df_bt[col_log] = positive_x * np.log(positive_x)
            log_terms.append(col_log)
            term_source[col_log] = col

        # Create feature matrix for Box-Tidwell test
        bt_features = existing_numeric_cols + log_terms
        X_bt = sm.add_constant(df_bt[bt_features])

        bt_test_model = sm.Logit(y, X_bt)
        bt_result = bt_test_model.fit(disp=False)

        print("Box-Tidwell test p-values (linearity check):")
        for feature, pval in bt_result.pvalues.items():
            if feature in term_source:
                # Look the predictor up rather than editing the string, so
                # names that contain '_log' elsewhere are reported intact.
                original_col = term_source[feature]
                status = "⚠ Non-linear" if pval < 0.05 else "✅ Linear"
                print(f"  {original_col}: p={pval:.4f} {status}")

    except Exception as e:
        # Diagnostic step only: report the problem and let the script continue
        print(f"Error in Box-Tidwell test: {e}")
else:
    print("No numeric predictors found for linearity testing.")

# 3. Large enough sample size (EPV rule)
#
# EPV is computed from the RARER outcome class, so the check stays meaningful
# when conversions are the majority. Assumes y is coded 0/1.

print(f"\n3. SAMPLE SIZE CHECK (EPV Rule)")
print("-" * 40)

num_events = y.sum()  # number of 1's (conversions)
num_non_events = len(y) - num_events  # number of 0's
limiting_events = min(num_events, num_non_events)
num_predictors = X_const.shape[1] - 1  # exclude intercept

print(f"Number of events (conversions): {num_events}")
print(f"Number of non-events: {num_non_events}")
print(f"Number of predictors: {num_predictors}")

if num_predictors > 0:
    epv = limiting_events / num_predictors
    print(f"EPV (Events Per Variable): {epv:.2f}")

    if epv < 10:
        print("⚠ Warning: EPV < 10 → Model may be overfitted.")
    else:
        print("✅ EPV rule satisfied.")
else:
    # With no predictors the ratio is undefined; do not report a false warning
    epv = 0
    print("EPV (Events Per Variable): not applicable (no predictors)")

# Persist the cleaned data and the model-ready matrices, then print a summary
section_rule = "=" * 50
print("\n" + section_rule)
print("SAVING OUTPUTS")
print(section_rule)

try:
    # Full preprocessed frame (outcome, treatment and encoded features)
    df_final.to_csv("preprocessed_dataset.csv", index=False)
    print("✅ Preprocessed dataset saved as 'preprocessed_dataset.csv'")

    has_outcome = 'conversion' in df_final.columns
    if has_outcome:
        # Design matrix (with the intercept column) and the outcome vector
        X_const.to_csv("X_ready.csv", index=False)
        y.to_csv("y_ready.csv", index=False, header=['conversion'])
        print("✅ Feature matrix (X) saved as 'X_ready.csv'")
        print("✅ Target variable (y) saved as 'y_ready.csv'")

except Exception as e:
    # Saving is best-effort: report the failure and still print the summary
    print(f"Error saving files: {e}")

final_shape = df_final.shape
final_columns = list(df_final.columns)
print(f"\n✅ Preprocessing and assumption checks complete!")
print(f"Final dataset summary:")
print(f"  Shape: {final_shape}")
print(f"  Columns: {final_columns}")

Initial dataset shape: (64000, 9)
Column names: ['recency', 'history', 'used_discount', 'used_bogo', 'zip_code', 'is_referral', 'channel', 'offer', 'conversion']

Number of duplicate rows: 6603
Dataset shape after removing duplicates: (57397, 9)

Missing values per column before any processing:
  No missing values found!

Missing values per column after filling:
  No missing values!

Treatment variable created:
  Treatment=0 (No offer): 19072
  Treatment=1 (Offer): 38325

One-hot encoding columns: ['zip_code', 'channel']
  Encoded features shape: (57397, 4)
  Encoded feature names: ['zip_code_Surburban', 'zip_code_Urban', 'channel_Phone', 'channel_Web']
  Final dataset shape after encoding: (57397, 12)

Missing values per column after one-hot encoding:
  No missing values after one-hot encoding!

Final dataset shape: (57397, 12)

LOGISTIC REGRESSION ASSUMPTION CHECKS

1. MULTICOLLINEARITY CHECK (VIF)
----------------------------------------
Variance Inflation Factors (VIF):
  const: 36