In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import joblib





In [7]:
# Load dataset
df = pd.read_csv("C:\\Users\\LENOVO\\OneDrive\\Desktop\\ad-campaign-analytics\\data\\online_advertising_performance_data.csv")

# Target column
target = 'post_click_conversions'

# Drop missing target values
df = df.dropna(subset=[target])

# Encode categorical variables
le_campaign = LabelEncoder()
le_placement = LabelEncoder()
le_banner = LabelEncoder()

df['campaign_enc'] = le_campaign.fit_transform(df['campaign_number'].astype(str))
df['placement_enc'] = le_placement.fit_transform(df['placement'].astype(str))
df['banner_enc'] = le_banner.fit_transform(df['banner'].astype(str))

# Encode user_engagement
le_engage = LabelEncoder()
df['engagement_enc'] = le_engage.fit_transform(df['user_engagement'].astype(str))

# Define features and target
X = df[['campaign_enc', 'placement_enc', 'banner_enc', 'revenue', 'cost', 'clicks', 'engagement_enc']]

# Encode target variable (in case it's not already numeric or cleanly encoded)
le_target = LabelEncoder()
y = le_target.fit_transform(df[target])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train RandomForest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate model
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Get predicted probabilities
y_prob = clf.predict_proba(X_test)

# ROC-AUC Score for multi-class
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("ROC-AUC Score:", roc_auc)

# Save model
joblib.dump(clf, "post_click_rf_model.joblib")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.94      0.90      1953
           1       0.25      0.17      0.20       217
           2       0.14      0.11      0.12       110
           3       0.19      0.17      0.18        71
           4       0.03      0.02      0.02        50
           5       0.03      0.02      0.02        47
           6       0.12      0.09      0.10        46
           7       0.00      0.00      0.00        30
           8       0.05      0.04      0.04        25
           9       0.00      0.00      0.00        15
          10       0.00      0.00      0.00        22
          11       0.00      0.00      0.00        20
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00        16
          14       0.00      0.00      0.00         8
          15       0.00      0.00      0.00        11
          16       0.12      0.10      0.11        10
   

ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score
import joblib
import warnings
warnings.filterwarnings('ignore')

def load_and_validate_data(file_path):
    """Load and validate the dataset"""
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded successfully. Shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")
        return df
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please check the file path.")
        return None
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

def preprocess_data(df):
    """Preprocess and clean the data"""
    # Define expected columns with fallbacks
    column_mapping = {
        'target': ['post_click_conversions', 'post_click', 'conversion', 'converted'],
        'campaign': ['campaign_number', 'campaign', 'campaign_id'],
        'placement': ['placement', 'placement_id'],
        'banner': ['banner', 'banner_size', 'banner_id'],
        'revenue': ['revenue', 'total_revenue'],
        'cost': ['cost', 'total_cost'],
        'clicks': ['clicks', 'total_clicks'],
        'engagement': ['user_engagement', 'engagement', 'user_enga']
    }
    
    # Find actual column names
    actual_columns = {}
    for key, possible_names in column_mapping.items():
        for name in possible_names:
            if name in df.columns:
                actual_columns[key] = name
                break
        if key not in actual_columns:
            print(f"Warning: No column found for {key}. Available columns: {df.columns.tolist()}")
    
    # Check if target column exists
    if 'target' not in actual_columns:
        print("Error: Target column not found!")
        return None, None, None, None, None
    
    target_col = actual_columns['target']
    
    # Handle missing values in target
    initial_rows = len(df)
    df = df.dropna(subset=[target_col])
    print(f"Dropped {initial_rows - len(df)} rows with missing target values")
    
    # Convert target to binary if needed
    if df[target_col].dtype == 'object':
        df[target_col] = df[target_col].astype(str).str.lower()
        df[target_col] = df[target_col].map({'yes': 1, 'true': 1, '1': 1, 'no': 0, 'false': 0, '0': 0})
    
    # Ensure target is binary
    unique_values = df[target_col].unique()
    if len(unique_values) > 2:
        print(f"Warning: Target has more than 2 unique values: {unique_values}")
        # Convert to binary (1 if > 0, else 0)
        df[target_col] = (df[target_col] > 0).astype(int)
    
    # Encode categorical variables
    encoders = {}
    feature_columns = []
    
    for key in ['campaign', 'placement', 'banner']:
        if key in actual_columns:
            col = actual_columns[key]
            # Handle missing values in categorical columns
            df[col] = df[col].fillna('unknown')
            
            # Create encoder
            encoder = LabelEncoder()
            encoded_col = f"{key}_enc"
            df[encoded_col] = encoder.fit_transform(df[col].astype(str))
            encoders[key] = encoder
            feature_columns.append(encoded_col)
            print(f"Encoded {col} -> {encoded_col} ({len(encoder.classes_)} unique values)")
    
    # Handle numeric features
    numeric_features = []
    for key in ['revenue', 'cost', 'clicks', 'engagement']:
        if key in actual_columns:
            col = actual_columns[key]
            # Convert to numeric and handle missing values
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].fillna(df[col].median())
            numeric_features.append(col)
            feature_columns.append(col)
    
    # Create derived features
    if 'cost' in actual_columns and 'clicks' in actual_columns:
        cost_col = actual_columns['cost']
        clicks_col = actual_columns['clicks']
        # Cost per click (handle division by zero)
        df['cpc'] = df[cost_col] / df[clicks_col].replace(0, np.nan)
        df['cpc'] = df['cpc'].fillna(df['cpc'].median())
        feature_columns.append('cpc')
        print("Added derived feature: cpc (cost per click)")
    
    if 'revenue' in actual_columns and 'cost' in actual_columns:
        revenue_col = actual_columns['revenue']
        cost_col = actual_columns['cost']
        # Return on investment
        df['roi'] = (df[revenue_col] - df[cost_col]) / df[cost_col].replace(0, np.nan)
        df['roi'] = df['roi'].fillna(df['roi'].median())
        feature_columns.append('roi')
        print("Added derived feature: roi (return on investment)")
    
    # Final feature selection
    available_features = [col for col in feature_columns if col in df.columns]
    X = df[available_features]
    y = df[target_col].astype(int)
    
    print(f"Final feature set: {available_features}")
    print(f"Target distribution: {y.value_counts().to_dict()}")
    
    return X, y, encoders, available_features, df

def train_and_evaluate_model(X, y):
    """Train and evaluate the Random Forest model"""
    # Check class distribution
    class_counts = y.value_counts()
    print(f"Class distribution: {class_counts.to_dict()}")
    
    if len(class_counts) < 2:
        print("Error: Only one class present in target variable!")
        return None
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    print(f"Training set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")
    
    # Train model with class balancing
    clf = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight='balanced',  # Handle class imbalance
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2
    )
    
    clf.fit(X_train, y_train)
    
    # Predictions
    y_pred = clf.predict(X_test)
    y_pred_proba = clf.predict_proba(X_test)
    
    # Evaluation metrics
    print("\n" + "="*50)
    print("MODEL EVALUATION RESULTS")
    print("="*50)
    
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
    
    if y_pred_proba.shape[1] > 1:  # Binary classification
        print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba[:, 1]):.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': clf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 Feature Importances:")
    print(feature_importance.head(10))
    
    return clf, feature_importance

def main():
    """Main execution function"""
    # Load data
    df = load_and_validate_data("C:\\Users\\LENOVO\\OneDrive\\Desktop\\ad-campaign-analytics\\data\\online_advertising_performance_data.csv")
    if df is None:
        print("Please ensure your dataset file exists and is accessible.")
        return
    
    # Preprocess data
    X, y, encoders, feature_names, processed_df = preprocess_data(df)
    if X is None:
        print("Data preprocessing failed. Please check your dataset structure.")
        return
    
    # Train and evaluate model
    model = train_and_evaluate_model(X, y)
    if model is None:
        print("Model training failed.")
        return
    
    clf, feature_importance = model
    
    # Save model and encoders
    try:
        joblib.dump({
            'model': clf,
            'encoders': encoders,
            'feature_names': feature_names
        }, "post_click_rf_model.joblib")
        print("\nModel saved successfully as 'post_click_rf_model.joblib'")
        
        # Save feature importance
        feature_importance.to_csv("feature_importance.csv", index=False)
        print("Feature importance saved as 'feature_importance.csv'")
        
    except Exception as e:
        print(f"Error saving model: {e}")

if __name__ == "__main__":
    main()