In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import lightgbm as lgb
import xgboost as xgb

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

class ChurnPredictionModel:
    """Churn-prediction pipeline built on the datathon CSV tables.

    The pipeline loads five CSV files, derives per-user (``msno``) features
    from the members, transactions and user-log tables, trains five
    classifiers, blends them with AUC-proportional weights and writes
    ``submission.csv``.

    Typical use is ``ChurnPredictionModel().run_complete_pipeline()``.
    """

    # Directory the CSV files are read from when no ``data_dir`` is given.
    DEFAULT_DATA_DIR = '/kaggle/input/customer-retention-datathon-apac-edition'
    # Date against which "days since registration" / "days to expire" are measured.
    REFERENCE_DATE = pd.Timestamp('2017-03-31')
    # Columns that are label-encoded before modelling.
    CATEGORICAL_COLS = ['city', 'age_group', 'gender', 'registered_via']

    def __init__(self, data_dir=None):
        """Create an empty pipeline.

        Args:
            data_dir: Directory containing the input CSV files. Defaults to
                ``DEFAULT_DATA_DIR``.
        """
        self.data_dir = self.DEFAULT_DATA_DIR if data_dir is None else data_dir
        self.models = {}
        self.feature_importance = {}
        # NOTE(review): the scaler is created here but never applied anywhere
        # in this class; logistic regression is fit on unscaled features.
        self.scaler = StandardScaler()
        self.label_encoders = {}
        # Per-column most frequent training category, used for unseen values.
        self.category_fallbacks = {}
        # Per-column medians learned on the first prepared frame.
        self.numeric_fill_values = None

    def _read_table(self, file_name):
        """Read one CSV file from ``self.data_dir``."""
        return pd.read_csv(f"{self.data_dir}/{file_name}")

    def load_data(self):
        """Load the train, members, transactions, user-log and test tables.

        The frames are stored on the instance as ``train_data``, ``members``,
        ``transactions``, ``user_logs`` and ``test_data``; their shapes are
        printed.
        """
        print("Loading datasets...")

        self.train_data = self._read_table('train_data.csv')
        self.members = self._read_table('members.csv')
        self.transactions = self._read_table('transactions.csv')
        self.user_logs = self._read_table('user_logs.csv')
        self.test_data = self._read_table('kaggle_test_data.csv')

        print(f"Train data shape: {self.train_data.shape}")
        print(f"Members data shape: {self.members.shape}")
        print(f"Transactions data shape: {self.transactions.shape}")
        print(f"User logs data shape: {self.user_logs.shape}")
        print(f"Test data shape: {self.test_data.shape}")

    def preprocess_data(self):
        """Clean the loaded tables in place.

        Ages (``bd``) outside ``[0, 100]`` become NaN, and the ``%Y%m%d``
        date columns of all three side tables are parsed to datetimes.
        """
        print("Preprocessing data...")

        # Vectorised range check; NaN fails ``between`` and therefore stays NaN.
        age = self.members['bd']
        self.members['bd'] = age.where(age.between(0, 100))
        self.members['registration_init_time'] = pd.to_datetime(
            self.members['registration_init_time'], format='%Y%m%d')

        for col in ('transaction_date', 'membership_expire_date'):
            self.transactions[col] = pd.to_datetime(self.transactions[col], format='%Y%m%d')

        self.user_logs['date'] = pd.to_datetime(self.user_logs['date'], format='%Y%m%d')

    def engineer_features(self, data_type='train'):
        """Left-join all per-user feature tables onto the train or test users.

        Args:
            data_type: ``'train'`` selects ``self.train_data``; any other
                value selects ``self.test_data``.

        Returns:
            A new DataFrame with one row per base row and the member,
            transaction and behavior features merged on ``msno``.
        """
        print(f"Engineering features for {data_type} data...")

        base_data = self.train_data if data_type == 'train' else self.test_data
        features_df = base_data.copy()

        builders = (
            self.create_member_features,
            self.create_transaction_features,
            self.create_behavior_features,
        )
        for build in builders:
            features_df = features_df.merge(build(), on='msno', how='left')

        return features_df

    def create_member_features(self):
        """Derive age and registration features from ``self.members``.

        Returns:
            DataFrame with ``msno``, ``city``, ``age``, ``age_group``,
            ``gender``, ``registered_via``, ``registration_year``,
            ``registration_month`` and ``days_since_registration``.
        """
        member_features = self.members.copy()

        member_features['age'] = member_features['bd']
        # Right-closed bins: an age of exactly 0 falls outside and gets NaN.
        member_features['age_group'] = pd.cut(
            member_features['age'],
            bins=[0, 18, 25, 35, 45, 55, 100],
            labels=['<18', '18-25', '25-35', '35-45', '45-55', '55+'])

        registered = member_features['registration_init_time']
        member_features['registration_year'] = registered.dt.year
        member_features['registration_month'] = registered.dt.month
        member_features['days_since_registration'] = (self.REFERENCE_DATE - registered).dt.days

        return member_features[['msno', 'city', 'age', 'age_group', 'gender', 'registered_via',
                                'registration_year', 'registration_month', 'days_since_registration']]

    def create_transaction_features(self):
        """Aggregate ``self.transactions`` to one row per user.

        Rows are ordered by ``transaction_date`` before grouping so that every
        ``*_last`` aggregate describes the chronologically latest transaction,
        matching the row used for ``days_to_expire``.

        Returns:
            DataFrame keyed by ``msno`` with the flattened aggregates,
            ``membership_expire_date`` and ``days_to_expire`` of the latest
            transaction, and ``discount_rate``.
        """
        # Stable sort keeps file order for same-day transactions.
        ordered = self.transactions.sort_values('transaction_date', kind='mergesort')
        per_user = ordered.groupby('msno')

        latest_transactions = per_user.tail(1)

        transaction_agg = per_user.agg({
            'payment_method_id': ['nunique', 'last'],
            'payment_plan_days': ['mean', 'std', 'last'],
            'plan_list_price': ['mean', 'std', 'last'],
            'actual_amount_paid': ['mean', 'std', 'sum', 'last'],
            'is_auto_renew': ['last', 'mean'],
            'is_cancel': ['sum', 'mean', 'last'],
            'transaction_date': ['count', 'last']
        }).round(2)

        # Flatten the (column, statistic) MultiIndex into "column_statistic".
        transaction_agg.columns = ['_'.join(col).strip() for col in transaction_agg.columns]
        transaction_agg = transaction_agg.reset_index()

        latest_features = latest_transactions[['msno', 'membership_expire_date']].copy()
        latest_features['days_to_expire'] = (
            latest_features['membership_expire_date'] - self.REFERENCE_DATE
        ).dt.days

        transaction_features = transaction_agg.merge(latest_features, on='msno', how='left')

        # A zero list price would yield +/-inf; use NaN so it is imputed later.
        list_price = transaction_features['plan_list_price_last']
        paid = transaction_features['actual_amount_paid_last']
        transaction_features['discount_rate'] = (list_price - paid) / list_price.replace(0, np.nan)

        return transaction_features

    def create_behavior_features(self):
        """Aggregate ``self.user_logs`` to one row per user.

        Returns:
            DataFrame keyed by ``msno`` with sum/mean/std (and max for
            ``num_unq`` and ``total_secs``) of the log counters, the number of
            log rows (``date_count``) and ratio features derived from the sums.
        """
        play_cols = ['num_25', 'num_50', 'num_75', 'num_985', 'num_100']
        agg_spec = {col: ['sum', 'mean', 'std'] for col in play_cols}
        agg_spec['num_unq'] = ['sum', 'mean', 'std', 'max']
        agg_spec['total_secs'] = ['sum', 'mean', 'std', 'max']
        agg_spec['date'] = ['count']  # number of log rows per user

        behavior_agg = self.user_logs.groupby('msno').agg(agg_spec).round(2)

        # Flatten the (column, statistic) MultiIndex into "column_statistic".
        behavior_agg.columns = ['_'.join(col).strip() for col in behavior_agg.columns]
        behavior_agg = behavior_agg.reset_index()

        total_songs = (behavior_agg['num_25_sum'] + behavior_agg['num_50_sum'] +
                       behavior_agg['num_75_sum'] + behavior_agg['num_985_sum'] +
                       behavior_agg['num_100_sum'])
        long_plays = (behavior_agg['num_75_sum'] + behavior_agg['num_985_sum'] +
                      behavior_agg['num_100_sum'])
        # The +1 in each denominator avoids division by zero.
        songs_denominator = total_songs + 1
        days_denominator = behavior_agg['date_count'] + 1

        behavior_agg['total_songs'] = total_songs
        behavior_agg['completion_rate'] = behavior_agg['num_100_sum'] / songs_denominator
        behavior_agg['skip_rate'] = behavior_agg['num_25_sum'] / songs_denominator
        behavior_agg['engagement_score'] = long_plays / songs_denominator
        behavior_agg['avg_daily_songs'] = total_songs / days_denominator
        behavior_agg['avg_daily_listening_time'] = behavior_agg['total_secs_sum'] / days_denominator

        return behavior_agg

    def prepare_model_data(self, features_df):
        """Encode categoricals and impute numerics for modelling.

        The first call fits the label encoders and learns the per-column
        medians; later calls reuse them, so a test frame is transformed with
        training statistics. Categories not seen when the encoder was fitted
        are mapped to the most frequent fitted category instead of raising.
        Datetime columns are dropped because the classifiers cannot use them.

        Args:
            features_df: Output of ``engineer_features``. It is not modified.

        Returns:
            A new DataFrame ready for column selection.
        """
        datetime_cols = features_df.select_dtypes(include=['datetime64']).columns
        features_df = features_df.drop(columns=datetime_cols)

        for col in self.CATEGORICAL_COLS:
            if col not in features_df.columns:
                continue
            values = features_df[col].astype(str)
            if col not in self.label_encoders:
                encoder = LabelEncoder()
                features_df[col] = encoder.fit_transform(values)
                self.label_encoders[col] = encoder
                modes = values.mode()
                if not modes.empty:
                    self.category_fallbacks[col] = modes.iloc[0]
            else:
                encoder = self.label_encoders[col]
                fallback = self.category_fallbacks.get(col, encoder.classes_[0])
                known = values.isin(encoder.classes_)
                features_df[col] = encoder.transform(values.where(known, fallback))

        numeric_cols = features_df.select_dtypes(include=[np.number]).columns
        if self.numeric_fill_values is None:
            self.numeric_fill_values = features_df[numeric_cols].median()
        # fillna with a Series fills each column by its matching label.
        features_df[numeric_cols] = features_df[numeric_cols].fillna(self.numeric_fill_values)

        return features_df

    def train_models(self, X_train, y_train, X_val, y_val):
        """Fit five classifiers and score each on the validation split.

        Args:
            X_train, y_train: Training features and labels.
            X_val, y_val: Validation features and labels used for ROC AUC.

        Returns:
            Dict mapping model name to a dict with keys ``'model'``,
            ``'auc_score'``, ``'predictions'`` and ``'probabilities'``. The
            same dict is stored in ``self.models``.
        """
        print("Training models...")

        models = {
            'random_forest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
            'gradient_boosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
            'logistic_regression': LogisticRegression(random_state=42),
            'lightgbm': lgb.LGBMClassifier(random_state=42, n_jobs=-1),
            'xgboost': xgb.XGBClassifier(random_state=42, n_jobs=-1)
        }

        results = {}

        for name, model in models.items():
            print(f"Training {name}...")

            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)
            y_pred_proba = model.predict_proba(X_val)[:, 1]
            auc_score = roc_auc_score(y_val, y_pred_proba)

            results[name] = {
                'model': model,
                'auc_score': auc_score,
                'predictions': y_pred,
                'probabilities': y_pred_proba
            }

            print(f"{name} AUC: {auc_score:.4f}")

            # Only estimators exposing feature_importances_ are recorded.
            if hasattr(model, 'feature_importances_'):
                self.feature_importance[name] = model.feature_importances_

        self.models = results
        return results

    def create_ensemble(self, X_val, y_val):
        """Blend the stored validation probabilities, weighted by AUC.

        Each model's weight is its validation AUC divided by the sum of all
        AUCs. The weights are stored in ``self.ensemble_weights``.

        Args:
            X_val: Validation features; only its length is used.
            y_val: Validation labels for scoring the blend.

        Returns:
            Array of blended validation probabilities.

        Raises:
            ValueError: If no trained models are stored in ``self.models``.
        """
        print("Creating ensemble...")

        if not self.models:
            raise ValueError("No trained models available; call train_models() first.")

        total_auc = sum(result['auc_score'] for result in self.models.values())
        weights = {name: result['auc_score'] / total_auc for name, result in self.models.items()}

        ensemble_proba = np.zeros(len(X_val))
        for name, result in self.models.items():
            ensemble_proba += weights[name] * result['probabilities']

        ensemble_auc = roc_auc_score(y_val, ensemble_proba)
        print(f"Ensemble AUC: {ensemble_auc:.4f}")

        self.ensemble_weights = weights
        return ensemble_proba

    def plot_feature_importance(self, model_name='lightgbm', top_n=20):
        """Draw a bar chart of the ``top_n`` most important features.

        Nothing is drawn when no importances were recorded for ``model_name``.
        Reads ``self.feature_names``, which ``run_complete_pipeline`` sets.
        """
        if model_name not in self.feature_importance:
            return

        feature_importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.feature_importance[model_name]
        }).sort_values('importance', ascending=False).head(top_n)

        plt.figure(figsize=(10, 8))
        sns.barplot(data=feature_importance_df, x='importance', y='feature')
        plt.title(f'Top {top_n} Feature Importance - {model_name.upper()}')
        plt.tight_layout()
        plt.show()

    def generate_predictions(self, test_features):
        """Return blended churn probabilities for ``test_features``.

        Uses the weights stored by ``create_ensemble`` and the fitted models
        stored by ``train_models``.
        """
        print("Generating final predictions...")

        ensemble_proba = np.zeros(len(test_features))

        for name, weight in self.ensemble_weights.items():
            model = self.models[name]['model']
            ensemble_proba += weight * model.predict_proba(test_features)[:, 1]

        return ensemble_proba

    def run_complete_pipeline(self):
        """Run loading, feature engineering, training, blending and export.

        Writes ``submission.csv`` to the current directory.

        Returns:
            The submission DataFrame with columns ``msno`` and ``is_churn``.
        """
        print("=== CHURN PREDICTION MODEL PIPELINE ===")

        self.load_data()
        self.preprocess_data()

        train_features = self.engineer_features('train')
        test_features = self.engineer_features('test')

        # Train must be prepared first: it fits the encoders and medians.
        train_features = self.prepare_model_data(train_features)
        test_features = self.prepare_model_data(test_features)

        feature_cols = [col for col in train_features.columns if col not in ['msno', 'is_churn']]
        self.feature_names = feature_cols

        X = train_features[feature_cols]
        y = train_features['is_churn']
        X_test = test_features[feature_cols]

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                          random_state=42, stratify=y)

        print(f"Training set: {X_train.shape}")
        print(f"Validation set: {X_val.shape}")
        print(f"Test set: {X_test.shape}")
        print(f"Churn rate in training: {y_train.mean():.3f}")

        self.train_models(X_train, y_train, X_val, y_val)
        self.create_ensemble(X_val, y_val)
        test_predictions = self.generate_predictions(X_test)

        submission = pd.DataFrame({
            'msno': test_features['msno'],
            'is_churn': test_predictions
        })
        submission.to_csv('submission.csv', index=False)
        print("Submission file saved as 'submission.csv'")

        return submission

# Script entry point: build the pipeline, run it end to end, then report.
if __name__ == "__main__":
    model = ChurnPredictionModel()
    
    # Loads the data, trains the models and writes 'submission.csv';
    # the returned frame is the submission that was written.
    submission = model.run_complete_pipeline()
    
    # Uses the method defaults (model_name='lightgbm', top_n=20); nothing is
    # drawn if no importances were recorded for that model.
    model.plot_feature_importance()
    
    # Summarise the result: shape and the first ten rows of the submission.
    print("Model training completed!")
    print(f"Final submission shape: {submission.shape}")
    print("\nSample predictions:")
    print(submission.head(10))

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import lightgbm as lgb
import xgboost as xgb

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

class ChurnPredictionModel:
    """Churn-prediction pipeline built on the datathon CSV tables.

    The pipeline loads five CSV files, derives per-user (``msno``) features
    from the members, transactions and user-log tables, trains up to five
    classifiers, blends them with AUC-proportional weights and writes
    ``submission.csv``.

    Typical use is ``ChurnPredictionModel().run_complete_pipeline()``.
    """

    # Directory the CSV files are read from when no ``data_dir`` is given.
    DEFAULT_DATA_DIR = '/kaggle/input/customer-retention-datathon-apac-edition'
    # Date against which "days since registration" / "days to expire" are measured.
    REFERENCE_DATE = pd.Timestamp('2017-03-31')
    # Columns that are label-encoded before modelling.
    CATEGORICAL_COLS = ['city', 'age_group', 'gender', 'registered_via']

    def __init__(self, data_dir=None):
        """Create an empty pipeline.

        Args:
            data_dir: Directory containing the input CSV files. Defaults to
                ``DEFAULT_DATA_DIR``.
        """
        self.data_dir = self.DEFAULT_DATA_DIR if data_dir is None else data_dir
        self.models = {}
        self.feature_importance = {}
        # NOTE(review): the scaler is created here but never applied anywhere
        # in this class; logistic regression is fit on unscaled features.
        self.scaler = StandardScaler()
        self.label_encoders = {}
        # Per-column most frequent training category, used for unseen values.
        self.category_fallbacks = {}
        # Per-column medians learned from the training frame.
        self.numeric_fill_values = None

    def _read_table(self, file_name):
        """Read one CSV file from ``self.data_dir``."""
        return pd.read_csv(f"{self.data_dir}/{file_name}")

    def load_data(self):
        """Load the five tables and print shapes and basic sanity figures.

        The frames are stored on the instance as ``train_data``, ``members``,
        ``transactions``, ``user_logs`` and ``test_data``.
        """
        print("Loading datasets...")

        self.train_data = self._read_table('train_data.csv')
        self.members = self._read_table('members.csv')
        self.transactions = self._read_table('transactions.csv')
        self.user_logs = self._read_table('user_logs.csv')
        self.test_data = self._read_table('kaggle_test_data.csv')

        print(f"Train data shape: {self.train_data.shape}")
        print(f"Members data shape: {self.members.shape}")
        print(f"Transactions data shape: {self.transactions.shape}")
        print(f"User logs data shape: {self.user_logs.shape}")
        print(f"Test data shape: {self.test_data.shape}")

        # Basic data validation: label balance and user coverage per table.
        print(f"\nChurn rate in training data: {self.train_data['is_churn'].mean():.3f}")
        print(f"Unique users in train: {self.train_data['msno'].nunique()}")
        print(f"Unique users in test: {self.test_data['msno'].nunique()}")
        print(f"Unique users in members: {self.members['msno'].nunique()}")
        print(f"Unique users in transactions: {self.transactions['msno'].nunique()}")
        print(f"Unique users in user_logs: {self.user_logs['msno'].nunique()}")

    def preprocess_data(self):
        """Clean the loaded tables in place.

        Ages (``bd``) outside ``[0, 100]`` become NaN, and the ``%Y%m%d``
        date columns of all three side tables are parsed to datetimes.
        """
        print("Preprocessing data...")

        # Vectorised range check; NaN fails ``between`` and therefore stays NaN.
        age = self.members['bd']
        self.members['bd'] = age.where(age.between(0, 100))
        self.members['registration_init_time'] = pd.to_datetime(
            self.members['registration_init_time'], format='%Y%m%d')

        for col in ('transaction_date', 'membership_expire_date'):
            self.transactions[col] = pd.to_datetime(self.transactions[col], format='%Y%m%d')

        self.user_logs['date'] = pd.to_datetime(self.user_logs['date'], format='%Y%m%d')

    def engineer_features(self, data_type='train'):
        """Left-join all per-user feature tables onto the train or test users.

        Args:
            data_type: ``'train'`` selects ``self.train_data``; any other
                value selects ``self.test_data``.

        Returns:
            A new DataFrame with one row per base row and the member,
            transaction and behavior features merged on ``msno``.
        """
        print(f"Engineering features for {data_type} data...")

        base_data = self.train_data if data_type == 'train' else self.test_data
        features_df = base_data.copy()

        builders = (
            self.create_member_features,
            self.create_transaction_features,
            self.create_behavior_features,
        )
        for build in builders:
            features_df = features_df.merge(build(), on='msno', how='left')

        return features_df

    def create_member_features(self):
        """Derive age and registration features from ``self.members``.

        Returns:
            DataFrame with ``msno``, ``city``, ``age``, ``age_group``,
            ``gender``, ``registered_via``, ``registration_year``,
            ``registration_month`` and ``days_since_registration``.
        """
        member_features = self.members.copy()

        member_features['age'] = member_features['bd']
        # Right-closed bins: an age of exactly 0 falls outside and gets NaN.
        member_features['age_group'] = pd.cut(
            member_features['age'],
            bins=[0, 18, 25, 35, 45, 55, 100],
            labels=['<18', '18-25', '25-35', '35-45', '45-55', '55+'])

        registered = member_features['registration_init_time']
        member_features['registration_year'] = registered.dt.year
        member_features['registration_month'] = registered.dt.month
        member_features['days_since_registration'] = (self.REFERENCE_DATE - registered).dt.days

        return member_features[['msno', 'city', 'age', 'age_group', 'gender', 'registered_via',
                                'registration_year', 'registration_month',
                                'days_since_registration']].copy()

    def create_transaction_features(self):
        """Aggregate ``self.transactions`` to one row per user.

        Rows are ordered by ``transaction_date`` before grouping so that every
        ``*_last`` aggregate describes the chronologically latest transaction,
        matching the row used for ``days_to_expire``.

        Returns:
            DataFrame keyed by ``msno`` with the flattened aggregates,
            ``days_to_expire`` of the latest transaction and ``discount_rate``.
        """
        # Stable sort keeps file order for same-day transactions.
        ordered = self.transactions.sort_values('transaction_date', kind='mergesort')
        per_user = ordered.groupby('msno')

        latest_transactions = per_user.tail(1)

        transaction_agg = per_user.agg({
            'payment_method_id': ['nunique', 'last'],
            'payment_plan_days': ['mean', 'std', 'last'],
            'plan_list_price': ['mean', 'std', 'last'],
            'actual_amount_paid': ['mean', 'std', 'sum', 'last'],
            'is_auto_renew': ['last', 'mean'],
            'is_cancel': ['sum', 'mean', 'last'],
            'transaction_date': ['count', 'last']
        }).round(2)

        # Flatten the (column, statistic) MultiIndex into "column_statistic".
        transaction_agg.columns = ['_'.join(col).strip() for col in transaction_agg.columns]
        transaction_agg = transaction_agg.reset_index()

        latest_features = latest_transactions[['msno', 'membership_expire_date']].copy()
        latest_features['days_to_expire'] = (
            latest_features['membership_expire_date'] - self.REFERENCE_DATE
        ).dt.days

        # Keep only the numeric feature; the raw datetime is not a model input.
        latest_features = latest_features[['msno', 'days_to_expire']]

        transaction_features = transaction_agg.merge(latest_features, on='msno', how='left')

        # A zero list price would yield +/-inf; use NaN so it is imputed later.
        list_price = transaction_features['plan_list_price_last']
        paid = transaction_features['actual_amount_paid_last']
        transaction_features['discount_rate'] = (list_price - paid) / list_price.replace(0, np.nan)

        return transaction_features

    def create_behavior_features(self):
        """Aggregate ``self.user_logs`` to one row per user.

        Returns:
            DataFrame keyed by ``msno`` with sum/mean/std (and max for
            ``num_unq`` and ``total_secs``) of the log counters, the number of
            log rows (``date_count``) and ratio features derived from the sums.
        """
        play_cols = ['num_25', 'num_50', 'num_75', 'num_985', 'num_100']
        agg_spec = {col: ['sum', 'mean', 'std'] for col in play_cols}
        agg_spec['num_unq'] = ['sum', 'mean', 'std', 'max']
        agg_spec['total_secs'] = ['sum', 'mean', 'std', 'max']
        agg_spec['date'] = ['count']  # number of log rows per user

        behavior_agg = self.user_logs.groupby('msno').agg(agg_spec).round(2)

        # Flatten the (column, statistic) MultiIndex into "column_statistic".
        behavior_agg.columns = ['_'.join(col).strip() for col in behavior_agg.columns]
        behavior_agg = behavior_agg.reset_index()

        total_songs = (behavior_agg['num_25_sum'] + behavior_agg['num_50_sum'] +
                       behavior_agg['num_75_sum'] + behavior_agg['num_985_sum'] +
                       behavior_agg['num_100_sum'])
        long_plays = (behavior_agg['num_75_sum'] + behavior_agg['num_985_sum'] +
                      behavior_agg['num_100_sum'])
        # The +1 in each denominator avoids division by zero.
        songs_denominator = total_songs + 1
        days_denominator = behavior_agg['date_count'] + 1

        behavior_agg['total_songs'] = total_songs
        behavior_agg['completion_rate'] = behavior_agg['num_100_sum'] / songs_denominator
        behavior_agg['skip_rate'] = behavior_agg['num_25_sum'] / songs_denominator
        behavior_agg['engagement_score'] = long_plays / songs_denominator
        behavior_agg['avg_daily_songs'] = total_songs / days_denominator
        behavior_agg['avg_daily_listening_time'] = behavior_agg['total_secs_sum'] / days_denominator

        return behavior_agg

    def prepare_model_data(self, features_df, is_train=True):
        """Encode categoricals, impute numerics and force numeric dtypes.

        With ``is_train=True`` the label encoders are (re)fitted and the
        per-column medians and most frequent categories are learned. With
        ``is_train=False`` those training statistics are reused: unseen
        categories map to the most frequent training category and NaNs are
        filled with training medians.

        Args:
            features_df: Output of ``engineer_features``. It is not modified.
            is_train: Whether to fit on this frame or only transform it.

        Returns:
            A new DataFrame in which every column except ``msno`` and
            ``is_churn`` has been passed through ``pd.to_numeric``.
        """
        # Datetime columns cannot be fed to the classifiers.
        datetime_cols = features_df.select_dtypes(include=['datetime64']).columns
        features_df = features_df.drop(columns=datetime_cols)

        for col in self.CATEGORICAL_COLS:
            if col not in features_df.columns:
                continue
            values = features_df[col].astype(str)
            if is_train:
                encoder = self.label_encoders.setdefault(col, LabelEncoder())
                features_df[col] = encoder.fit_transform(values)
                modes = values.mode()
                if not modes.empty:
                    self.category_fallbacks[col] = modes.iloc[0]
            elif col in self.label_encoders:
                encoder = self.label_encoders[col]
                fallback = self.category_fallbacks.get(col, encoder.classes_[0])
                known = values.isin(encoder.classes_)
                features_df[col] = encoder.transform(values.where(known, fallback))
            else:
                # No encoder was ever fitted for this column.
                features_df[col] = 0

        numeric_cols = features_df.select_dtypes(include=[np.number]).columns
        if is_train:
            self.numeric_fill_values = features_df[numeric_cols].median()
            fill_values = self.numeric_fill_values
        elif self.numeric_fill_values is not None:
            fill_values = self.numeric_fill_values
        else:
            # Nothing learned yet: fall back to this frame's own medians.
            fill_values = features_df[numeric_cols].median()
        # fillna with a Series fills each column by its matching label.
        features_df[numeric_cols] = features_df[numeric_cols].fillna(fill_values)

        # Coerce whatever is left (e.g. object columns) to numbers.
        for col in features_df.columns:
            if col not in ['msno', 'is_churn']:
                features_df[col] = pd.to_numeric(features_df[col], errors='coerce')

        # Values that could not be converted became NaN; zero them out.
        numeric_cols = features_df.select_dtypes(include=[np.number]).columns
        features_df[numeric_cols] = features_df[numeric_cols].fillna(0)

        return features_df

    def train_models(self, X_train, y_train, X_val, y_val):
        """Fit five classifiers and score each on the validation split.

        A model whose training or scoring raises is reported and skipped, so
        the result may contain fewer than five entries.

        Args:
            X_train, y_train: Training features and labels.
            X_val, y_val: Validation features and labels used for ROC AUC.

        Returns:
            Dict mapping model name to a dict with keys ``'model'``,
            ``'auc_score'``, ``'predictions'`` and ``'probabilities'``. The
            same dict is stored in ``self.models``.
        """
        print("Training models...")

        models = {
            'random_forest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
            'gradient_boosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
            'logistic_regression': LogisticRegression(random_state=42),
            'lightgbm': lgb.LGBMClassifier(random_state=42, n_jobs=-1),
            'xgboost': xgb.XGBClassifier(random_state=42, n_jobs=-1)
        }

        results = {}

        for name, model in models.items():
            print(f"Training {name}...")

            try:
                model.fit(X_train, y_train)

                y_pred = model.predict(X_val)
                y_pred_proba = model.predict_proba(X_val)[:, 1]
                auc_score = roc_auc_score(y_val, y_pred_proba)

                results[name] = {
                    'model': model,
                    'auc_score': auc_score,
                    'predictions': y_pred,
                    'probabilities': y_pred_proba
                }

                print(f"{name} AUC: {auc_score:.4f}")

                # Only estimators exposing feature_importances_ are recorded.
                if hasattr(model, 'feature_importances_'):
                    self.feature_importance[name] = model.feature_importances_

            except Exception as e:
                # Best effort: one failing model must not abort the others.
                print(f"Error training {name}: {str(e)}")
                continue

        self.models = results
        return results

    def create_ensemble(self, X_val, y_val):
        """Blend the stored validation probabilities, weighted by AUC.

        Each model's weight is its validation AUC divided by the sum of all
        AUCs. The weights are stored in ``self.ensemble_weights``.

        Args:
            X_val: Validation features; only its length is used.
            y_val: Validation labels for scoring the blend.

        Returns:
            Array of blended validation probabilities.

        Raises:
            ValueError: If ``self.models`` is empty.
        """
        print("Creating ensemble...")

        if not self.models:
            raise ValueError("No models were successfully trained!")

        total_auc = sum(result['auc_score'] for result in self.models.values())
        weights = {name: result['auc_score'] / total_auc for name, result in self.models.items()}

        ensemble_proba = np.zeros(len(X_val))
        for name, result in self.models.items():
            ensemble_proba += weights[name] * result['probabilities']

        ensemble_auc = roc_auc_score(y_val, ensemble_proba)
        print(f"Ensemble AUC: {ensemble_auc:.4f}")

        self.ensemble_weights = weights
        return ensemble_proba

    def plot_feature_importance(self, model_name='lightgbm', top_n=20):
        """Draw a bar chart of the ``top_n`` most important features.

        Nothing is drawn when no importances were recorded for ``model_name``.
        Reads ``self.feature_names``, which ``run_complete_pipeline`` sets.
        """
        if model_name not in self.feature_importance:
            return

        feature_importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.feature_importance[model_name]
        }).sort_values('importance', ascending=False).head(top_n)

        plt.figure(figsize=(10, 8))
        sns.barplot(data=feature_importance_df, x='importance', y='feature')
        plt.title(f'Top {top_n} Feature Importance - {model_name.upper()}')
        plt.tight_layout()
        plt.show()

    def generate_predictions(self, test_features):
        """Return blended churn probabilities for ``test_features``.

        Uses the weights stored by ``create_ensemble`` and the fitted models
        stored by ``train_models``.
        """
        print("Generating final predictions...")

        ensemble_proba = np.zeros(len(test_features))

        for name, weight in self.ensemble_weights.items():
            model = self.models[name]['model']
            ensemble_proba += weight * model.predict_proba(test_features)[:, 1]

        return ensemble_proba

    def run_complete_pipeline(self):
        """Run loading, feature engineering, training, blending and export.

        Writes ``submission.csv`` to the current directory.

        Returns:
            The submission DataFrame with columns ``msno`` and ``is_churn``.
        """
        print("=== CHURN PREDICTION MODEL PIPELINE ===")

        self.load_data()
        self.preprocess_data()

        train_features = self.engineer_features('train')
        test_features = self.engineer_features('test')

        # Train must be prepared first: it fits the encoders and medians.
        train_features = self.prepare_model_data(train_features, is_train=True)
        test_features = self.prepare_model_data(test_features, is_train=False)

        feature_cols = [col for col in train_features.columns if col not in ['msno', 'is_churn']]
        self.feature_names = feature_cols

        print(f"Feature columns: {len(feature_cols)}")
        print("Data types in train_features:")
        print(train_features[feature_cols].dtypes.value_counts())

        X = train_features[feature_cols].copy()
        y = train_features['is_churn'].copy()
        X_test = test_features[feature_cols].copy()

        # Defensive second pass; prepare_model_data already coerces dtypes.
        print("Converting all features to numeric...")
        for col in feature_cols:
            X[col] = pd.to_numeric(X[col], errors='coerce')
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

        X = X.fillna(0)
        X_test = X_test.fillna(0)

        print("Final data types:")
        print(X.dtypes.value_counts())

        # Final validation: report and coerce anything still non-numeric.
        non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns
        if len(non_numeric_cols) > 0:
            print(f"Warning: Found non-numeric columns: {list(non_numeric_cols)}")
            for col in non_numeric_cols:
                X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)
                X_test[col] = pd.to_numeric(X_test[col], errors='coerce').fillna(0)

        print(f"Data validation complete. All columns are numeric: {X.dtypes.apply(lambda x: np.issubdtype(x, np.number)).all()}")

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                          random_state=42, stratify=y)

        print(f"Training set: {X_train.shape}")
        print(f"Validation set: {X_val.shape}")
        print(f"Test set: {X_test.shape}")
        print(f"Churn rate in training: {y_train.mean():.3f}")

        self.train_models(X_train, y_train, X_val, y_val)
        self.create_ensemble(X_val, y_val)
        test_predictions = self.generate_predictions(X_test)

        submission = pd.DataFrame({
            'msno': test_features['msno'],
            'is_churn': test_predictions
        })
        submission.to_csv('submission.csv', index=False)
        print("Submission file saved as 'submission.csv'")

        return submission

# Script entry point: build the pipeline, run it end to end, then report.
if __name__ == "__main__":
    model = ChurnPredictionModel()
    
    # Loads the data, trains the models and writes 'submission.csv';
    # the returned frame is the submission that was written.
    submission = model.run_complete_pipeline()
    
    # Uses the method defaults (model_name='lightgbm', top_n=20); nothing is
    # drawn if no importances were recorded for that model.
    model.plot_feature_importance()
    
    # Summarise the result: shape and the first ten rows of the submission.
    print("Model training completed!")
    print(f"Final submission shape: {submission.shape}")
    print("\nSample predictions:")
    print(submission.head(10))