In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import shap
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Step 1: Load the dataset
def load_data_in_chunks(file_path, nrows=10000):
    """Load a specific number of rows from a large CSV file and add a Serial Number column."""
    chunk = pd.read_csv(file_path, nrows=nrows)
    chunk['Serial'] = range(1, len(chunk) + 1)  # Adding Serial Number column starting from 1
    print(f"Loaded {nrows} rows from {file_path}")
    return chunk

train_data = load_data_in_chunks(r"E:\AYUSH\amex-default-prediction\train_data.csv", nrows=10000)
test = load_data_in_chunks(r"E:\AYUSH\amex-default-prediction\test_data.csv", nrows=10000)
train_labels = load_data_in_chunks(r"E:\AYUSH\amex-default-prediction\train_labels.csv", nrows=10000)
sample_submission = load_data_in_chunks(r"E:\AYUSH\amex-default-prediction\sample_submission.csv", nrows=10000)


print("\nTop 10 rows of Train Data with Serial Numbers:")
print(train_data.head(10))
print("\nTop 10 rows of Test Data with Serial Numbers:")
print(test.head(10))
print("\nTop 10 rows of Train Data with Serial Numbers:")
print(train_labels.head(10))
print("\nTop 10 rows of Train Data with Serial Numbers:")
print(sample_submission.head(10))

Loaded 10000 rows from E:\AYUSH\amex-default-prediction\train_data.csv
Loaded 10000 rows from E:\AYUSH\amex-default-prediction\test_data.csv
Loaded 10000 rows from E:\AYUSH\amex-default-prediction\train_labels.csv
Loaded 10000 rows from E:\AYUSH\amex-default-prediction\sample_submission.csv

Top 10 rows of Train Data with Serial Numbers:
                                         customer_ID         S_2       P_2  \
0  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-03-09  0.938469   
1  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-04-07  0.936665   
2  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-05-28  0.954180   
3  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-06-13  0.960384   
4  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-07-16  0.947248   
5  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-08-04  0.945964   
6  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-09-18  0.940705   
7  0000099d6bd597052cdcda90ffabf5657

In [3]:
# Step 2: Exploratory Data Analysis (EDA)
def explore_data(data):
    """Perform basic exploration to understand the data."""
    print("Basic Information of the Dataset:")
    print("-" * 50)
    print(data.info())
    print("\nBasic Statistics of Numerical Columns:")
    print("-" * 50)
    print(data.describe().T)
    
    print("\nMissing Values in Top Columns:")
    print("-" * 50)
    print(data.isnull().sum().sort_values(ascending=False).head(10))
    
    # Check if 'target' column exists before analyzing its distribution
    if 'target' in data.columns:
        print("\nTarget Distribution:")
        print("-" * 50)
        print(data['target'].value_counts(normalize=True))
        
        # Visualizing target distribution
        sns.countplot(x='target', data=data)
        plt.title("Target Distribution")
        plt.xlabel("Target Class")
        plt.ylabel("Count")
        plt.show()
    else:
        print("\n'Target' column is not present in the dataset.")

# Call the function
explore_data(train_data)


Basic Information of the Dataset:
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 191 entries, customer_ID to Serial
dtypes: float64(185), int64(2), object(4)
memory usage: 14.6+ MB
None

Basic Statistics of Numerical Columns:
--------------------------------------------------
          count         mean          std           min          25%  \
P_2      9936.0     0.650498     0.252416 -2.569212e-01     0.471264   
D_39    10000.0     0.157031     0.275827  8.701630e-07     0.004574   
B_1     10000.0     0.126147     0.212428 -1.414690e-01     0.009126   
B_2     10000.0     0.617122     0.403145  3.432000e-05     0.091656   
R_1     10000.0     0.074035     0.219977  3.031180e-06     0.002863   
...         ...          ...          ...           ...          ...   
D_142    1572.0     0.370263     0.249771 -8.804185e-03     0.150429   
D_143    9847.0     0.162935     0.364742  2.172557e-06    

In [4]:
# Step 3: Handling Missing Values
def handle_missing_values(train, test):
    """Fill missing values with a placeholder."""
    train.fillna(-999, inplace=True)
    test.fillna(-999, inplace=True)
    return train, test

train, test = handle_missing_values(train_data, test)

In [18]:
# Step 4: Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder

def encode_categorical(train, test):
    """Label encode categorical features."""
    categorical_cols = train.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    
    # Apply encoding for each categorical column
    for col in categorical_cols:
        # Fit on training data and transform both train and test sets
        train[col] = encoder.fit_transform(train[col].astype(str))
        
        # Transform test data using the same encoder
        # We use 'fit_transform' only on training data to avoid test data leakage
        test[col] = encoder.transform(test[col].astype(str))
    
    return train, test

# Apply the encoding function to train and test datasets
train, test = encode_categorical(train_data, test)


In [7]:
from sklearn.preprocessing import StandardScaler

def scale_features(train, test):
    """Standardize numerical features with aligned columns."""
    scaler = StandardScaler()

    # Drop non-numeric columns first
    train_numeric = train.select_dtypes(include=['int64', 'float64'])
    test_numeric = test.select_dtypes(include=['int64', 'float64'])

    # Align columns between train and test
    train_numeric, test_numeric = train_numeric.align(test_numeric, join='inner', axis=1)

    # Fit scaler on train data and transform both train and test
    train_scaled = scaler.fit_transform(train_numeric)
    test_scaled = scaler.transform(test_numeric)

    # Replace the numerical columns in the original train and test DataFrames
    train[train_numeric.columns] = train_scaled
    test[train_numeric.columns] = test_scaled

    return train, test

# Apply feature scaling
train, test = scale_features(train_data, test)

print("Feature scaling completed successfully!")




Feature scaling completed successfully!


In [8]:
# Step 6: Splitting Data
from sklearn.model_selection import train_test_split

def split_train_data(train):
    """Split data into training and validation sets."""
    if not isinstance(train, pd.DataFrame):
        raise ValueError("Input should be a DataFrame.")
    
    X = train.drop(columns=['target', 'id'], errors='ignore')
    y = train['target'] if 'target' in train.columns else None
    
    if y is None:
        raise ValueError("Column 'target' not found in the DataFrame.")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    return X_train, X_val, y_train, y_val



In [9]:
# Step 7: Custom Evaluation Metric
def amex_metric(y_true, y_pred):
    """Define the AMEX evaluation metric."""
    def top_four_percent_captured(y_true, y_pred):
        df = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
        df = df.sort_values('y_pred', ascending=False)
        df['weight'] = df['y_true'].apply(lambda x: 20 if x == 1 else 1)
        cutoff = int(0.04 * df['weight'].sum())
        return df.iloc[:cutoff]['y_true'].sum() / df['y_true'].sum()

    gini = 2 * roc_auc_score(y_true, y_pred) - 1
    return 0.5 * (gini + top_four_percent_captured(y_true, y_pred))

In [10]:
# Step 8: Train Baseline Logistic Regression
def split_train_data(train):
    """Split data into training and validation sets."""
    # Check for available columns
    print("Columns in train:", train.columns)

    # Drop 'target' or 'id' only if they exist
    drop_cols = [col for col in ['target', 'id'] if col in train.columns]
    X = train.drop(columns=drop_cols)  # Drop valid columns only
    
    y = train['target'] if 'target' in train.columns else None  # Ensure 'target' exists

    if y is None:
        raise ValueError("Column 'target' not found in the dataset!")
    
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    return X_train, X_val, y_train, y_val




In [11]:
# Step 9: Train Advanced XGBoost Model
from sklearn.model_selection import train_test_split

def split_train_data(train):
    target_column = 'target'  # Update this to the correct column name if needed
    if target_column not in train.columns:
        raise ValueError(f"Column '{target_column}' not found in the dataset!")

    y = train[target_column]
    X = train.drop(columns=[target_column])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    return X_train, X_val, y_train, y_val




In [12]:
# Step 10: Analyze Feature Importance
# Ensure 'customer_ID' has the same data type in both datasets
train_data["customer_ID"] = train_data["customer_ID"].astype(str)
train_labels["customer_ID"] = train_labels["customer_ID"].astype(str)

# Check if 'customer_ID' is unique before setting it as the index
if train_data["customer_ID"].is_unique and train_labels["customer_ID"].is_unique:
    # Set 'customer_ID' as the index for both DataFrames before concatenation
    train = pd.concat([train_data.set_index("customer_ID"), 
                       train_labels.set_index("customer_ID")], axis=1).reset_index()
else:
    # If 'customer_ID' is not unique, merge based on 'customer_ID' instead
    train = pd.merge(train_data, train_labels, on="customer_ID", how="inner")

# Prepare data for training
X = train.drop(columns=["customer_ID", "target"], errors="ignore")  # Features
y = train["target"]  # Target column

# Ensure that X and y are not empty
if len(X) > 0 and len(y) > 0:
    # Train-validation split with a valid test_size and train_size
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Step 5: Train XGBoost Model
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic", 
        eval_metric="auc", 
        learning_rate=0.05, 
        max_depth=6, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        random_state=42
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=True)

    # Step 6: Feature Importance Visualization
    def analyze_features(model, X_val):
        """
        Visualize feature importance using SHAP and XGBoost.
        """
        # XGBoost Feature Importance
        xgb.plot_importance(model)
        plt.title("XGBoost Feature Importance")
        plt.show()

        # SHAP Feature Importance
        explainer = shap.Explainer(model, X_val)
        shap_values = explainer(X_val)
        shap.summary_plot(shap_values, X_val)

    # Step 7: Call the function
    analyze_features(xgb_model, X_val)

else:
    print("Error: The input data is empty. Please check the data.")

Error: The input data is empty. Please check the data.


In [34]:
# Step 11: Generate Submission File
def encode_categorical(train, test):
    """Label encode categorical features."""
    categorical_cols = train.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    for col in categorical_cols:
        train[col] = encoder.fit_transform(train[col].astype(str))
        test[col] = encoder.transform(test[col].astype(str))
    return train, test

# Step 2: Train XGBoost Model without early stopping
def train_xgb_model(train_data, train_labels):
    """Train XGBoost model on the training data."""
    # Prepare training data
    X = train_data.drop(columns=['target'], errors='ignore')  # Drop 'target' if it exists
    y = train_labels['target']

    # Split into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the XGBoost classifier
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic", 
        learning_rate=0.05, 
        max_depth=6, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        random_state=42
    )

    # Train the model without early stopping
    xgb_model.fit(X_train, y_train, verbose=True)

    return xgb_model

# Step 3: Generate Submission File
def generate_submission(test, xgb_model):
    """Create the final submission file."""
    # Check if the 'id' column exists in the test DataFrame
    if 'id' not in test.columns:
        print("Error: 'id' column not found in the test DataFrame.")
        return
    
    # Prepare test data (drop 'id' column and other non-predictive columns)
    X_test = test.drop(columns=['id'], errors='ignore')
    
    # Convert test data to DMatrix format for XGBoost
    dtest = xgb.DMatrix(X_test)
    
    # Make predictions (ensure binary classification, or adjust as necessary)
    predictions = xgb_model.predict(dtest)
    
    # If the model is binary classification, convert probabilities to 0 or 1
    if predictions.shape[0] > 1:  # In case of output being probabilities
        predictions = (predictions > 0.5).astype(int)  # Threshold at 0.5
    
    # Create the submission DataFrame
    submission = pd.DataFrame({
        'id': test['id'],
        'prediction': predictions
    })
    
    # Save the submission file to the specified path
    submission.to_csv(r'E:\AYUSH\amex-default-prediction\sample_submission.csv', index=False)
    print("Submission file saved as sample_submission.csv")

# Example usage:
# Assuming 'train_data', 'train_labels', and 'test' are already loaded with the respective data

# Step 4: Encode categorical features (if needed)
train_data, test = encode_categorical(train_data, test)

# Step 5: Train the XGBoost model
xgb_model = train_xgb_model(train_data, train_labels)  # Ensure 'train_data' and 'train_labels' are defined

# Step 6: Generate the submission file
generate_submission(test, xgb_model)



Error: 'id' column not found in the test DataFrame.
