## 1. Load and Prepare Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss

# Baseline: one-hot encode the categoricals, standardise the features, fit a
# logistic regression and report the Brier score on a 20% hold-out.

# Load the training data
train_data = pd.read_csv('train.csv')

# Explore the data
print(train_data.head())

# Encode categorical variables using one-hot encoding
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
train_data_encoded = pd.get_dummies(train_data, columns=categorical_cols)

# Separate features and target
X = train_data_encoded.drop(['ID', 'Default'], axis=1)
y = train_data_encoded['Default']

# Split BEFORE scaling. Fitting the scaler on the full matrix would let the
# validation rows contribute to the mean/std used for preprocessing, which
# leaks hold-out information into training and biases the validation score.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: fit the scaler on the training rows only, then apply the same
# transform to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full feature matrix scaled with the train-only statistics. Kept because a
# later cell re-splits X_scaled with the same test_size/random_state, which
# selects the same rows as the split above.
X_scaled = scaler.transform(X)

# Initialize and train the logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Predict probabilities on the validation set
probabilities = model.predict_proba(X_val)[:, 1]  # second column of predict_proba (class listed second in model.classes_)

# Evaluate the model using Brier score loss
brier_score = brier_score_loss(y_val, probabilities)
print(f"Brier score loss: {brier_score}")


   ID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines   
0   0   21   78304      168713          653              60               1  \
1   1   28   63751       84674          681              58               1   
2   2   57   96676      167540          467              98               4   
3   3   24   79289       61546          358              63               4   
4   4   31   98586      232342          692              10               2   

   InterestRate  LoanTerm  DTIRatio    Education EmploymentType MaritalStatus   
0          8.80        60      0.59  High School      Part-time        Single  \
1          4.91        48      0.21          PhD      Part-time       Married   
2         16.78        36      0.63  High School     Unemployed        Single   
3          6.40        60      0.83     Master's      Full-time        Single   
4         19.97        60      0.51          PhD     Unemployed       Married   

  HasMortgage HasDependents LoanPurpos

## 2. Test Data Processing and Submission

In [10]:
# Score the test file with the scaler and model objects that are already in
# the kernel, then write the predictions out as a submission CSV.

# Read the raw test rows
test_data = pd.read_csv('test.csv')

# One-hot encode with the same categorical column list, then align to the
# training feature columns: columns missing from the test frame are created
# and filled with 0, and the column order follows X.columns.
test_data_encoded = (
    pd.get_dummies(test_data, columns=categorical_cols)
    .reindex(columns=X.columns, fill_value=0)
)

# Transform only (no refit) with the scaler fitted on the training features
X_test_scaled = scaler.transform(test_data_encoded)

# Take the second column of predict_proba as the predicted probability
test_probabilities = model.predict_proba(X_test_scaled)[:, 1]

# Build the two-column submission table: ID plus predicted probability
submission = test_data[['ID']].copy()
submission['TARGET'] = test_probabilities

# Write the submission to disk without the index column
submission.to_csv('submission.csv', index=False)
print("Submission file created.")


Submission file created.


Version 2

In [3]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.optimizers import Adam
import joblib

# Train a small fully connected network on the scaled features, report
# validation loss / accuracy / AUC, and persist the model and the scaler.
# Relies on X_scaled, y and scaler already existing in the kernel.

# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable between runs. (Some GPU kernels may
# still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model configuration: three ReLU hidden layers (128 -> 64 -> 32), each followed
# by 20% dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=[BinaryAccuracy(name='accuracy'), tf.keras.metrics.AUC(name='auc')])

# Model summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=1)

# Evaluate the model on the validation set.
# evaluate() returns the loss followed by the metrics in compile() order.
val_loss, val_accuracy, val_auc = model.evaluate(X_val, y_val, verbose=0)
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation AUC: {val_auc}')

# Persist the trained network and the scaler so a later cell can reload both.
model.save('model_1.h5')
joblib.dump(scaler, 'my_scaler.gz')




Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB





Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               4096      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 3

2024-04-29 15:07:48.324738: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Validation Loss: 0.37599506974220276
Validation Accuracy: 0.8485333323478699
Validation AUC: 0.7496305108070374


['my_scaler.gz']

In [6]:
# Rebuild the training feature layout, reload the saved network and scaler,
# and write the neural-network predictions for the test file.

# Read the training rows and one-hot encode the categorical columns
train_data = pd.read_csv('train.csv')
categorical_cols = [
    'Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage',
    'HasDependents', 'LoanPurpose', 'HasCoSigner',
]
train_data_encoded = pd.get_dummies(train_data, columns=categorical_cols)

# Target column and feature matrix (identifier and target removed)
y = train_data_encoded['Default']
X = train_data_encoded.drop(columns=['ID', 'Default'])

# Encoded feature names, used below to align the test frame
feature_names = X.columns

# Standardise the features
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Reload the persisted network and scaler; from here on `scaler` refers to the
# object read from disk, not the one fitted a few lines above.
model = tf.keras.models.load_model('model_1.h5')
scaler = joblib.load('my_scaler.gz')

# Read the test rows, encode them, and align to the training feature columns
# (columns absent from the test frame are created and filled with 0).
test_data = pd.read_csv('test.csv')
test_data_encoded = (
    pd.get_dummies(test_data, columns=categorical_cols)
    .reindex(columns=feature_names, fill_value=0)
)

# Apply the reloaded scaler to the aligned test features
X_test_scaled = scaler.transform(test_data_encoded)

# Flatten the model output into a 1-D array of predictions
test_probabilities = model.predict(X_test_scaled).flatten()

# Two-column submission table: ID plus predicted probability
submission = test_data[['ID']].copy()
submission['TARGET'] = test_probabilities

# Write the submission to disk without the index column
submission.to_csv('neural_network_submission.csv', index=False)
print("Neural network submission file created.")

Neural network submission file created.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Random Forest inside a scikit-learn Pipeline: preprocessing (scaling plus
# one-hot encoding) and the classifier are fitted together on the training split.

# Read the training rows
train_data = pd.read_csv('train.csv')

# Column groups handled by the two preprocessing branches
categorical_cols = [
    'Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage',
    'HasDependents', 'LoanPurpose', 'HasCoSigner',
]
numerical_cols = [
    'Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
    'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio',
]

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Chain preprocessing and a 100-tree forest into a single estimator
forest = RandomForestClassifier(n_estimators=100, random_state=42)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Feature frame (identifier and target removed) and target column
X = train_data.drop(columns=['ID', 'Default'])
y = train_data['Default']

# 80/20 train/validation split on the raw (unprocessed) features
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit preprocessing and classifier on the training split only
clf.fit(X_train, y_train)

# Second column of predict_proba on the validation rows
rf_probabilities = clf.predict_proba(X_val)[:, 1]

# Brier score of the validation probabilities
rf_brier_score = brier_score_loss(y_val, rf_probabilities)
print(f"Random Forest Brier score loss: {rf_brier_score}")


Random Forest Brier score loss: 0.11295756


In [3]:
import pandas as pd

# Score the test file with the fitted pipeline `clf` and write the submission.

# Read the raw test rows
test_data = pd.read_csv('test.csv')

# Drop the identifier; the pipeline applies its own preprocessing to the
# remaining columns before the classifier sees them.
test_features = test_data.drop(columns=['ID'])
test_probabilities = clf.predict_proba(test_features)[:, 1]

# Two-column submission table: ID plus predicted probability
submission = test_data[['ID']].copy()
submission['TARGET'] = test_probabilities

# Write the submission to disk without the index column
submission.to_csv('random_forest.csv', index=False)
print("Submission file created successfully.")


Submission file created successfully.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import brier_score_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# AdaBoost over small Random Forests, wrapped in a preprocessing Pipeline and
# evaluated with the Brier score on a 20% hold-out.

# Load the training data
train_data = pd.read_csv('train.csv')

# Identify categorical and numerical columns
categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
numerical_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']

# Define the preprocessing steps: standardise numeric columns and one-hot
# encode categorical ones (categories unseen during fit are ignored).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Append classifier to preprocessing pipeline
# RandomForest with AdaBoost
rf = RandomForestClassifier(n_estimators=10, random_state=42)  # Using fewer trees for demonstration
# The base learner is passed positionally on purpose: the keyword was
# `base_estimator` in older scikit-learn releases and is `estimator` in newer
# ones (the old keyword was removed), so the positional form works with both.
ada_boost = AdaBoostClassifier(rf, n_estimators=50, random_state=42)

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', ada_boost)])

# Separate features and target
X = train_data.drop(['ID', 'Default'], axis=1)
y = train_data['Default']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the AdaBoosted RandomForest model
clf.fit(X_train, y_train)

# Predict probabilities on the validation set (second column of predict_proba)
probabilities = clf.predict_proba(X_val)[:, 1]

# Evaluate the model using Brier score loss
brier_score = brier_score_loss(y_val, probabilities)
print(f"Brier score loss with AdaBoosted RandomForest: {brier_score}")




Brier score loss with AdaBoosted RandomForest: 0.11919698546731154
