In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [19]:
data = pd.read_csv('data/data_devsample.csv')
df = pd.DataFrame(data)

# Define features and target variable
X = df.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y = df['TARGET']

# Preprocessing: Numerical and Categorical pipeline
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [20]:
df.isnull().sum()

SK_ID_CURR                          0
TARGET                              0
NAME_CONTRACT_TYPE                  0
CODE_GENDER                         0
FLAG_OWN_CAR                        0
                                ...  
LAST_TRANSACTION_TIME_MONTHS    30353
TIME                                0
BASE                                0
DAY                                 0
MONTH                               0
Length: 197, dtype: int64

In [22]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Replace inf/-inf with NaN in numeric columns
for col in numerical_features:
    X_train[col] = X_train[col].replace([np.inf, -np.inf], np.nan)
    X_test[col] = X_test[col].replace([np.inf, -np.inf], np.nan)

# Optional: Clip numeric columns to a maximum value
max_value_threshold = 1e6  # Example threshold
for col in numerical_features:
    X_train[col] = X_train[col].clip(upper=max_value_threshold)
    X_test[col] = X_test[col].clip(upper=max_value_threshold)

# Preprocessing: Numerical and Categorical pipeline
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

# Create a logistic regression pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])

In [None]:
model

In [23]:
# Train the model
model.fit(X_train, y_train)

# Evaluate the model using Gini coefficient (2 * AUC - 1)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
gini_coefficient = 2 * roc_auc - 1
print(f'Gini Coefficient: {gini_coefficient}')

# Generate predictions and save to CSV
predictions = model.predict_proba(X_test)[:, 1]
output = pd.DataFrame({'SK_ID_CURR': X_test.index, 'SCORE': predictions})
output.to_csv('submission.csv', index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Gini Coefficient: 0.45953767053113737


In [None]:
# Generate predictions and save to CSV
predictions = model.predict_proba(X_test)[:, 1]
output = pd.DataFrame({'SK_ID_CURR': X_test.index, 'SCORE': predictions})
output.to_csv('submission.csv', index=False)