In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

/kaggle/input/playground-series-s4e8/sample_submission.csv
/kaggle/input/playground-series-s4e8/train.csv
/kaggle/input/playground-series-s4e8/test.csv


In [2]:
# Load train.csv from the playground-series-s4e8 input directory into a DataFrame.
train = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")

In [3]:
# Load test.csv from the same input directory; its 'id' column is used later
# to build the submission file.
test = pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv")

# ***PREPROCESSING***

In [4]:
# Columns removed before modelling; the same list is reused for the test set.
features_to_drop = ['veil-type', 'does-bruise-or-bleed', 'has-ring', 'id']
# NOTE: `train` is reassigned, so this cell must run exactly once per kernel
# session (a second run raises KeyError because the columns are already gone).
train = train.drop(columns=features_to_drop)

In [5]:
# Feature-only copy of the test set; the raw `test` frame keeps its 'id' column.
test_sub = test.drop(columns=features_to_drop)

In [6]:
# Separate the target column from the predictors.
y = train['class']
X = train.drop(columns='class')

In [7]:
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Select numerical features 
num_features = X.select_dtypes(include='float64').columns
# Select categorical features 
cat_features = X.select_dtypes(include='object').columns


# Impute median to all numerical features 
num_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy='median')),
])

# Impute most frequent category to all categorical features
cat_pipeline = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(drop='first', min_frequency=300))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ]
)

In [8]:
X = preprocessor.fit_transform(X)

In [9]:
test_sub = preprocessor.fit_transform(test_sub)

In [10]:
# ***MODEL PERFORMANCE***

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

# Baseline classifier with scikit-learn's default settings.
model = LogisticRegression()

# Stratified k-fold cross-validation (no shuffling, so the splits are
# deterministic).
folds = 5
skfolds = StratifiedKFold(n_splits=folds)

# One Matthews correlation coefficient per validation fold.
total_mcc = []

# NOTE: X was preprocessed on the full training set in an earlier cell, so the
# imputation values and encoder categories have already seen the rows that
# serve as validation folds here.
for train_index, test_index in skfolds.split(X, y):
    # split() yields positional row numbers, so index the target Series by
    # position (.iloc) rather than by label.
    X_train_folds, y_train_folds = X[train_index], y.iloc[train_index]
    X_test_folds, y_test_folds = X[test_index], y.iloc[test_index]

    model.fit(X_train_folds, y_train_folds)
    y_pred_folds = model.predict(X_test_folds)

    mcc = matthews_corrcoef(y_test_folds, y_pred_folds)
    total_mcc.append(mcc)

In [12]:
# Average Matthews correlation coefficient over all cross-validation folds.
sum(total_mcc) / len(total_mcc)

0.6574105038961314

# ***SUBMISSION***

In [13]:
# Refit the model on the entire preprocessed training set; after the CV loop
# it only holds the fit from the last fold.
model.fit(X, y)

In [14]:
# Predict a class label for every preprocessed test row.
pred_class = model.predict(test_sub)

# Pair each test id with its prediction. The submission frame is built
# directly instead of adding a 'class' column to `test`, so the raw test frame
# stays unchanged and earlier cells that derive from it remain re-runnable.
submission = pd.DataFrame({'id': test['id'], 'class': pred_class})

In [15]:
# Write the submission to submission.csv in the working directory, without the
# DataFrame index column.
submission.to_csv('submission.csv', index=False)