In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

train_df = pd.read_csv('./datasets/cirrhosis/train.csv')
test_df = pd.read_csv('./datasets/cirrhosis/test.csv')

In [4]:
# Encode target variable
label_encoder = LabelEncoder()
train_df['Status'] = label_encoder.fit_transform(train_df['Status'])

In [6]:
# Combine datasets for consistency in encoding
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

In [8]:
# Encode other categorical variable
cat_cols = ['Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
for feature in cat_cols:
    combined_df[feature] = label_encoder.fit_transform(combined_df[feature].fillna('Unknown'))

In [12]:
# Split the datasets back
train_df = combined_df.iloc[:len(train_df)].copy()
test_df = combined_df.iloc[len(train_df):].copy()

In [14]:
# Dropping cols with >50% null values
col_drop = ['Cholesterol', 'Tryglicerides']
train_df.drop(columns=col_drop, inplace=True)
test_df.drop(columns=col_drop, inplace=True)

In [16]:
# Imputing numerical null values
num_imputer = SimpleImputer(strategy = 'median')

impute_cols = ['Copper', 'Alk_Phos', 'SGOT', 'Platelets', 'Prothrombin']
for features in impute_cols:
    train_df[features] = num_imputer.fit_transform(train_df[[features]])
    test_df[features] = num_imputer.fit_transform(test_df[[features]])

In [18]:
# Filling missing Drug values with Unknown and encoding
train_df['Drug'] = train_df['Drug'].fillna('Unknown')
test_df['Drug'] = test_df['Drug'].fillna('Unknown')

train_df['Drug'] = label_encoder.fit_transform(train_df['Drug'])
test_df['Drug'] = label_encoder.fit_transform(test_df['Drug'])

In [87]:
X_train = train_df.drop('Status', axis=1)
y_train = train_df['Status']
X_test = test_df.drop('Status', axis=1)

In [91]:
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
# Convert scaled arrays back to DataFrames to retain feature names
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [62]:
model = LogisticRegression(max_iter=2000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_test)

In [64]:
# Cross validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cv_scores)}")

Cross-Validation Accuracy Scores: [0.83266667 0.826      0.83433333 0.82266667 0.82466667]
Mean Cross-Validation Accuracy: 0.8280666666666667


In [82]:
# Creating submission df
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Status_C': y_pred[:, 0],
    'Status_CL': y_pred[:, 1],
    'Status_D': y_pred[:, 2]
})

submission_df.to_csv('cirrhosis-submission.csv', index=False)