In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load your dataset (adjust the path as needed)
data = pd.read_csv("D:/semester4/90089/90089/final_ml_data.csv")

# Fill NaN values with 0 in columns 'N170', 'N171', 'N172', 'N178', 'N179' (assuming NaN means no condition)
data[['N170', 'N171', 'N172', 'N178', 'N179']] = data[['N170', 'N171', 'N172', 'N178', 'N179']].fillna(0)

# Create target variable for AKI occurrence
data['aki_occurred'] = data[['N170', 'N171', 'N172', 'N178', 'N179']].max(axis=1)

# Ensure there are no NaN values in feature columns
feature_cols = [
    'drug_encoded', 'creatinine_level', 'previous_creatinine_level', 
    'creatinine_change', 'gender_encoded', 'anchor_age', 
    'race_encoded', 'avg_bmi'
]
data[feature_cols] = data[feature_cols].fillna(0)  # You can also choose to drop rows instead: data.dropna(subset=feature_cols, inplace=True)

# Remove duplicate records based on 'subject_id' and 'hadm_id'
data = data.drop_duplicates(subset=['subject_id', 'hadm_id'])

# Target variable
target_col = 'aki_occurred'

# Splitting data into features (X) and target (y)
X = data[feature_cols]
y = data[target_col]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check for imbalance in the target variable
print("Class distribution in target variable 'aki_occurred':")
print(y_train.value_counts())

# If imbalance exists, use SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Re-check the class distribution in the resampled training data
print("Class distribution after applying SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Train the SVM Classifier on resampled data
svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate the SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy after handling class imbalance: {accuracy_svm:.4f}")

# Classification report for more detailed performance metrics
print("SVM Classification Report after handling class imbalance:")
print(classification_report(y_test, y_pred_svm))

Class distribution in target variable 'aki_occurred':
aki_occurred
0.0    72930
1.0    10237
Name: count, dtype: int64
Class distribution after applying SMOTE:
aki_occurred
1.0    72930
0.0    72930
Name: count, dtype: int64
SVM Accuracy after handling class imbalance: 0.3782
SVM Classification Report after handling class imbalance:
              precision    recall  f1-score   support

         0.0       0.93      0.31      0.47     31292
         1.0       0.15      0.84      0.25      4351

    accuracy                           0.38     35643
   macro avg       0.54      0.58      0.36     35643
weighted avg       0.84      0.38      0.44     35643

