In [None]:
# Mengimport library yang diperlukan untuk proses pengerjaan
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from google.colab import drive


In [None]:
# Mount Google Drive once; all three CSV files are read from the same mount
# point, so repeating the mount before each read is unnecessary.
drive.mount('/drive')

# Load the three input tables of the quiz.
fileScores = '/drive/MyDrive/ml_quiz/scores.csv'
scores = pd.read_csv(fileScores)

fileKondisi = '/drive/MyDrive/ml_quiz/kondisi.csv'
kondisi = pd.read_csv(fileKondisi)

fileControl = '/drive/MyDrive/ml_quiz/control.csv'
control = pd.read_csv(fileControl)

In [None]:
# Initial data inspection: report the (rows, columns) shape of every frame.
# A notebook cell only displays its final bare expression, so the shapes are
# printed explicitly; otherwise only control.shape would be shown.
for frame_name, frame in (('scores', scores), ('kondisi', kondisi), ('control', control)):
    print(f"{frame_name}: {frame.shape}")

In [None]:
# Assume 'depression_status' is the name of the target column.
target_column = 'depression_status'

In [None]:
# Stack the three frames row-wise into a single DataFrame. This cell runs
# before the missing-value and encoding cells further down, so df is built
# from the frames as loaded.
df = pd.concat([scores, kondisi, control])
# Show the column index of the combined frame.
df.columns

In [None]:
# Mengeliminasi data
import re
import string

def optional(text):
    """Normalise a free-text string for downstream processing.

    The cleaning steps run in this order:

    1. lower-case the text;
    2. drop square-bracketed spans such as ``[ref]``;
    3. drop URLs (``http://``, ``https://`` or ``www.`` prefixes);
    4. drop HTML/XML tags;
    5. replace every remaining non-word character with a space;
    6. drop leftover punctuation (only ``_`` survives step 5);
    7. drop newlines (kept for safety; step 5 already turns them into spaces);
    8. drop every token that contains a digit.

    URL and tag removal must happen before step 5: once ``:``, ``/``, ``.``,
    ``<`` and ``>`` have been replaced by spaces those patterns can no longer
    match anything.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lower-cased text. Whitespace is not collapsed, so removed
        spans may leave consecutive spaces behind.
    """
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid) string
    # escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Handle missing values: discard every row that contains at least one NaN,
# rebinding each name to the filtered frame.
scores, kondisi, control = (
    frame.dropna() for frame in (scores, kondisi, control)
)

In [None]:
# Encode categorical variables: replace every object-dtype column with integer
# codes. control is encoded as well, because it is later passed to the scaler
# just like scores and kondisi and would fail there with string columns.
le = LabelEncoder()
for frame in (scores, kondisi, control):
    for column in frame.select_dtypes(include=['object']).columns:
        # fit_transform refits the encoder, so codes are assigned per column.
        frame[column] = le.fit_transform(frame[column])


In [None]:
# Feature scaling: create the StandardScaler instance used by the scaling cell below.
scaler = StandardScaler()

In [None]:
# Standardise the feature columns of each frame while leaving the
# 'depression_status' target column unscaled.
def _scale_frame(frame, frame_name, target='depression_status'):
    """Return a copy of ``frame`` with every non-target column standardised.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to scale. Its non-target columns are handed to the scaler, so
        they are assumed to be numeric already -- TODO confirm per frame.
    frame_name : str
        Label used in the message printed when the target column is absent.
    target : str
        Name of the column that is copied through unscaled.

    Returns
    -------
    pandas.DataFrame or None
        The scaled frame, or None when ``target`` is not a column of ``frame``.
    """
    # Check the frame that is actually being scaled, and report every frame
    # that lacks the target instead of only the last one.
    if target not in frame.columns:
        print(f"Column '{target}' not found in {frame_name} DataFrame.")
        return None

    features = frame.drop(target, axis=1)
    scaled = pd.DataFrame(
        scaler.fit_transform(features),
        # Use this frame's own feature names, wherever the target sits.
        columns=features.columns,
        # Keep the original row labels: rows were dropped earlier, so a fresh
        # RangeIndex would misalign the target and fill it with NaN.
        index=features.index,
    )
    scaled[target] = frame[target]
    return scaled


scores_scaled = _scale_frame(scores, 'scores')
kondisi_scaled = _scale_frame(kondisi, 'kondisi')
control_scaled = _scale_frame(control, 'control')



In [None]:
# Exploratory Data Analysis (EDA)

# List the columns available in the scores frame, then in the kondisi frame.
print(scores.columns, kondisi.columns, sep='\n')

In [None]:
# Annotated heatmap of the pairwise column correlations in the scores frame.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(scores.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations in the kondisi frame.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(kondisi.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Verify the column names of the scores DataFrame.
print(scores.columns)

In [None]:
# Verify the column names of the kondisi DataFrame.
print(kondisi.columns)

In [None]:
# Separate the 'inpatient' label (y) from the remaining feature columns (X).
y = scores['inpatient']
X = scores.drop(columns=['inpatient'])


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the training features: column dtypes first, then the number of
# missing values per column.
print(X_train.dtypes, X_train.isna().sum(), sep='\n')


In [None]:
# Confirm the sizes of the train/test partitions (features, then labels).
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Random Forest
# Train on the split produced by the earlier train_test_split cell. Repeating
# the import and the split here (same X, y, test_size and seed) only recreated
# the same partitions, so both were removed.
# random_state fixes the forest's internal randomness so that repeated runs
# yield the same predictions and metrics.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
# Model evaluation: map each model's display name to its test-set predictions
# so the evaluation loop can iterate over them.
models = {
    'Random Forest': y_pred_rf
}

In [None]:
# Report the classification metrics for every model's test-set predictions.
metric_functions = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)

for model_name, y_pred in models.items():
    print(f"{model_name}:")
    # Each metric is computed immediately before it is printed, in list order.
    for label, metric in metric_functions:
        print(label, metric(y_test, y_pred))
    print("\n")

In [None]:

# Validate the random forest with 5-fold cross-validation on the full X, y,
# then report the per-fold scores and their mean.
cv_scores = cross_val_score(rf, X, y, cv=5)
for label, value in (
    ("Cross-Validation Scores (Random Forest):", cv_scores),
    ("Mean CV Score (Random Forest):", cv_scores.mean()),
):
    print(label, value)