In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Random Forest

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the datasets
train_data = pd.read_csv("/content/drive/MyDrive/P2/T1/Dataset/combined/ML-c-train.csv")
valid_data = pd.read_csv("/content/drive/MyDrive/P2/T1/Dataset/combined/ML-c-val.csv")
test_data = pd.read_csv("/content/drive/MyDrive/P2/T1/Dataset/combined/ML-c-test.csv")

# Separate features and target
X_train, y_train = train_data.drop(columns=["disease_label"]), train_data["disease_label"]
X_valid, y_valid = valid_data.drop(columns=["disease_label"]), valid_data["disease_label"]
X_test, y_test = test_data.drop(columns=["disease_label"]), test_data["disease_label"]

# Ensure all feature columns are numeric
X_train = X_train.select_dtypes(include=["number"])
X_valid = X_valid.select_dtypes(include=["number"])
X_test = X_test.select_dtypes(include=["number"])

# Ensure the same columns in all splits
common_columns = X_train.columns.intersection(X_valid.columns).intersection(X_test.columns)
X_train = X_train[common_columns]
X_valid = X_valid[common_columns]
X_test = X_test[common_columns]

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Validate the Model
y_valid_pred = rf_model.predict(X_valid)
val_report = classification_report(y_valid, y_valid_pred)
print("Validation Report:")
print(val_report)

# Test the Model
y_test_pred = rf_model.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
print("Test Report:")
print(test_report)


Validation Report:
                                    precision    recall  f1-score   support

                    adenocarcinoma       0.85      0.96      0.90       163
                          adhesion       0.92      0.96      0.94       164
                     affect labile       0.96      0.91      0.94        79
                         arthritis       0.94      0.97      0.95       131
                            asthma       0.82      1.00      0.90        71
      benign prostatic hypertrophy       0.95      1.00      0.98        79
                  biliary calculus       0.81      0.85      0.83       190
                  carcinoma breast       0.96      0.96      0.96       170
chronic obstructive airway disease       0.98      0.82      0.89       143
                         cirrhosis       0.98      0.97      0.97       170
                           colitis       0.90      0.92      0.91       190
         coronary arteriosclerosis       0.81      0.75      0.78   

In [3]:
feature_importance = pd.Series(rf_model.feature_importances_, index=X_train.columns)
print(feature_importance.sort_values(ascending=False).head(10))  # Top 10 important features

pain                   0.009958
sore to touch          0.009905
shortness of breath    0.009617
pain abdominal         0.009414
cough                  0.009093
vomiting               0.008676
diarrhea               0.008614
fever                  0.008585
dyspnea                0.008095
hypokinesia            0.007903
dtype: float64


# Saved the predicted out in csv

In [4]:
"""
import pandas as pd

# Combine the real labels, predicted labels, and optionally the testing features
results = pd.DataFrame({
    "Real Label": y_test.values,
    "Predicted Label": y_pred
})

# Optionally, include testing features for context
testing_features = X_test.reset_index(drop=True)  # Reset index to align with predictions
results_with_features = pd.concat([testing_features, results], axis=1)

# Save results to a CSV file
results_with_features.to_csv("/content/drive/MyDrive/P2/Dataset/DiseasePrediction2/RFpredicted_output.csv", index=False)

print("Predicted output saved to 'predicted_output.csv'")
"""

'\nimport pandas as pd\n\n# Combine the real labels, predicted labels, and optionally the testing features\nresults = pd.DataFrame({\n    "Real Label": y_test.values,\n    "Predicted Label": y_pred\n})\n\n# Optionally, include testing features for context\ntesting_features = X_test.reset_index(drop=True)  # Reset index to align with predictions\nresults_with_features = pd.concat([testing_features, results], axis=1)\n\n# Save results to a CSV file\nresults_with_features.to_csv("/content/drive/MyDrive/P2/Dataset/DiseasePrediction2/RFpredicted_output.csv", index=False)\n\nprint("Predicted output saved to \'predicted_output.csv\'")\n'

# Generate Classification Report

In [5]:

import pandas as pd
from sklearn.metrics import classification_report

# Assuming rf_model is your trained model, and X_test, y_test are defined
y_pred = rf_model.predict(X_test)

# Generate the classification report as a dictionary
report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

# Convert the dictionary to a pandas DataFrame
report_df = pd.DataFrame(report_dict).transpose()

# Save the classification report as a CSV file
report_df.to_csv("/content/drive/MyDrive/P2/T1/Dataset/combined/ML/RF-c-class.csv", index=True)

print("Classification report saved.")


Classification report saved.
