In [None]:
!pip install scikit-learn
!pip install pandas



In [None]:

# Step 2: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import numpy as np # For handling potential NaN issues if not dropped

# Step 3: Load the dataset
# Adjust the path if necessary
try:
    # If uploaded to Colab's root
    df = pd.read_csv('/content/drug_interactions.csv')
except FileNotFoundError:
    # If in Google Drive (example path)
    df = pd.read_csv('/content/drive/MyDrive/drug_interactions.csv') # CHANGE THIS PATH if needed

print("Dataset loaded successfully.")
print(f"Shape of dataset: {df.shape}")
df.head()

# Step 4: Preprocessing
# Drop rows with missing critical values, especially target variables
df.dropna(subset=['DRUG 1', 'DRUG 2', 'MECHANISM OF INTERACTION', 'ALTERNATIVES', 'RISK RATING'], inplace=True)
print(f"Shape after dropping NA: {df.shape}")

# Standardize drug names (convert to string, uppercase, strip whitespace)
# This helps in consistent lookup and feature creation
df['DRUG 1'] = df['DRUG 1'].astype(str).str.upper().str.strip()
df['DRUG 2'] = df['DRUG 2'].astype(str).str.upper().str.strip()

# Create a canonical representation for drug pairs to handle (Drug A, Drug B) == (Drug B, Drug A)
# This will be used for creating a lookup dictionary of known interactions
df['PAIR_KEY'] = df.apply(lambda row: tuple(sorted((row['DRUG 1'], row['DRUG 2']))), axis=1)

# Store known interactions in a dictionary for quick lookup
known_interactions = {}
for _, row in df.iterrows():
    known_interactions[row['PAIR_KEY']] = {
        'mechanism': row['MECHANISM OF INTERACTION'],
        'alternatives': row['ALTERNATIVES'],
        'risk': row['RISK RATING']
    }
print(f"Number of unique known interaction pairs: {len(known_interactions)}")

# Prepare features for the model: combine DRUG 1 and DRUG 2 into a single text string
# Sorting them ensures that "DrugA DrugB" is treated the same as "DrugB DrugA" by the vectorizer
df['INPUT_FEATURES'] = df.apply(lambda row: ' '.join(sorted((row['DRUG 1'], row['DRUG 2']))), axis=1)

# Prepare target variables
y_risk = df['RISK RATING']
y_mechanism = df['MECHANISM OF INTERACTION']
y_alternatives = df['ALTERNATIVES']

# Encode target variables (since classifiers need numerical targets)
le_risk = LabelEncoder()
le_mechanism = LabelEncoder()
le_alternatives = LabelEncoder()

y_risk_encoded = le_risk.fit_transform(y_risk)
y_mechanism_encoded = le_mechanism.fit_transform(y_mechanism)
y_alternatives_encoded = le_alternatives.fit_transform(y_alternatives)

print(f"Unique risk ratings: {le_risk.classes_} ({len(le_risk.classes_)})")
print(f"Unique mechanisms: {len(le_mechanism.classes_)}") # Too many to print all
print(f"Unique alternatives: {len(le_alternatives.classes_)}") # Too many to print all


# Step 5: Train the models
# We will train three separate models.
# Using a Pipeline to combine TF-IDF vectorization and RandomForestClassifier.

# Features
X = df['INPUT_FEATURES']

# Split data (optional for final model, but good for seeing some performance metrics)
# For the final "production" model, you might train on ALL available data.
# Here, we'll split to show how to evaluate.
X_train, X_test, y_risk_train, y_risk_test, y_mech_train, y_mech_test, y_alt_train, y_alt_test = train_test_split(
    X, y_risk_encoded, y_mechanism_encoded, y_alternatives_encoded, test_size=0.2, random_state=42
)

# --- Model for RISK RATING ---
pipeline_risk = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=2)), # ngram_range and min_df can be tuned
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')) # class_weight='balanced' can help with imbalanced classes
])
print("\nTraining Risk Rating model...")
pipeline_risk.fit(X_train, y_risk_train)
# Evaluation (on the test set of known interactions)
y_risk_pred_test = pipeline_risk.predict(X_test)
print("\nRisk Rating Model Evaluation (on known interactions test set):")
# Handle cases where some classes might not be present in predictions for the report
# Get all unique labels from both true and predicted
risk_labels_for_report = np.unique(np.concatenate((y_risk_test, y_risk_pred_test)))
print(classification_report(y_risk_test, y_risk_pred_test, labels=risk_labels_for_report, target_names=le_risk.inverse_transform(risk_labels_for_report), zero_division=0))


# --- Model for MECHANISM OF INTERACTION ---
pipeline_mechanism = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=2)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
print("\nTraining Mechanism of Interaction model...")
pipeline_mechanism.fit(X_train, y_mech_train)
y_mech_pred_test = pipeline_mechanism.predict(X_test)
print("\nMechanism Model Evaluation (on known interactions test set):")
# This will have many classes, so accuracy might be low. The report will be very long.
# We'll just print overall accuracy.
from sklearn.metrics import accuracy_score
print(f"Accuracy: {accuracy_score(y_mech_test, y_mech_pred_test):.2f}")
# mech_labels_for_report = np.unique(np.concatenate((y_mech_test, y_mech_pred_test)))
# print(classification_report(y_mech_test, y_mech_pred_test, labels=mech_labels_for_report, target_names=le_mechanism.inverse_transform(mech_labels_for_report), zero_division=0))


# --- Model for ALTERNATIVES ---
pipeline_alternatives = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=2)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])
print("\nTraining Alternatives model...")
pipeline_alternatives.fit(X_train, y_alt_train)
y_alt_pred_test = pipeline_alternatives.predict(X_test)
print("\nAlternatives Model Evaluation (on known interactions test set):")
print(f"Accuracy: {accuracy_score(y_alt_test, y_alt_pred_test):.2f}")
# alt_labels_for_report = np.unique(np.concatenate((y_alt_test, y_alt_pred_test)))
# print(classification_report(y_alt_test, y_alt_pred_test, labels=alt_labels_for_report, target_names=le_alternatives.inverse_transform(alt_labels_for_report), zero_division=0))

# ... (all your existing code up to model training) ...

print("\n--- Models trained. ---")
print("Note: Low accuracy for Mechanism/Alternatives is expected due to the high number of unique text categories.")
print("The 'predictive' mode for these will be an educated guess from the learned categories.")

# Step 5.1: Save the trained models and LabelEncoders
import joblib
import os

# Create a directory to save the models if it doesn't exist
model_save_path = '/content/ddi_models/' # Or /content/drive/MyDrive/ddi_models/ to save to Drive
os.makedirs(model_save_path, exist_ok=True)

# Save the pipelines (which include TF-IDF vectorizer and classifier)
joblib.dump(pipeline_risk, os.path.join(model_save_path, 'pipeline_risk.joblib'))
joblib.dump(pipeline_mechanism, os.path.join(model_save_path, 'pipeline_mechanism.joblib'))
joblib.dump(pipeline_alternatives, os.path.join(model_save_path, 'pipeline_alternatives.joblib'))
print("Trained pipelines saved.")

# Save the LabelEncoders
joblib.dump(le_risk, os.path.join(model_save_path, 'le_risk.joblib'))
joblib.dump(le_mechanism, os.path.join(model_save_path, 'le_mechanism.joblib'))
joblib.dump(le_alternatives, os.path.join(model_save_path, 'le_alternatives.joblib'))
print("LabelEncoders saved.")

# Save the known_interactions dictionary (useful for the Flask app)
joblib.dump(known_interactions, os.path.join(model_save_path, 'known_interactions.joblib'))
print("Known interactions dictionary saved.")

print(f"All models and supporting files saved to: {model_save_path}")

# You can then download this 'ddi_models' folder from Colab's file browser
# or directly access it if you saved it to your Google Drive.

# ... (rest of your example usage code, if you want to keep it) ...

Dataset loaded successfully.
Shape of dataset: (793, 5)
Shape after dropping NA: (793, 5)
Number of unique known interaction pairs: 698
Unique risk ratings: ['C' 'D' 'X'] (3)
Unique mechanisms: 413
Unique alternatives: 86

Training Risk Rating model...

Risk Rating Model Evaluation (on known interactions test set):
              precision    recall  f1-score   support

           D       0.97      0.88      0.92       138
           X       0.52      0.81      0.63        21

    accuracy                           0.87       159
   macro avg       0.74      0.85      0.78       159
weighted avg       0.91      0.87      0.89       159


Training Mechanism of Interaction model...

Mechanism Model Evaluation (on known interactions test set):
Accuracy: 0.30

Training Alternatives model...

Alternatives Model Evaluation (on known interactions test set):
Accuracy: 0.67

--- Models trained. ---
Note: Low accuracy for Mechanism/Alternatives is expected due to the high number of unique text ca

In [None]:
!pip install Flask
!pip install joblib
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.7-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.7-py3-none-any.whl (23 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.7


In [None]:
%%writefile app.py

from flask import Flask, request, jsonify
import joblib
import os
import pandas as pd # Though not directly used for prediction, good to have for consistency if needed
from pyngrok import ngrok, conf # Import pyngrok

# --- Configuration ---
# Path to the directory where you saved your models and encoders
MODEL_DIR = 'ddi_models/' # Assume ddi_models folder is in the same directory as app.py
                        # If not, provide the full path.

NGROK_AUTHTOKEN = " " # YOUR NGROK AUTHTOKEN

# --- Load Models and Encoders ONCE at startup ---
pipeline_risk, pipeline_mechanism, pipeline_alternatives = None, None, None
le_risk, le_mechanism, le_alternatives = None, None, None
known_interactions = {}

try:
    pipeline_risk = joblib.load(os.path.join(MODEL_DIR, 'pipeline_risk.joblib'))
    pipeline_mechanism = joblib.load(os.path.join(MODEL_DIR, 'pipeline_mechanism.joblib'))
    pipeline_alternatives = joblib.load(os.path.join(MODEL_DIR, 'pipeline_alternatives.joblib'))

    le_risk = joblib.load(os.path.join(MODEL_DIR, 'le_risk.joblib'))
    le_mechanism = joblib.load(os.path.join(MODEL_DIR, 'le_mechanism.joblib'))
    le_alternatives = joblib.load(os.path.join(MODEL_DIR, 'le_alternatives.joblib'))

    known_interactions = joblib.load(os.path.join(MODEL_DIR, 'known_interactions.joblib'))
    print("Models and encoders loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading model files: {e}")
    print(f"Ensure the '{MODEL_DIR}' directory exists and contains all .joblib files.")
    print("The application might not function correctly without these files.")
except Exception as e:
    print(f"An unexpected error occurred during model loading: {e}")

app = Flask(__name__)

def get_interaction_data(drug1, drug2):
    """
    Core logic to get interaction data, similar to the Colab function.
    """
    if not all([pipeline_risk, pipeline_mechanism, pipeline_alternatives, le_risk, le_mechanism, le_alternatives]):
        return {"error": "Models not loaded. Backend issue."}

    # Standardize input drug names
    d1_processed = str(drug1).upper().strip()
    d2_processed = str(drug2).upper().strip()

    pair_key = tuple(sorted((d1_processed, d2_processed)))

    if pair_key in known_interactions:
        interaction_data = known_interactions[pair_key]
        return {
            'DRUG 1': d1_processed,
            'DRUG 2': d2_processed,
            'MECHANISM OF INTERACTION': interaction_data['mechanism'],
            'ALTERNATIVES': interaction_data['alternatives'],
            'RISK RATING': interaction_data['risk'],
            'SOURCE': 'Known from dataset'
        }
    else:
        # Predictive mode
        input_feature_string = ' '.join(sorted((d1_processed, d2_processed)))

        risk_pred_encoded = pipeline_risk.predict([input_feature_string])[0]
        mech_pred_encoded = pipeline_mechanism.predict([input_feature_string])[0]
        alt_pred_encoded = pipeline_alternatives.predict([input_feature_string])[0]

        risk_pred = le_risk.inverse_transform([risk_pred_encoded])[0]
        mech_pred = le_mechanism.inverse_transform([mech_pred_encoded])[0]
        alt_pred = le_alternatives.inverse_transform([alt_pred_encoded])[0]

        return {
            'DRUG 1': d1_processed,
            'DRUG 2': d2_processed,
            'MECHANISM OF INTERACTION': mech_pred,
            'ALTERNATIVES': alt_pred,
            'RISK RATING': risk_pred,
            'SOURCE': 'Predictive (based on model learning)'
        }

@app.route('/predict_ddi', methods=['POST'])
def predict_ddi_endpoint():
    if not request.is_json:
        return jsonify({"error": "Request must be JSON"}), 400

    data = request.get_json()
    drug1 = data.get('drug1')
    drug2 = data.get('drug2')

    if not drug1 or not drug2:
        return jsonify({"error": "Missing 'drug1' or 'drug2' in request body"}), 400

    try:
        result = get_interaction_data(drug1, drug2)
        return jsonify(result)
    except Exception as e:
        # Log the exception for debugging
        app.logger.error(f"Error during prediction: {e}", exc_info=True)
        return jsonify({"error": "An internal server error occurred during prediction."}), 500


@app.route('/')
def home():
    return "Drug Interaction Prediction API is running!"

if __name__ == '__main__':
    # --- Pyngrok Configuration ---
    if not NGROK_AUTHTOKEN or "YOUR_NGROK_AUTHTOKEN" in NGROK_AUTHTOKEN : # Basic check
        print("ERROR: NGROK_AUTHTOKEN is not set correctly in app.py")
        print("The application will run locally only.")
        # Optionally, you could exit here or just run locally without ngrok
        # exit()
    else:
        try:
            conf.get_default().auth_token = NGROK_AUTHTOKEN
            public_url = ngrok.connect(5000) # Flask default port is 5000
            print(f" * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:5000\"")
            print(f" * Access your API POST endpoint at: {public_url}/predict_ddi")
        except Exception as e:
            print(f"Could not start ngrok tunnel. Error: {e}")
            print("The application will run locally only.")

    # Run Flask app
    # Use '0.0.0.0' to make it accessible from your network, not just localhost
    # 'debug=True' is good for development, but turn off for production
    app.run(debug=True, host='0.0.0.0', port=5000)

Overwriting app.py


In [8]:
!python app.py

Models and encoders loaded successfully.
 * ngrok tunnel "NgrokTunnel: "https://a9f5-35-230-171-11.ngrok-free.app" -> "http://localhost:5000"" -> "http://127.0.0.1:5000"
 * Access your API POST endpoint at: NgrokTunnel: "https://a9f5-35-230-171-11.ngrok-free.app" -> "http://localhost:5000"/predict_ddi
 * Serving Flask app 'app'
 * Debug mode: on
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Models and encoders loaded successfully.
t=2025-05-08T14:52:18+0000 lvl=warn msg="can't bind default web address, trying alternatives" obj=web addr=127.0.0.1:4040
t=2025-05-08T14:52:18+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Your account is limited to 1 simultaneous ngrok agent sessions.\nYou can run multiple simultaneous tunnels from a single agent session by defining the tunnels in your agent configuration file and starting 