<a href="https://colab.research.google.com/github/AryanAhmadChaudhary/Hybrid-ADR-Detection-Rule-Mining-Machine-Learning/blob/main/ADR_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DEPENDENCIES


In [None]:
!pip install datasets mlxtend nltk

In [None]:
# Fetch the NLTK data this notebook relies on: tokenizer models for
# word_tokenize and the stopword lists for stopwords.words("english").
# Both "punkt_tab" and "punkt" are requested -- presumably to cover
# different NLTK versions; confirm before dropping either one.
import nltk
nltk.download('punkt_tab')
nltk.download("punkt")
nltk.download("stopwords")

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from mlxtend.frequent_patterns import apriori, association_rules

  return datetime.utcnow().replace(tzinfo=utc)


# LOAD DATASET

In [None]:
from datasets import load_dataset

# Download the corpus and merge its published train and test splits into a
# single frame with a fresh 0..n-1 index.
dataset = load_dataset("SetFit/ade_corpus_v2_classification")

train_df, test_df = (pd.DataFrame(dataset[split]) for split in ("train", "test"))

df = pd.concat([train_df, test_df], ignore_index=True)

# The textual label duplicates the numeric "label" column, which is the one
# used by the rest of the notebook.
df = df.drop(columns="label_text")

df.head()


# TEXT CLEANING

In [None]:
# English stopwords held in a set so the per-token membership test is cheap.
stop_words = set(stopwords.words("english"))


def clean_text(text):
    """Normalise *text* for the bag-of-words models.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are English stopwords are dropped.

    Returns:
        The remaining tokens joined with single spaces.
    """
    lowered = text.lower()
    return " ".join(
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    )


# Cleaned version of every sentence, stored next to the raw text.
df["clean_text"] = df["text"].apply(clean_text)


# RULE-BASED MODEL (Apriori)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori, association_rules

# NOTE(review): the rules are mined from every row of `df`, i.e. before the
# train/test split made later in the notebook, so sentences that end up in
# the held-out set also shape the rules that are evaluated on them.

# Binary "word present" matrix restricted to 300 vocabulary terms.
vectorizer_rules = CountVectorizer(max_features=300, binary=True)
word_matrix = vectorizer_rules.fit_transform(df["clean_text"]).toarray()

# apriori expects a boolean one-hot frame; passing 0/1 integers only works
# through a deprecated, slower path, so convert explicitly.
words_df = pd.DataFrame(
    word_matrix.astype(bool),
    columns=vectorizer_rules.get_feature_names_out(),
)
# The label joins each transaction as an extra item. The upper-case name
# cannot clash with a vocabulary column because clean_text lower-cases
# every token. Assigned positionally so no index alignment is involved.
words_df["ADR"] = df["label"].to_numpy().astype(bool)

frequent = apriori(words_df, min_support=0.01, use_colnames=True)

rules = association_rules(frequent, metric="lift", min_threshold=1.1)

# Keep only single-word rules of the form {word} -> {ADR} and map each
# trigger word to that rule's confidence.
adr_consequent = frozenset({"ADR"})
rule_triggers = {
    next(iter(antecedent)): confidence
    for antecedent, consequent, confidence in zip(
        rules["antecedents"], rules["consequents"], rules["confidence"]
    )
    if consequent == adr_consequent and len(antecedent) == 1
}




## Rule-Based Predictor (returns prediction + confidence)

In [None]:
def rule_based_predict_with_conf(text):
    """Classify *text* using the mined single-word ADR rules.

    The text is cleaned with ``clean_text`` and every token is looked up in
    ``rule_triggers``; the highest confidence among matching tokens wins.

    Returns:
        ``(1, confidence)`` if at least one token matched a rule with a
        positive confidence, otherwise ``(0, 0)``.
    """
    top_conf = 0
    for token in clean_text(text).split():
        # Tokens without a rule contribute 0, which never beats top_conf.
        token_conf = rule_triggers.get(token, 0)
        if token_conf > top_conf:
            top_conf = token_conf

    return (1, top_conf) if top_conf > 0 else (0, 0)


# TRADITIONAL ML MODELS

## Data Split

In [None]:
from sklearn.model_selection import train_test_split

y = df["label"]

# Split before vectorizing so the vocabulary is learned from training rows
# only; fitting on the full corpus would let held-out sentences influence
# which 2000 features are kept. The row partition is the same as splitting
# the feature matrix, since it depends only on the row count and the seed.
(
    clean_train, clean_test,
    y_train, y_test,
    text_train, text_test,
) = train_test_split(
    df["clean_text"], y, df["text"], test_size=0.2, random_state=42
)

vectorizer = CountVectorizer(max_features=2000)
X_train = vectorizer.fit_transform(clean_train)
X_test = vectorizer.transform(clean_test)

# Full-corpus matrix, kept under its original name for any cell that reads it.
X = vectorizer.transform(df["clean_text"])


## Model Training

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the bag-of-words training matrix,
# with the solver's iteration cap set to 300.
lr_model = LogisticRegression(max_iter=300)
lr_model.fit(X_train, y_train)


### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM trained on the same bag-of-words training matrix. The seed
# matches the one used for the train/test split so that repeated runs of
# the notebook produce the same model and the same reported scores.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)


## Model Predictions

### Logistic Regression

In [None]:
# Logistic-regression labels for the held-out feature matrix.
lr_pred = lr_model.predict(X_test)

### SVM

In [None]:
# Linear-SVM labels for the held-out feature matrix.
svm_pred = svm_model.predict(X_test)

# HYBRID MODELS (Rule + ML)

## Hybrid Logistic Regression

In [None]:
def hybrid_lr_predict(text, rule_conf_threshold=0.80):
    """Combine the rule-based predictor with logistic regression.

    A rule hit whose confidence reaches *rule_conf_threshold* decides the
    label outright; otherwise the logistic-regression prediction is used.

    Args:
        text: Raw input sentence.
        rule_conf_threshold: Minimum rule confidence required to override
            the model. Defaults to 0.80.

    Returns:
        1 when a sufficiently confident rule fires, else the label
        predicted by ``lr_model``.
    """
    rule_pred, rule_conf = rule_based_predict_with_conf(text)
    if rule_pred == 1 and rule_conf >= rule_conf_threshold:
        return 1

    # The vectorizer was fitted on clean_text output, so the sentence has to
    # go through the same cleaning step before it is vectorized; feeding raw
    # text would tokenize it differently from the training features.
    features = vectorizer.transform([clean_text(text)])
    return lr_model.predict(features)[0]



## Hybrid SVM

In [None]:
def hybrid_svm_predict(text, rule_conf_threshold=0.80):
    """Combine the rule-based predictor with the linear SVM.

    A rule hit whose confidence reaches *rule_conf_threshold* decides the
    label outright; otherwise the SVM prediction is used.

    Args:
        text: Raw input sentence.
        rule_conf_threshold: Minimum rule confidence required to override
            the model. Defaults to 0.80.

    Returns:
        1 when a sufficiently confident rule fires, else the label
        predicted by ``svm_model``.
    """
    rule_pred, rule_conf = rule_based_predict_with_conf(text)
    if rule_pred == 1 and rule_conf >= rule_conf_threshold:
        return 1

    # The vectorizer was fitted on clean_text output, so the sentence has to
    # go through the same cleaning step before it is vectorized; feeding raw
    # text would tokenize it differently from the training features.
    features = vectorizer.transform([clean_text(text)])
    return svm_model.predict(features)[0]


## Hybrid Predictions

In [None]:
# Run both hybrid predictors over the raw held-out sentences.
hybrid_lr_preds = list(map(hybrid_lr_predict, text_test))
hybrid_svm_preds = list(map(hybrid_svm_predict, text_test))


# EVALUATION

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(name, y_true, y_pred):
    """Score one model's predictions.

    Args:
        name: Label to show for the model in the results table.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as *y_true*.

    Returns:
        A dict with the keys "Model", "Accuracy" and "F1 Score".
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return {"Model": name, "Accuracy": accuracy, "F1 Score": f1}


# One entry per evaluated model, appended by the cells that follow.
results = list()


## Rule-Based

In [None]:

# Only the predicted label (first tuple element) is needed for scoring.
rule_preds = [
    label for label, _conf in map(rule_based_predict_with_conf, text_test)
]
results.append(evaluate("Rule-Based", y_test, rule_preds))


## ML Models

### Logistic Regression

In [None]:
# Record the stand-alone logistic-regression scores.
results.append(evaluate("Logistic Regression", y_test, lr_pred))


### SVM

In [None]:
# Record the stand-alone linear-SVM scores.
results.append(evaluate("SVM", y_test, svm_pred))


## Hybrid Models

### Hybrid Logistic Regression

In [None]:
# Record the scores of the rule + logistic-regression hybrid.
results.append(evaluate("Hybrid (Rule + LR)", y_test, hybrid_lr_preds))


### Hybrid SVM

In [None]:
# Record the scores of the rule + SVM hybrid.
results.append(evaluate("Hybrid (Rule + SVM)", y_test, hybrid_svm_preds))


# FINAL COMPARISON TABLE

In [None]:
# Turn the collected per-model metric dicts into one table; the bare name
# on the last line makes the notebook display it.
df_results = pd.DataFrame(results)
df_results
