In [1]:
import pandas as pd
import re
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

df = pd.read_csv("A_Z_medicines_dataset_of_India.csv")

df["price"] = pd.to_numeric(df["price"], errors="coerce")

def clean_composition(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"[()\[\],]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["composition"] = (
    df["short_composition1"].fillna("") + " " +
    df["short_composition2"].fillna("")
)

df["clean_composition"] = df["composition"].apply(clean_composition)
df = df[df["clean_composition"] != ""].reset_index(drop=True)

df["clean_name"] = df["name"].str.lower().str.strip()

def extract_dosage(text):
    return " ".join(
        re.findall(r"\d+\s*mg|\d+\s*ml|\d+\s*mcg", text.lower())
    )

df["dosage"] = df["name"].apply(extract_dosage)

df["drug_group"] = df["clean_composition"].astype("category").cat.codes

group_counts = df["drug_group"].value_counts()
valid_groups = group_counts[group_counts > 1].index
df = df[df["drug_group"].isin(valid_groups)].reset_index(drop=True)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["drug_group"])

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=8000
)

X = tfidf.fit_transform(df["clean_composition"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

model = SGDClassifier(
    loss="log_loss",
    max_iter=1000,
    tol=1e-3,
    n_jobs=-1
)

model.fit(X_train, y_train)

joblib.dump(model, "drug_recommendation_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

model = joblib.load("drug_recommendation_model.pkl")
tfidf = joblib.load("tfidf_vectorizer.pkl")
label_encoder = joblib.load("label_encoder.pkl")

def recommend_alternatives_ml(medicine_name, top_n=10):
    key = medicine_name.lower().strip()

    exact_match = df[df["clean_name"] == key]

    if not exact_match.empty:
        row = exact_match.iloc[0]
    else:
        matches = df[df["clean_name"].str.contains_]()]()


  ys_types = set(type_of_target(x) for x in ys)


In [2]:
import numpy as np

def batch_predict(model, X, batch_size=2000):
    predictions = []
    for i in range(0, X.shape[0], batch_size):
        X_batch = X[i:i + batch_size]
        predictions.append(model.predict(X_batch))
    return np.concatenate(predictions)


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = batch_predict(model, X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1 Score :", f1)


Accuracy : 0.7825667804780077
Precision: 0.6477126283929029
Recall   : 0.7825667804780077
F1 Score : 0.701268366710751


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
