In [6]:
!pip install pandas nltk spacy scikit-learn seaborn matplotlib autoai-libs

Collecting spacy
  Using cached spacy-3.8.4-cp312-cp312-win_amd64.whl.metadata (27 kB)


ERROR: Could not find a version that satisfies the requirement autoai-libs (from versions: none)
ERROR: No matching distribution found for autoai-libs


In [7]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch the NLTK resources needed for tokenizing, stop-word filtering and
# lemmatizing, in a fixed order and without progress chatter.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# spaCy pipeline that extract_entities uses for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

def _clean_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use.

    The pair is stored on the function object so that repeated calls to
    ``clean_text`` (e.g. once per DataFrame row) do not rebuild the stop-word
    set and the lemmatizer every time.
    """
    cached = getattr(_clean_text_resources, "_cached", None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Lowercase, strip punctuation, drop stop words and lemmatize ``text``.

    Parameters
    ----------
    text : str or scalar
        Raw text. Missing values (``None``, ``NaN``, ...) yield an empty
        string; other non-string scalars are converted with ``str()`` instead
        of failing on ``.lower()``.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # A str is never "missing", so the NA check is only needed here.
        if pd.isna(text):
            return ""
        text = str(text)

    stop_words, lemmatizer = _clean_text_resources()

    # Remove everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = nltk.word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Clinical abbreviations and the full terms they stand for.
_ABBREVIATIONS = {
    "MI": "Myocardial Infarction",
    "DM2": "Type 2 Diabetes",
    "HTN": "Hypertension",
    "CAD": "Coronary Artery Disease",
}

# One alternation anchored on word boundaries, so an abbreviation is replaced
# only when it stands alone and never inside a longer word (e.g. "MILD").
# Longer keys come first so a shorter key can never pre-empt a longer one.
_ABBREVIATION_RE = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(abbrev) for abbrev in sorted(_ABBREVIATIONS, key=len, reverse=True))
    + r')\b'
)


def expand_abbreviations(text):
    """Replace known clinical abbreviations in ``text`` with their full terms.

    Matching is case-sensitive and restricted to whole words, so the
    abbreviations are only found in text that has not been lowercased.

    Parameters
    ----------
    text : str
        Text that may contain the abbreviations "MI", "DM2", "HTN" or "CAD".

    Returns
    -------
    str
        ``text`` with every stand-alone abbreviation expanded.
    """
    return _ABBREVIATION_RE.sub(lambda match: _ABBREVIATIONS[match.group(0)], text)

def normalize_concepts(text):
    """Rewrite lay phrases in ``text`` to their canonical clinical terms.

    Each phrase is matched exactly as written (lower-case) and every
    occurrence is substituted, one phrase after another.

    Parameters
    ----------
    text : str
        Text to normalize.

    Returns
    -------
    str
        The text with all listed phrases replaced.
    """
    replacements = (
        ("heart attack", "Myocardial Infarction"),
        ("high blood pressure", "Hypertension"),
        ("type ii diabetes", "Type 2 Diabetes"),
    )
    result = text
    for phrase, canonical in replacements:
        result = result.replace(phrase, canonical)
    return result

def extract_entities(text):
    """Return the named entities the spaCy pipeline ``nlp`` finds in ``text``.

    Parameters
    ----------
    text : object
        Value to analyse; anything that is not a ``str`` is skipped.

    Returns
    -------
    list of tuple
        ``(entity_text, entity_label)`` pairs, or an empty list for
        non-string input.
    """
    if not isinstance(text, str):
        return []
    return [(entity.text, entity.label_) for entity in nlp(text).ents]

# Load the dataset. If the file is missing, print the short hint and re-raise
# so execution stops here with a traceback; `exit()` is an interactive-shell
# helper and is not a reliable way to halt a notebook cell before the
# statements below touch the undefined `df`.
try:
    df = pd.read_csv('healthcare_data.csv')
except FileNotFoundError:
    print("Error: healthcare_data.csv not found.")
    raise

def _expand_then_clean(text):
    """Expand abbreviations on the raw text, then run the usual cleaning."""
    # Missing / non-string values go straight to clean_text, which handles them.
    if isinstance(text, str):
        text = expand_abbreviations(text)
    return clean_text(text)


# Abbreviations are upper-case ("MI", "HTN", ...), so they must be expanded
# BEFORE clean_text lowercases the description; expanding the already-cleaned
# text could never match anything.
df['cleaned_description'] = df['description'].apply(clean_text)
df['expanded_description'] = df['description'].apply(_expand_then_clean)
df['normalized_description'] = df['expanded_description'].apply(normalize_concepts)
# Entities are extracted from the raw description, not the cleaned text.
df['entities'] = df['description'].apply(extract_entities)

# Keep only the rows that have a diagnosis label.
df = df[df['diagnosis'].notna()]
X, y = df['normalized_description'], df['diagnosis']

# 80/20 split, stratified on the diagnosis and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The TF-IDF vocabulary is learned from the training split only and then
# applied unchanged to the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 1. Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Deliberately small grid (reduced for demonstration).
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)

# 3-fold grid search over the grid above, using all available cores.
rf_grid_search = GridSearchCV(
    rf_model,
    rf_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
rf_grid_search.fit(X_train_tfidf, y_train)
rf_best_model = rf_grid_search.best_estimator_

# Evaluate the selected model on the held-out split.
rf_y_pred = rf_best_model.predict(X_test_tfidf)
print(
    "\nRandom Forest Classification Report:",
    classification_report(y_test, rf_y_pred),
    sep="\n",
)

# 2. Logistic Regression
# The grid below searches both L1 and L2 penalties. The default 'lbfgs' solver
# only supports L2 (or no) penalty, so every 'l1' candidate would fail to fit;
# 'saga' supports both penalties and multiclass targets.
lr_model = LogisticRegression(solver='saga', random_state=42, max_iter=1000)  # Increased max_iter
lr_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2']
}
lr_grid_search = GridSearchCV(lr_model, lr_param_grid, cv=3, n_jobs=-1, verbose=1)
lr_grid_search.fit(X_train_tfidf, y_train)
lr_best_model = lr_grid_search.best_estimator_

# Evaluate the selected model on the held-out split.
lr_y_pred = lr_best_model.predict(X_test_tfidf)
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, lr_y_pred))

# Model Comparison Table
# Pair each model's display name with its test-set predictions, then derive
# both table columns from that single list.
model_predictions = [
    ('Random Forest', rf_y_pred),
    ('Logistic Regression', lr_y_pred),
]
results = {
    'Model': [label for label, _preds in model_predictions],
    'Accuracy': [accuracy_score(y_test, preds) for _label, preds in model_predictions],
}

results_df = pd.DataFrame(results)
print("\nModel Comparison:", results_df, sep="\n")

# Confusion Matrix (for best model - change as needed)
# Pin the label order explicitly so the matrix rows/columns and the tick
# labels are guaranteed to line up; rows are true classes, columns predicted.
cm_labels = list(rf_best_model.classes_)  # Change model as needed
cm = confusion_matrix(y_test, rf_y_pred, labels=cm_labels)  # Change model as needed
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_labels,
    yticklabels=cm_labels,
)
ax.set_xlabel("Predicted diagnosis")
ax.set_ylabel("True diagnosis")
plt.title("Confusion Matrix (Random Forest)")  # Change title as needed
plt.show()

ModuleNotFoundError: No module named 'pandas'