In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# data: synthetic notes with clear cues; classes are meant to be balanced.
# Each note must be followed by a comma: Python silently concatenates
# adjacent string literals, which would merge several notes into one sample.
diabetes = [
    "Elevated blood sugar with polydipsia and polyuria.",
    "A1C above target; start or change insulin.",
    "Polyuria and polydipsia; plan metform.",
    "Postprandial hyperglycemia; lifestyle counseling.",
    "Diabetic neuropathy of feet; ongoing insulin treatment",
    "Hypoglycemia overnight; review carbohydrate counting.",
    "Referral of retinopathy screening",
    "Glucometer records of morning hyperglycemia",
    "Ketones on urine dip; unintended weight loss",
    "Foot exam decreased sensation; long-standing diabetes.",
    "Insulin pen directions; modify basal rate.",
    "High fasting blood glucose despite diet",
    "Follow-up for diabetes; A1C improvement",
    "CGM shows peaks after eating.",
    "Neuropathy burning pain; tighten glycemic control",
]

# Hypertension notes, one string per list element. The comma after every
# note is required: adjacent string literals without one are concatenated
# by Python into a single, merged sample.
hypertension = [
    "BP 150/95 with headaches; start ACE inhibitor.",
    "Hypertensive patient advised low-sodium DASH diet",
    "clinic bp raised; ECG normal; no chest pain.",
    "Stage 2 hypertension; add thiazide diure",
    "Mild edema; adjust antihypertensive meds.",
    "Nocturnal hypertension; screen for sleep apnea.",
    "Occipital headaches w/ high BP; work-up to be planned",
    "Systolic BP remains high; add Ca-channel blocker.",
    "Hypertensive urgency ruled out; reinforce adherence.",
    "Home BP readings are persistently elevated.",
    "Start ARB for ACE cough back in two weeks.",
    "Suspect resistant hypertension; consider spironolactone.",
    "Lifestyle counseling on physical activity and sodium restriction.",
    "Ambulatory BP confirms daytime hypertension.",
    "Titrate medication to goal < 130/80",
]

# General check-up notes, one string per list element. The comma after every
# note is required: adjacent string literals without one are concatenated
# by Python into a single, merged sample.
checkup = [
    "Annual exam; vaccines reviewed; no acute problems.",
    "Typical PT/physical exam;labs requested/discuss diet and exercise.",
    "Feels well; normal vitals; continue present regimen.",
    "Come for wellness; sleep and stress coaching.",
    "No major history; prevention screening was discussed.",
    "Follow-up for lifestyle; increased activity and balanced diet.",
    "Preventive care; BMI improved; encourage fluids.",
    "General check-up; normal exam; return in one year.",
    "Routine labs ordered; denies symptoms.",
    "Health maintenance; colon screening due.",
    "Vision and dental referrals; no new issues.",
    "Flu shot given; safety counseling.",
    "Tdap updated; CBC and CMP ordered.",
    "Routine exam; follow healthy habits.",
    "No medication; check family history and vaccinate.",
]


# Assemble the corpus and one label per note, in the same order.
textual_information = diabetes + hypertension + checkup
appropriate_labels = (
    ["Diabetes"] * len(diabetes)
    + ["Hypertension"] * len(hypertension)
    + ["General Check-up"] * len(checkup)
)

# Hold out roughly 20% of the notes for testing, but never fewer than one per
# class: a stratified split needs at least as many test samples as classes.
# A fixed test_size=3 would leave a single test note per class no matter how
# large the corpus is, which makes the reported metrics extremely noisy.
n_classes = len(set(appropriate_labels))
holdout_size = max(n_classes, round(0.2 * len(textual_information)))

X_train_txt, X_test_txt, y_train, y_test = train_test_split(
    textual_information,
    appropriate_labels,
    test_size=holdout_size,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=appropriate_labels,
)

# Fit the vocabulary/IDF weights on the training notes only, then reuse the
# fitted vectorizer on the test notes so no test information leaks in.
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
X_train_Variable = tfidf.fit_transform(X_train_txt)
X_test_Variable = tfidf.transform(X_test_txt)

# max_iter is raised above the default to give the solver room to converge.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_Variable, y_train)


print("\nNotes:")
print("- Used TF-IDF with uni+bi-grams/English stopwords")
print("- Logistic Regression trained on sparse TF-IDF features.")
print("- Stratified split to keep classes even.")





# Score the classifier on the held-out notes: overall accuracy first, then
# per-class precision/recall/F1. zero_division=0 reports 0.0 (instead of
# emitting a warning) for any class that never appears among the predictions.
y_pred = clf.predict(X_test_Variable)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {acc*100:.2f}%")
print("Classification Report:")
print(report)



Notes:
- Used TF-IDF with uni+bi-grams/English stopwords
- Logistic Regression trained on sparse TF-IDF features.
- Stratified split to keep classes even.
Accuracy: 33.33%
Classification Report:
                  precision    recall  f1-score   support

        Diabetes       0.00      0.00      0.00         1
General Check-up       0.00      0.00      0.00         1
    Hypertension       0.33      1.00      0.50         1

        accuracy                           0.33         3
       macro avg       0.11      0.33      0.17         3
    weighted avg       0.11      0.33      0.17         3



Feature extraction transforms raw text into numerical vectors. Good features keep the informative words and phrases, filter out noise, and let the model separate the classes efficiently.





Naive Bayes. Pros: very fast and a solid baseline for high-dimensional sparse text. Cons: it assumes the features are conditionally independent given the class, so it can miss interactions between them.