In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn as skl
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve

In [2]:
df = pd.read_csv('data/one_hot.csv')

In [3]:
df['Irritability']

0       0
1       0
2       0
3       0
4       0
       ..
4915    0
4916    0
4917    0
4918    0
4919    0
Name: Irritability, Length: 4920, dtype: int64

Idea: create model that predicts irregularity based on disease and symptoms

How: create training dataset by sampling from original, adding symptoms for irregular examples and subtracting for non-irregular

In [4]:
irregular_symptoms_all = (df.groupby('Disease').sum() == 0)
irregular_symptoms_all.head()

Unnamed: 0_level_0,(Vertigo) Paroymsal Positional Vertigo,Abdominal Pain,Abnormal Menstruation,Acidity,Acne,Acute Liver Failure,Aids,Alcoholic Hepatitis,Allergy,Altered Sensorium,...,Vomiting,Watering From Eyes,Weakness In Limbs,Weakness Of One Body Side,Weight Gain,Weight Loss,Yellow Crust Ooze,Yellow Urine,Yellowing Of Eyes,Yellowish Skin
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo) Paroymsal Positional Vertigo,False,True,True,True,True,True,True,True,True,True,...,False,True,True,True,True,True,True,True,True,True
AIDS,True,True,True,True,True,True,False,True,True,True,...,True,True,True,True,True,True,True,True,True,True
Acne,True,True,True,True,False,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
Alcoholic hepatitis,True,False,True,True,True,True,True,False,True,True,...,False,True,True,True,True,True,True,True,True,False
Allergy,True,True,True,True,True,True,True,True,False,True,...,True,False,True,True,True,True,True,True,True,True


In [5]:
symptoms = df.iloc[:, 1:]
symptoms

Unnamed: 0,(Vertigo) Paroymsal Positional Vertigo,Abdominal Pain,Abnormal Menstruation,Acidity,Acne,Acute Liver Failure,Aids,Alcoholic Hepatitis,Allergy,Altered Sensorium,...,Vomiting,Watering From Eyes,Weakness In Limbs,Weakness Of One Body Side,Weight Gain,Weight Loss,Yellow Crust Ooze,Yellow Urine,Yellowing Of Eyes,Yellowish Skin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4916,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
diseases = pd.get_dummies(df['Disease'])
diseases

Unnamed: 0,(vertigo) Paroymsal Positional Vertigo,AIDS,Acne,Alcoholic hepatitis,Allergy,Arthritis,Bronchial Asthma,Cervical spondylosis,Chicken pox,Chronic cholestasis,...,Osteoarthristis,Paralysis (brain hemorrhage),Peptic ulcer diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary tract infection,Varicose veins,hepatitis A
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4916,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4917,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4918,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [7]:
np.random.seed(42)
irregular_indices = np.random.choice(range(diseases.shape[0]), size=5000)

In [8]:
result = []
for index in irregular_indices:
    reg = symptoms.iloc[index]
    irr = symptoms.columns[~reg.astype(bool)]
    disease = diseases.columns[index]
    while True:
        irr_symptom = np.random.choice(irr)
        if (irregular_symptoms_all.loc[disease, irr_symptom]):
            reg[irr_symptom] = 1
            break
        else:
            continue
    result.append(reg)
gen_irr_symptoms = pd.concat(result, axis=1).T

sampled_diseases = diseases.loc[irregular_indices]
gen_irr_symptoms = pd.concat([sampled_diseases, gen_irr_symptoms], axis=1)
gen_irr_symptoms['Irregular'] = 1
gen_irr_symptoms

IndexError: index 860 is out of bounds for axis 0 with size 41

In [None]:
gen_reg_symptoms = diseases.sample(5000, replace=True)
sampled_symptoms = symptoms.loc[gen_reg_symptoms.index]
gen_reg_symptoms = pd.concat([gen_reg_symptoms, sampled_symptoms], axis=1)
gen_reg_symptoms['Irregular'] = 0
gen_reg_symptoms

In [9]:
reg_irreg = pd.concat([gen_reg_symptoms, gen_irr_symptoms])
reg_irreg

NameError: name 'gen_reg_symptoms' is not defined

In [10]:
model = LogisticRegression()
X = reg_irreg.iloc[:, :-1]
y = reg_irreg['Irregular']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model.fit(X_train, y_train)
model.score(X_test, y_test)

NameError: name 'reg_irreg' is not defined

In [11]:
test_probs_irr = model.predict_proba(X_test)[:, 1]
test_pred_irr = model.predict(X_test)

NameError: name 'X_test' is not defined

In [12]:
# sns.histplot(test_probs_irr, bins=30)

In [13]:
thresh = 0.4
test_pred_irr = (test_probs_irr >= thresh)
tp_count = (test_pred_irr & y_test).sum()
fp_count = (test_pred_irr & ~y_test).sum()
tn_count = (~test_pred_irr & ~y_test).sum()
fn_count = (~test_pred_irr & y_test).sum()

prec = tp_count / (tp_count + fp_count)
rec = tp_count / (tp_count + fn_count)
prec, rec

NameError: name 'test_probs_irr' is not defined

In [14]:
fpr, tpr, thresh = roc_curve(y_test, test_probs_irr)
# sns.lineplot(fpr, tpr)

NameError: name 'y_test' is not defined

In [15]:
precs, recs, thresh = precision_recall_curve(y_test, test_probs_irr)
sns.lineplot(x=recs, y=precs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

NameError: name 'y_test' is not defined

In [16]:
import pickle

# Specify the filename where the model will be saved
filename = 'my_model.pkl'

# Open the file in write-binary mode and save the model
with open(filename, 'wb') as file:
    pickle.dump(model, file)


In [17]:
X = reg_irreg.iloc[42, :-1].to_numpy().reshape(1, -1)
model.predict(X)

NameError: name 'reg_irreg' is not defined