## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from joblib import dump, load
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

## Load dataset

In [2]:
file_path = 'D:/Bimar-backend/AI/archive/Final_Augmented_dataset_Diseases_and_Symptoms.csv'

df = pd.read_csv(file_path)
print('Data Preview:')
df.head(20)

Data Preview:


Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5,panic disorder,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,panic disorder,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,panic disorder,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
8,panic disorder,1,0,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9,panic disorder,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Dataframe shape

In [3]:
df.shape

(246945, 378)

### Check for duplicates

In [4]:
df.duplicated().value_counts()

False    189647
True      57298
Name: count, dtype: int64

### Remove duplicated rows and columns

In [5]:
df = df.drop_duplicates().reset_index(drop=True)
df.duplicated().sum()

np.int64(0)

In [6]:
df.columns

Index(['diseases', 'anxiety and nervousness', 'depression',
       'shortness of breath', 'depressive or psychotic symptoms',
       'sharp chest pain', 'dizziness', 'insomnia',
       'abnormal involuntary movements', 'chest tightness',
       ...
       'stuttering or stammering', 'problems with orgasm', 'nose deformity',
       'lump over jaw', 'sore in nose', 'hip weakness', 'back swelling',
       'ankle stiffness or tightness', 'ankle weakness', 'neck weakness'],
      dtype='object', length=378)

### Replace spaces with underscores in column names

In [7]:
df.columns = df.columns.str.replace(' ', '_')
df.columns

Index(['diseases', 'anxiety_and_nervousness', 'depression',
       'shortness_of_breath', 'depressive_or_psychotic_symptoms',
       'sharp_chest_pain', 'dizziness', 'insomnia',
       'abnormal_involuntary_movements', 'chest_tightness',
       ...
       'stuttering_or_stammering', 'problems_with_orgasm', 'nose_deformity',
       'lump_over_jaw', 'sore_in_nose', 'hip_weakness', 'back_swelling',
       'ankle_stiffness_or_tightness', 'ankle_weakness', 'neck_weakness'],
      dtype='object', length=378)

## Text Column Encoding.

In [8]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['diseases'])
# Check result
print(df[['diseases', 'label']].head(10))

         diseases  label
0  panic disorder    531
1  panic disorder    531
2  panic disorder    531
3  panic disorder    531
4  panic disorder    531
5  panic disorder    531
6  panic disorder    531
7  panic disorder    531
8  panic disorder    531
9  panic disorder    531


## Split features and target

In [9]:
# Remove the diseases column
df = df.drop(["diseases"], axis=1).copy()
df.head()

X = df.drop(["label"], axis=1)
y = df["label"]

In [10]:
X.shape, y.shape

((189647, 377), (189647,))

In [11]:
X.head()
X.columns
dump(list(X.columns), "symptom_columns.pkl")


['symptom_columns.pkl']

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
x_train.shape, x_test.shape, y_test.shape, y_train.shape

((151717, 377), (37930, 377), (37930,), (151717,))

## Create and train LinearSVC model

In [13]:
print("\nTraining Linear SVC model...")
svc_model = LinearSVC(max_iter=1000)
svc_model.fit(x_train, y_train)

# Predict and evaluate
svc_preds = svc_model.predict(x_test)
svc_accuracy = accuracy_score(y_test, svc_preds)
print(f"Linear SVC Accuracy: {svc_accuracy:.4f}")
print(classification_report(y_test, svc_preds))


Training Linear SVC model...
Linear SVC Accuracy: 0.8308
              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.94      0.96      0.95        49
           2       0.73      0.76      0.75        29
           3       1.00      1.00      1.00         1
           4       0.75      0.84      0.79        25
           5       0.00      0.00      0.00         2
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         4
           8       0.80      0.65      0.72        82
           9       0.96      0.69      0.80       155
          10       0.92      0.93      0.93       238
          11       0.77      0.64      0.70       247
          12       0.68      0.81      0.74       166
          14       0.33      0.27      0.30        30
          15       0.97      0.92      0.95       168
          16       0.72      0.81      0.76       108
          17       0.77

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
print("Saving trained model and symptom list...")
dump(svc_model, "new_doctor_specialty_model.joblib")
dump(list(X.columns), "new_symptom_list.joblib")
print("Model and symptom list saved successfully!")

Saving trained model and symptom list...
Model and symptom list saved successfully!


In [15]:
X.head()

Unnamed: 0,anxiety_and_nervousness,depression,shortness_of_breath,depressive_or_psychotic_symptoms,sharp_chest_pain,dizziness,insomnia,abnormal_involuntary_movements,chest_tightness,palpitations,...,stuttering_or_stammering,problems_with_orgasm,nose_deformity,lump_over_jaw,sore_in_nose,hip_weakness,back_swelling,ankle_stiffness_or_tightness,ankle_weakness,neck_weakness
0,1,0,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
