*INSTALLING DEPENDANCIES AND IMPORTING LIBRARIES*

In [2]:

import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


*UPLOADING DRIVE AND READING DATASET*

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv(r'Symptom2Disease.csv')

*DATA EXPLARATORY*

In [4]:
#checking the first 5 rows of the dataset
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [5]:
#checking the 5 last row of the dataset
df.tail()

Unnamed: 0.1,Unnamed: 0,label,text
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."
1199,299,diabetes,I constantly sneeze and have a dry cough. My i...


In [6]:
#checking the shape of the dataset
df.shape

(1200, 3)

In [7]:
#Information about the DataFrame, including the data types and non-null values
df.describe()

Unnamed: 0.1,Unnamed: 0
count,1200.0
mean,149.5
std,86.638166
min,0.0
25%,74.75
50%,149.5
75%,224.25
max,299.0


In [8]:
#display the information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.3+ KB


In [9]:
#checking if the dataset  has null values
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
dtype: int64

In [10]:
df.iloc[0,1]

'Psoriasis'

*DATA CLEANING*

In [11]:
df = df.drop('Unnamed: 0', axis=1)


In [12]:
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


*FEATURE ENGINEERING*


In [13]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [14]:
features = model.encode(df.iloc[:,1])

In [38]:
import pickle
pickle.dump(model, open('sentence_encoding.pkl', 'wb'))

*COSINE SIMILARITY*

In [16]:
test_0 = model.encode(df.iloc[0,1]).reshape(1,-1)
test_0.shape

(1, 384)

In [17]:
test_1 = model.encode(df.iloc[444,1]).reshape(1,-1)
test_1.shape

(1, 384)

In [18]:
metrics.pairwise.cosine_similarity(test_0, test_1)

array([[0.18148914]], dtype=float32)

*CREATING LABELS AND TARGETS FOR CLASSIFICATION*

In [19]:
r_targets = df.iloc[:,1].values

In [20]:
le = LabelEncoder()
targets = le.fit_transform(r_targets)

In [39]:
pickle.dump(le, open('label_encoder.pkl', 'wb'))

In [22]:
features.shape

(1200, 384)

*TRAIN TEST SPLITTING*

In [23]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, train_size=0.75)

In [24]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(900, 384)
(300, 384)
(900,)
(300,)


*MODEL*

In [25]:
clf = MLPClassifier(max_iter=1000)
clf.fit(X_train, y_train)

In [26]:

clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn.fit(X_train, y_train)


In [27]:
clf_svm = svm.SVC()
clf_svm.fit(X_train, y_train)


# KNN

## Model Evaluation

In [28]:
clf.score(X_train,y_train)

1.0

In [29]:
clf.score(X_test, y_test)

0.06666666666666667

In [30]:
clf_svm.score(X_train,y_train)

0.5733333333333334

In [31]:
clf_svm.score(X_test, y_test)

0.023333333333333334

In [32]:
clf_knn.score(X_train,y_train)

0.3488888888888889

In [33]:
clf_knn.score(X_test, y_test)

0.03666666666666667

In [34]:
#Saving the model to a pickle file
pickle.dump(clf, open('classification_model.pkl', 'wb'))

*CREATING A PIPELINE*

In [35]:
def disease_classification(symptom_text):
    model = pickle.load(open('sentence_encoding.sav', 'rb'))
    class_model = pickle.load(open('classification_model.sav', 'rb'))
    label_encoder = pickle.load(open('label_encoder.sav', 'rb'))

    temp_encoding = model.encode(symptom_text)
    temp_prediction = class_model.predict([temp_encoding])
    temp_label = label_encoder.inverse_transform(temp_prediction)

    return temp_label[0]

In [36]:
symptom_text = 'Dry, thick, and raised patches on the skin are the most common sign of psoriasis. These patches are often covered with a silvery-white coating called scale, and they tend to itch.'

In [37]:
disease_classification(symptom_text)

"My skin has been really itchy and there are these rashy spots all over. There are also some patches that look different in color than the rest of my skin, and I've had some bumps that are kind of hard."