In [None]:
import pandas as pd
import numpy as np

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from  sklearn import svm

In [None]:
# Fetch the NLTK resources this notebook asks for. The first two are fetched in a
# loop; the last call is left as a bare expression so its return value remains the
# cell's displayed output.
for resource_name in ('gutenberg', 'stopwords'):
  nltk.download(resource_name)
nltk.download('wordnet')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**Reading the medical dialog dataset**

In [None]:
## reading the data into a dataframe
HEALTHCARE_COLUMNS = ['id', 'description', 'patient', 'doctor']

# Section-marker line -> column that the following free-text lines belong to.
_SECTION_FIELDS = {'Description': 'description', 'Patient': 'patient', 'Doctor': 'doctor'}


def _finish_record(record_id, sections):
  """Turn the lines collected for one record into a row dict.

  Returns None when no record is open or when any of the description /
  patient / doctor sections is empty, so incomplete records are skipped.
  """
  if record_id is None:
    return None
  row = {'id': record_id}
  for field in HEALTHCARE_COLUMNS[1:]:
    row[field] = ''.join(sections.get(field, ())).strip()
    if not row[field]:
      return None
  return row


def parse_healthcare_dialogues(lines, max_records=None):
  """Parse dialogue text into a list of {'id', 'description', 'patient', 'doctor'} dicts.

  NOTE(review): the layout below is inferred from the previously saved output of
  this notebook -- confirm against the raw file:

      id=<number>
      <url>
      Description
      <text>
      Dialogue
      Patient:
      <text>
      Doctor:
      <text>

  Args:
    lines: iterable of text lines (an open file works).
    max_records: stop after this many complete records; None reads everything.

  Returns:
    A list of row dicts in file order. Text fields are stripped of surrounding
    whitespace and the id no longer carries its trailing newline.

  Differences from the previous inline loop:
    * A record is emitted only once it is finished (next ``id=`` line or end of
      input), so ``doctor`` holds that record's own reply instead of the URL /
      the previous record's reply.
    * The URL line between ``id=`` and ``Description`` is discarded.
    * Markers must match the whole line, so free text that merely starts with
      "id", "Doctor", "Patient", ... is no longer mistaken for a marker.
  """
  records = []
  if max_records is not None and max_records <= 0:
    return records

  record_id = None      # id of the record being assembled, None before the first header
  sections = {}         # field name -> list of raw lines collected so far
  current_field = None  # field that free-text lines are currently appended to

  for line in lines:
    stripped = line.strip()

    if line.startswith('id=') and stripped[3:].isdigit():
      # A new header closes the previous record.
      row = _finish_record(record_id, sections)
      if row is not None:
        records.append(row)
        if max_records is not None and len(records) >= max_records:
          return records
      record_id, sections, current_field = stripped[3:], {}, None
      continue

    marker = stripped.rstrip(':')
    if marker in _SECTION_FIELDS:
      current_field = _SECTION_FIELDS[marker]
    elif not stripped or marker in ('Dialogue', 'Dialog'):
      # Blank lines and the dialogue header carry no content.
      continue
    elif record_id is not None and current_field is not None:
      sections.setdefault(current_field, []).append(line)
    # Anything else (e.g. the URL right after the id line) is ignored.

  # The last record has no following header to close it.
  row = _finish_record(record_id, sections)
  if row is not None:
    records.append(row)
  return records


# `with` guarantees the file is closed; the frame is built once from a list of
# records because DataFrame.append was removed in pandas 2.0 and was quadratic.
with open("/content/drive/MyDrive/Disease_Prediction_dataset/healthcaremagic_dialogue_.txt", "r") as f:
  healthcare = pd.DataFrame(
      parse_healthcare_dialogues(f, max_records=10000),
      columns=HEALTHCARE_COLUMNS,
  )

In [None]:
# Preview the first rows to eyeball the id / description / patient / doctor columns.
healthcare.head()

Unnamed: 0,id,description,patient,doctor
0,0\n,How can one treat intense pain in the stomach ...,"Hi,I m XXXX,I am ulcer patient ,I did my endos...",https://www.healthcaremagic.com/questions/How-...
1,1\n,What causes abdominal pain similar to menstrua...,I took the shot and started duphaston pills fo...,"Hello,I don't think your ulcer is coming back ..."
2,2\n,Suggest remedies for recurrent abdominal pain ...,Hi my name is XXXX. I have been having stomach...,"Hello,Duphaston is a hormonal preparation and ..."
3,3\n,What causes sensation of weird movements in th...,"Hi, my name is XXXX I m a 19year old girl and ...","Hello,I read carefully your query and understa..."
4,4\n,Suggest remedies for recurrent pain in the abd...,Hi! My name is XXXX and I used to take an insa...,"Hi,There can be numerous clinical conditions t..."


In [None]:
# Number of parsed records (the loader above stops once it has 10,000 rows).
healthcare.shape[0]

10000

**Reading the movie lines data**

In [None]:
# Load the first 10,000 rows of the tab-separated movie-lines file. The file has
# no header row, so pandas labels the columns with integers.
movie_lines = pd.read_csv(
    "/content/drive/MyDrive/Disease_Prediction_dataset/movie_lines.tsv",
    sep="\t",
    header=None,
    nrows=10000,
)

movie_lines.head()

Unnamed: 0,0,1,2,3,4
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [None]:
# Positive class: every healthcare description is labelled medical = 1.
medicaldf = pd.DataFrame({
    "text": healthcare['description'],
    "medical": [1] * len(healthcare),
})


In [None]:
# Negative class: column 4 of the movie-lines table is labelled medical = 0.
moviesdf = pd.DataFrame({
    "text": movie_lines[4],
    "medical": [0] * len(movie_lines),
})

## Concatenating the two datasets and splitting them into training and test sets

In [None]:
# Stack the medical rows on top of the movie rows (each frame keeps its own row
# labels), then print a summary of the combined frame.
df = pd.concat(objs=(medicaldf, moviesdf), axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     19909 non-null  object
 1   medical  20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 468.8+ KB


In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

**A function that tokenizes the text, removes stop words, then stems and lemmatizes each token**

In [None]:
#cleaning
# Filled on first use and reused afterwards, so the stop-word list is not re-read
# and the tokenizer / stemmer / lemmatizer are not rebuilt on every call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
  """Build the tokenizer, stop-word set, stemmer and lemmatizer once and return them."""
  if not _CLEAN_TEXT_RESOURCES:
    _CLEAN_TEXT_RESOURCES.update(
        tokenizer=nltk.RegexpTokenizer(r"\w+"),
        stop_words=frozenset(stopwords.words("english")),
        stemmer=PorterStemmer(),
        lemmatizer=WordNetLemmatizer(),
    )
  return _CLEAN_TEXT_RESOURCES


def clean_text(text):
  """Normalise a piece of text for the TF-IDF vectoriser.

  Steps, in order: split on runs of word characters (``\\w+``), lower-case,
  drop English stop words, Porter-stem each remaining token, then pass the
  stem through the WordNet lemmatizer.

  Args:
    text: the raw string to clean.

  Returns:
    The processed tokens joined by single spaces ('' when nothing survives).
  """
  resources = _clean_text_resources()
  stop_words = resources["stop_words"]
  stemmer = resources["stemmer"]
  lemmatizer = resources["lemmatizer"]

  lowered_tokens = (token.lower() for token in resources["tokenizer"].tokenize(text))
  return ' '.join(
      lemmatizer.lemmatize(stemmer.stem(token))
      for token in lowered_tokens
      if token not in stop_words
  )


In [None]:
# Run every raw text through clean_text and store the result in a new column.
df['cleantext'] = [clean_text(raw_text) for raw_text in df['text']]

In [None]:
# Compare the raw `text` column with the normalised `cleantext` column.
df.head()

Unnamed: 0,text,medical,cleantext
0,How can one treat intense pain in the stomach ...,1,one treat intens pain stomach suffer ulcer
1,What causes abdominal pain similar to menstrua...,1,caus abdomin pain similar menstrual pain take ...
2,Suggest remedies for recurrent abdominal pain ...,1,suggest remedi recurr abdomin pain block bowel
3,What causes sensation of weird movements in th...,1,caus sensat weird movement abdomen along sharp...
4,Suggest remedies for recurrent pain in the abd...,1,suggest remedi recurr pain abdomen chest heart...


In [None]:
# Splitting the data into train and test with ratio 80 : 20.
# random_state pins the shuffle so the split, and every metric reported from it,
# is the same on each run.
features = np.array(df['cleantext'])
labels = np.array(df['medical'])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(X_train.shape)
print(X_test.shape)

(15927,)
(3982,)


## the model

In [None]:

# TF-IDF transformation followed by a linear-kernel SVM classifier (C=1).
# NOTE: the `pipe_DT` / `y_pred_DT` names are historical -- the estimator is an
# SVC, not a decision tree. The names are kept because the cell that saves the
# model refers to `pipe_DT`.

pipe_DT = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', svm.SVC(kernel='linear', C=1)),
])
pipe_DT.fit(X_train, y_train)
y_pred_DT = pipe_DT.predict(X_test)

#Evaluation
# The header now names the model that is actually trained.
print('\033[1m{:10s}\033[0m'.format('linear_svm_model'))
print("Report : ", classification_report(y_test, y_pred_DT))
print('\033[1m{:10s}\033[0m'.format("Accuracy:"), metrics.accuracy_score(y_test, y_pred_DT))

[1mdecision_tree_entropoy_model[0m
Report :                precision    recall  f1-score   support

           0       0.99      1.00      0.99      1919
           1       1.00      0.99      0.99      2063

    accuracy                           0.99      3982
   macro avg       0.99      0.99      0.99      3982
weighted avg       0.99      0.99      0.99      3982

[1mAccuracy: [0m 0.9914615770969362


In [None]:
# `load` is imported here as well because the next cell uses it.
from joblib import dump, load
# Persist the fitted pipeline (TF-IDF vectoriser + classifier) to the working directory.
dump(pipe_DT, 'medical_relevence_classifier.joblib') 

['medical_relevence_classifier.joblib']

In [None]:
# Read the pipeline back from the file written by the previous cell; predict_text uses `clf`.
clf = load('medical_relevence_classifier.joblib')

In [None]:
def predict_text(msg):
  """Classify one message with the reloaded pipeline `clf`.

  The message goes through clean_text first, then is passed to
  ``clf.predict`` as a one-element list; the prediction is returned as-is.
  """
  return clf.predict([clean_text(msg)])

In [None]:
# Spot check with a symptom sentence (the saved run returned 1).
predict_text('my stomach hurts')

array([1])

In [None]:
# Spot check with a short, generic complaint (the saved run returned 1).
predict_text('i have pain')

array([1])

In [None]:
# Spot check with a sentence that mentions no symptom (the saved run returned 0).
predict_text('i am very bored')

array([0])

In [None]:
!pip install -U scikit-learn

