In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Load the raw question/answer pairs from the mounted Drive folder and preview them.
RAW_DATASET_PATH = "/content/drive/MyDrive/Supportiv/intern_screening_dataset.csv"
df = pd.read_csv(RAW_DATASET_PATH)
df.head()

Unnamed: 0,question,answer
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...
3,Who is at risk for Glaucoma? ?,Anyone can develop glaucoma. Some people are a...
4,How to prevent Glaucoma ?,"At this time, we do not know how to prevent gl..."


In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(16406, 2)

In [None]:
# Count missing values per column.
df.isna().sum()

question    0
answer      5
dtype: int64

In [None]:
# Show the rows whose answer is missing.
missing_answer_mask = df["answer"].isna()
df[missing_answer_mask]

Unnamed: 0,question,answer
3587,What is (are) HELLP syndrome ?,
3836,What is (are) X-linked lymphoproliferative syn...,
4196,What is (are) Familial HDL deficiency ?,
4429,What is (are) Emery-Dreifuss muscular dystroph...,
6689,What is (are) Emery-Dreifuss muscular dystroph...,


In [None]:
# Drop every row that has a missing value in any column.
# Reassign instead of using inplace=True: the result is the same, but the
# statement is explicit about producing a new frame and can be chained.
df = df.dropna()

In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

48

In [None]:
# Keep the first occurrence of each fully identical row and discard the rest.
# Reassign instead of using inplace=True so the step reads as "new frame from old".
df = df.drop_duplicates()

In [None]:
# Shape after removing rows with missing values and exact duplicate rows.
df.shape

(16353, 2)

**Preprocessing**

    1.   Lowercasing
    2.   Removal of HTML tags, URLs, and digits
    3.   Removal of punctuation
    4.   Lemmatization



In [None]:
import spacy
# One-time setup: uncomment and run the next line if the model is not installed.
# !python -m spacy download en_core_web_lg
# Load the spaCy pipeline once; preprocess() below reads this module-level object.
nlp = spacy.load("en_core_web_lg")

def preprocess(text):
    """Normalize raw text into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. strip HTML-like tags, URLs (``http(s)://...`` / ``www....``) and
         runs of digits;
      3. re-attach apostrophes that are preceded by whitespace
         (``eye 's`` -> ``eye's``);
      4. run the module-level spaCy pipeline ``nlp`` and keep each token's
         lemma, skipping punctuation and whitespace-only tokens.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a pandas ``NaN``)
        is treated as missing and yields an empty string.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``""`` if none survive.
    """
    # Missing values reach here as None / float NaN; they have no .lower().
    if not isinstance(text, str):
        return ""

    text = text.lower()                                      # lower casing
    text = re.sub(r"<.*?>", "", text)                        # remove html tags
    text = re.sub(r'https?://\S+|www\.\S+|\d+', "", text)    # remove urls and numbers
    text = re.sub(r"\s+'", "'", text)                        # "eye 's" -> "eye's"

    # Deleting digits/URLs leaves runs of spaces, which spaCy emits as
    # whitespace tokens; they are not punctuation, so filter them explicitly
    # or they end up in the output as extra spaces.
    return " ".join(
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_space)
    )

In [None]:
# Scratch check on the answer stored under index label 0.
# The text has already been through preprocess(); here it is passed through
# the apostrophe regex and the spaCy pipeline a second time so the individual
# lemmas can be inspected as a list.
doc = nlp(re.sub(r"\s+'","'",preprocess(df['answer'][0])))
filtered_tokens = [token.lemma_ for token in doc if not token.is_punct]
filtered_tokens

['glaucoma',
 'be',
 'a',
 'group',
 'of',
 'disease',
 'that',
 'can',
 'damage',
 'the',
 'eye',
 "'s",
 'optic',
 'nerve',
 'and',
 'result',
 'in',
 'vision',
 'loss',
 'and',
 'blindness',
 'the',
 'most',
 'common',
 'form',
 'of',
 'the',
 'disease',
 'be',
 'open',
 'angle',
 'glaucoma',
 'with',
 'early',
 'treatment',
 'you',
 'can',
 'often',
 'protect',
 'your',
 'eye',
 'against',
 'serious',
 'vision',
 'loss',
 'watch',
 'the',
 'video',
 'to',
 'learn',
 'more',
 'about',
 'glaucoma',
 'to',
 'enlarge',
 'the',
 'video',
 'click',
 'the',
 'bracket',
 'in',
 'the',
 'low',
 'right',
 'hand',
 'corner',
 'to',
 'reduce',
 'the',
 'video',
 'press',
 'the',
 'escape',
 'esc',
 'button',
 'on',
 'your',
 'keyboard',
 'see',
 'this',
 'graphic',
 'for',
 'a',
 'quick',
 'overview',
 'of',
 'glaucoma',
 'include',
 'how',
 'many',
 'people',
 'it',
 'affect',
 'who',
 's',
 'at',
 'risk',
 'what',
 'to',
 'do',
 'if',
 'you',
 'have',
 'it',
 'and',
 'how',
 'to',
 'learn',


In [None]:
# Add a normalized companion column for each raw text column.
text_columns = {
    "preprocessed_question": "question",
    "preprocessed_answer": "answer",
}
for target_column, source_column in text_columns.items():
    df[target_column] = df[source_column].apply(preprocess)

In [None]:
# Preview the first three rows, including the preprocessed_* columns.
df.head(3)

Unnamed: 0,question,answer,preprocessed_question,preprocessed_answer
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,what be be glaucoma,glaucoma be a group of disease that can damage...
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...,what be be glaucoma,the optic nerve be a bundle of more than milli...
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...,what be be glaucoma,open angle glaucoma be the most common form of...


In [None]:
# Rows whose preprocessed question already appeared in an earlier row.
repeated_question_mask = df["preprocessed_question"].duplicated()
df[repeated_question_mask]

Unnamed: 0,question,answer,preprocessed_question,preprocessed_answer
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...,what be be glaucoma,the optic nerve be a bundle of more than milli...
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...,what be be glaucoma,open angle glaucoma be the most common form of...
8,Who is at risk for Glaucoma? ?,Encourage them to have a comprehensive dilated...,who be at risk for glaucoma,encourage they to have a comprehensive dilated...
9,What is (are) Glaucoma ?,National Eye Institute National Institutes of...,what be be glaucoma,national eye institute national institutes of ...
15,What is (are) High Blood Pressure ?,Blood pressure is the force of blood pushing a...,what be be high blood pressure,blood pressure be the force of blood push agai...
...,...,...,...,...
16399,What is (are) Diabetic Neuropathies: The Nerve...,Diabetic neuropathy can be classified as perip...,what be be diabetic neuropathy the nerve damag...,diabetic neuropathy can be classify as periphe...
16400,What is (are) Diabetic Neuropathies: The Nerve...,"Peripheral neuropathy, also called distal symm...",what be be diabetic neuropathy the nerve damag...,peripheral neuropathy also call distal symmetr...
16401,What is (are) Diabetic Neuropathies: The Nerve...,Autonomic neuropathy affects the nerves that c...,what be be diabetic neuropathy the nerve damag...,autonomic neuropathy affect the nerve that con...
16402,What is (are) Diabetic Neuropathies: The Nerve...,"Proximal neuropathy, sometimes called lumbosac...",what be be diabetic neuropathy the nerve damag...,proximal neuropathy sometimes call lumbosacral...


In [None]:
# Shape after adding the two preprocessed_* columns.
df.shape

(16353, 4)

In [None]:
# Write the frame, raw and preprocessed columns alike, to Drive without the pandas index.
PREPROCESSED_DATASET_PATH = "/content/drive/MyDrive/Supportiv/preprocessed.csv"
df.to_csv(PREPROCESSED_DATASET_PATH, index=False)