## Cleaning Jee QD (Question Dataset)

In [None]:
!pip install html2text
import pandas as pd
import matplotlib.pyplot as plt
from time import time
import re
import html2text

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### Loading data

In [None]:
# Load the raw question export into a DataFrame. The path is relative, so it is
# resolved against the current working directory (presumably /content on
# Colab — TODO confirm).
df = pd.read_csv("drive/MyDrive/Literature Review/question_master.csv")

In [None]:
# df = df[:10000]
# Uncomment the line above to work on a 10,000-row sample; keep it commented out when working with the full data

##### Removing rows where the 'question_data' column is NaN

In [1]:
# Display the loaded DataFrame (df must already have been created by the loading cell).
df

NameError: ignored

In [None]:
# Report the row count, drop every row whose 'question_data' is missing, then
# report the row count again.
print("Len before:", len(df))
df = df.dropna(subset=['question_data'])
print("Len after:", len(df))

Len before: 10000
Len after: 10000


##### Extracting the question text and the syllabus_id (the corresponding marks may be useful, but most probably not, and are not extracted here)

In [None]:
def str_to_dict(s):
  """Parse a bracket-wrapped JSON string from the dataset into Python objects.

  The first and last characters of `s` (the enclosing `[` and `]` in the rows
  shown in this notebook) are stripped, and what remains is parsed as a
  comma-separated sequence of JSON values.

  Args:
    s: String such as '[{"question_text": "..."}]' or '[12]'.

  Returns:
    The single parsed value when there is exactly one (e.g. a dict or an int),
    otherwise a tuple of the parsed values, in order.

  Raises:
    TypeError: If `s` is not a string (for example a NaN float from pandas).
    ValueError: If the stripped content is empty or is not valid JSON.
  """
  # Local import keeps this cell runnable on its own.
  import json

  inner = s[1:len(s) - 1]
  # json maps true/false/null to True/False/None. The previous eval()-based
  # version bound both `true` and `false` to 1, raised NameError on `null`,
  # and would execute any expression embedded in the CSV.
  # strict=False tolerates raw control characters (e.g. tabs) inside strings.
  values = json.loads("[" + inner + "]", strict=False)
  if not values:
    raise ValueError("no JSON value found in %r" % (s,))
  # One value is returned bare; several are returned as a tuple, which is the
  # shape the comma-separated eval() result used to have.
  return values[0] if len(values) == 1 else tuple(values)

Following cell takes about 2m 20s to run

In [None]:
# Build two parallel lists from df: the question text of every row (`texts`)
# and its parsed syllabus id (`syllabus_ids`, -1 when it cannot be parsed).
t = time()
texts = []
syllabus_ids = []
unknown_syllabus = 0
# Walk the two needed columns in lockstep instead of materialising a full row
# with df.iloc[index] on every iteration. A missing column now raises KeyError
# here instead of being silently counted as "unknown syllabus" for every row.
rows = zip(df['question_data'], df['repository_syllabus_id'])
for index, (question_data, raw_syllabus_id) in enumerate(rows):
  if index % 40000 == 0 and index > 0:
    print("Index:",index)
  parsed = str_to_dict(question_data)
  # A single entry parses to a dict. Anything else is treated as a sequence of
  # entries and its first element is used, which is what the previous
  # try/except fallback did.
  if isinstance(parsed, dict):
    text = parsed['question_text']
  else:
    text = parsed[0]['question_text']

  try:
    syllabus_id = str_to_dict(raw_syllabus_id)
  except Exception:
    # Missing or unparsable id (e.g. NaN) -> sentinel -1. Catching Exception
    # instead of using a bare except lets KeyboardInterrupt stop this
    # long-running loop rather than being counted as an unknown syllabus.
    syllabus_id = -1
    unknown_syllabus+=1
  texts.append(text)
  syllabus_ids.append(syllabus_id)

elapsed = time() - t
print("Time taken: " , int(elapsed%60) ,"s, ", int(elapsed/60), "m")
print("Syllabus ID unknown for ", unknown_syllabus, " questions")

Index: 40000
Index: 80000
Index: 120000
Index: 160000
Index: 200000
Index: 240000
Time taken:  38 s,  1 m
Syllabus ID unknown for  273  questions


##### Text is cleaned of HTML tags and non-useful symbols, but each question is kept as a sentence (not tokenized)

In [None]:
def formatToString(q):
  """Convert an HTML question into a plain string of space-separated words.

  Args:
    q: HTML markup of one question.

  Returns:
    Every run of word characters/apostrophes found in the html2text rendering
    of `q`, in order, each followed by a single space. A non-empty result
    therefore ends with a trailing space; no words gives "".
  """
  plain = html2text.html2text(q)
  # The pattern only yields tokens made of word characters and apostrophes, so
  # punctuation and whitespace never come through as tokens. The former
  # membership checks against those characters could never match and are gone.
  words = re.findall(r"[\w']+", plain)
  # join replaces the repeated string concatenation; the space after every
  # word (including the last) is retained so the output is unchanged.
  return "".join(word + " " for word in words)

Following cell takes about 3m to run

In [None]:
# Clean every extracted question into a single string and time the whole pass.
t = time()
textsAsString = list(map(formatToString, texts))
elapsed = time() - t
print("Time taken: " , int(elapsed%60) ,"s, ", int(elapsed/60), "m")
print("Texts:\n",textsAsString[0],"\n",textsAsString[1],"\n And so on...")

In [None]:
# Pair each cleaned question string with its syllabus id, one row per question.
data = pd.DataFrame(list(zip(textsAsString, syllabus_ids)), columns = ["question", "syllabus_id"])

In [None]:
# Save the string-cleaned dataset to Drive. The folder is spelled "MyDrive",
# matching the path used to load question_master.csv, instead of "My Drive".
data.to_csv("drive/MyDrive/Literature Review/Dataset Versions/EM-Data-with-answers.csv")

In [None]:
# Preview the first rows of the string-cleaned DataFrame.
data.head()

##### Texts are tokenized

In [None]:
def formatToStringAndTokenize(q):
  """Convert an HTML question into a list of word tokens.

  Args:
    q: HTML markup of one question.

  Returns:
    List of every run of word characters/apostrophes found in the html2text
    rendering of `q`, in order of appearance (empty list when there are none).
  """
  plain = html2text.html2text(q)
  # Tokens consist solely of word characters and apostrophes, so the old
  # filter against whitespace "tags" and an empty punctuation list was a no-op.
  # It has been removed together with the commented-out punctuation list.
  return re.findall(r"[\w']+", plain)

It will take around 3 mins to run the following cell

In [None]:
# Tokenize every question and time the pass.
# NOTE: this rebinds `texts` from the extracted question strings to token
# lists, so the extraction cell has to be rerun before this cell (or the
# string-cleaning cell) can be run again.
t = time()
texts = list(map(formatToStringAndTokenize, texts))
elapsed = time() - t
print("Time taken: " , int(elapsed%60) ,"s, ", int(elapsed/60), "m")
print("Texts:\n",texts[0],"\n",texts[1],"\n And so on...")

##### Creating a DataFrame from the tokenized texts and syllabus IDs

In [None]:
# Pair each entry of `texts` with its syllabus id, one row per question.
# This rebinds `data`, replacing the DataFrame built from the cleaned strings.
data = pd.DataFrame(list(zip(texts, syllabus_ids)), columns = ["question", "syllabus_id"])

In [None]:
# Preview the first three rows of `data`.
data.head(3)

In [None]:
# Write `data` to a CSV using a relative path with no Drive prefix, i.e. into
# the current working directory rather than the mounted Drive folder.
data.to_csv("clean_data(tokenized_manual).csv")

### Important notes to keep in mind:
1. All plus, minus, multiply, and divide signs are removed, since we will not be working on equations in the beginning (as discussed).
2. For questions that do not have a repository_syllabus_id defined, -1 is assigned as their syllabus ID

In [None]:

# Inspect entry 2332 of `texts`.
texts[2332]

NameError: ignored

In [None]:
# Render entry 2332 of `texts` through html2text.
html2text.html2text(texts[2332])

NameError: ignored

In [None]:
# Show the raw 'question_data' cell of the row at position 2332 of df.
df.iloc[2332]['question_data']

'[{"language_id":1,"language_name":"English","question_text":"<p>Which of the following reactions can be used to prepare acetophenone?</p>\\r\\n","ques_instruction":"","question_teacher_description":"<p>All will be used.</p>\\r\\n","instruction_video":[""],"question_video":[""],"explaination_hint_video":[""],"answer_ideal_time":60,"answer_explanation":"","imported_from":{"bool":false,"doc_id":""},"created_by":"22652","created_date":"2022-03-26T07:16:58.000Z","updated_by":"22652","updated_date":"2022-03-26T01:44:16.000Z","question_option":[{"answer_id":5785989,"answer_status":1,"option_image":"","answer_text":"<p><math xmlns=\\"http://www.w3.org/1998/Math/MathML\\"><msub><mi mathvariant=\\"normal\\">C</mi><mn>6</mn></msub><msub><mi mathvariant=\\"normal\\">H</mi><mn>6</mn></msub><mo>+</mo><msub><mi>CH</mi><mn>3</mn></msub><mi>COCl</mi><munderover><mo>&#8594;</mo><mrow><mn>2</mn><mo>.</mo><mtext>&#8196;</mtext><msub><mi mathvariant=\\"normal\\">H</mi><mn>2</mn></msub><mi mathvariant=\\"n

In [None]:
[{"language_id":1,"language_name":"English","question_text":"<p>Which of the following reactions can be used to prepare acetophenone?</p>\r\n","ques_instruction":"","question_teacher_description":"<p>All will be used.</p>\r\n","instruction_video":[""],"question_video":[""],"explaination_hint_video":[""],"answer_ideal_time":60,"answer_explanation":"","imported_from":{"bool":false,"doc_id":""},"created_by":"22652","created_date":"2022-03-26T07:16:58.000Z","updated_by":"22652","updated_date":"2022-03-26T01:44:16.000Z","question_option":[{"answer_id":5785989,"answer_status":1,"option_image":"","answer_text":"<p><math xmlns=\"http://www.w3.org/1998/Math/MathML\"><msub><mi mathvariant=\"normal\">C</mi><mn>6</mn></msub><msub><mi mathvariant=\"normal\">H</mi><mn>6</mn></msub><mo>+</mo><msub><mi>CH</mi><mn>3</mn></msub><mi>COCl</mi><munderover><mo>&#8594;</mo><mrow><mn>2</mn><mo>.</mo><mtext>&#8196;</mtext><msub><mi mathvariant=\"normal\">H</mi><mn>2</mn></msub><mi mathvariant=\"normal\">O</mi></mrow><mrow><mn>1</mn><mo>.</mo><mtext>&#8196;</mtext><msub><mi>AlCl</mi><mn>3</mn></msub></mrow></munderover></math></p>\r\n","answer_order":1,"explanation_text":"","video_url":"","video_time_lapse":"","is_right":0,"marks":0},{"answer_id":5785990,"answer_status":1,"option_image":"","answer_text":"<p><math xmlns=\"http://www.w3.org/1998/Math/MathML\"><msub><mrow><mo>(</mo><msub><mi mathvariant=\"normal\">C</mi><mn>6</mn></msub><msub><mi mathvariant=\"normal\">H</mi><mn>5</mn></msub><mi>COO</mi><mo>)</mo></mrow><mn>2</mn></msub><mi>Ca</mi><mo>+</mo><msub><mrow><mo>(</mo><msub><mi>CH</mi><mn>3</mn></msub><mi>COO</mi><mo>)</mo></mrow><mn>2</mn></msub><mi>Ca</mi><mover><mo>&#8594;</mo><mi>heat</mi></mover></math></p>\r\n","answer_order":2,"explanation_text":"","video_url":"","video_time_lapse":"","is_right":0,"marks":0},{"answer_id":5785991,"answer_status":1,"option_image":"","answer_text":"<p><math xmlns=\"http://www.w3.org/1998/Math/MathML\"><msub><mi 
mathvariant=\"normal\">C</mi><mn>6</mn></msub><msub><mi mathvariant=\"normal\">H</mi><mn>6</mn></msub><mi>CN</mi><munderover><mo>&#8594;</mo><mrow><mn>2</mn><mo>.</mo><mtext>&#8196;</mtext><msub><mi mathvariant=\"normal\">H</mi><mn>3</mn></msub><msup><mi mathvariant=\"normal\">O</mi><mo>+</mo></msup></mrow><mrow><mn>1</mn><mo>.</mo><mtext>&#8196;</mtext><msub><mi>CH</mi><mn>3</mn></msub><mi>MgI</mi></mrow></munderover></math></p>\r\n","answer_order":3,"explanation_text":"","video_url":"","video_time_lapse":"","is_right":0,"marks":0},{"answer_id":5785992,"answer_status":1,"option_image":"","answer_text":"<p>All of these</p>\r\n","answer_order":4,"explanation_text":"","video_url":"","video_time_lapse":"","is_right":1,"marks":4}]}]

In [None]:
# Number of rows currently in df.
len(df)

276447