### Import data

In [3]:
import json
import pandas as pd

# Parse the JSON-Lines training file record by record, keeping only the
# fields needed downstream. A field absent from a record becomes ''.
fields = ('question', 'exp', 'cop', 'opa', 'opb', 'opc', 'opd')
data = []
with open('/Users/bytedance/Desktop/medmcqa/train.json', 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # Blank (whitespace-only) lines carry no record
            continue
        try:
            json_obj = json.loads(stripped)
        except json.JSONDecodeError as e:
            # Report the malformed line and move on to the next one
            print(f"Error parsing line: {line[:50]}... - {e}")
        else:
            extracted_data = {field: json_obj.get(field, '') for field in fields}
            data.append(extracted_data)

# One row per successfully parsed record
df = pd.DataFrame(data)

# Quick summary of what was loaded
print(f"DataFrame created with {len(df)} rows and {len(df.columns)} columns")
print(f"Columns: {list(df.columns)}")
print("\nFirst 3 rows:")
print(df.head(3))

DataFrame created with 182822 rows and 7 columns
Columns: ['question', 'exp', 'cop', 'opa', 'opb', 'opc', 'opd']

First 3 rows:
                                            question  \
0  Chronic urethral obstruction due to benign pri...   
1  Which vitamin is supplied from only animal sou...   
2  All of the following are surgical options for ...   

                                                 exp  cop  \
0  Chronic urethral obstruction because of urinar...    3   
1  Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...    3   
2  Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...    4   

                          opa                        opb              opc  \
0                 Hyperplasia                  Hyperophy          Atrophy   
1                   Vitamin C                 Vitamin B7      Vitamin B12   
2  Adjustable gastric banding  Biliopancreatic diversion  Duodenal Switch   

                          opd  
0                    Dyplasia  
1                   Vitamin D

### Derive the correct answer text from the `cop` column

In [9]:
# `cop` holds the 1-based position of the correct option; translate it to the
# name of the column that stores that option's text.
cop_mapping = {
    1: 'opa',
    2: 'opb',
    3: 'opc',
    4: 'opd'
}

# Build correct_answer by copying, for each known `cop` value, the text from
# the matching option column. Rows whose `cop` is missing or not a key of
# cop_mapping stay empty (None) instead of silently defaulting to option A,
# which would record a wrong label as if it were correct.
correct_answer = pd.Series([None] * len(df), index=df.index, dtype='object')
for cop_value, option_column in cop_mapping.items():
    correct_answer = correct_answer.mask(df['cop'].eq(cop_value), df[option_column])
df['correct_answer'] = correct_answer

# Surface unmappable rows so they can be inspected rather than going unnoticed
unmapped_count = int((~df['cop'].isin(list(cop_mapping))).sum())
if unmapped_count:
    print(f"Warning: {unmapped_count} rows have a 'cop' value outside "
          f"{sorted(cop_mapping)}; correct_answer left empty for them")

# Display results
print("Sample of the new correct_answer column:")
df.head()

Sample of the new correct_answer column:


Unnamed: 0,question,exp,cop,opa,opb,opc,opd,correct_answer
0,Chronic urethral obstruction due to benign pri...,Chronic urethral obstruction because of urinar...,3,Hyperplasia,Hyperophy,Atrophy,Dyplasia,Atrophy
1,Which vitamin is supplied from only animal sou...,Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...,3,Vitamin C,Vitamin B7,Vitamin B12,Vitamin D,Vitamin B12
2,All of the following are surgical options for ...,"Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...",4,Adjustable gastric banding,Biliopancreatic diversion,Duodenal Switch,Roux en Y Duodenal By pass,Roux en Y Duodenal By pass
3,Following endaerectomy on the right common car...,The central aery of the retina is a branch of ...,1,Central aery of the retina,Infraorbital aery,Lacrimal aery,Nasociliary aretry,Central aery of the retina
4,Growth hormone has its effect on growth through?,"Ans. is 'b' i.e., IGI-1GH has two major functi...",2,Directly,IG1-1,Thyroxine,Intranuclear receptors,IG1-1


In [10]:
import os

# Output location: target directory plus the file stem (extension added below)
base_filename = 'extracted_medmcqa'
base_path = '/Users/bytedance/Desktop/medmcqa/'
csv_path = os.path.join(base_path, base_filename + '.csv')

# Write the frame without the positional index column, then report the path
df.to_csv(path_or_buf=csv_path, index=False)
print(f"✅ Saved as CSV: {csv_path}")

✅ Saved as CSV: /Users/bytedance/Desktop/medmcqa/extracted_medmcqa.csv
