In [1]:
import pandas as pd
import numpy as np
import json
import re

In [2]:
# Load the raw export; the preview below shows a 'Title' column and a
# 'JSON Response' column holding JSON text.
df = pd.read_csv('Ashutosh.csv')

In [3]:
# Preview the first rows to see the column layout.
df.head()

Unnamed: 0,Title,JSON Response
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n..."
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In..."
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ..."
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in..."
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R..."


In [4]:
# (rows, columns) of the frame before any cleaning.
df.shape

(223, 2)

In [5]:
# Count missing values per column.
df.isnull().sum()

Title            4
JSON Response    5
dtype: int64

In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [7]:
# List the column labels as read from the CSV.
df.columns

Index(['Title', 'JSON Response'], dtype='object')

In [8]:
# Re-count missing values per column; both should be 0 once incomplete rows are dropped.
df.isnull().sum()

Title            0
JSON Response    0
dtype: int64

In [9]:
# Preview the first rows again.
df.head()

Unnamed: 0,Title,JSON Response
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n..."
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In..."
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ..."
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in..."
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R..."


In [10]:
# Normalise the two column labels by position so that later cells can rely on
# 'Title' and 'json_response'. The rename is unconditional: the previous guard
# compared against 'JSON response' (lower-case "r"), which never matches the
# real 'JSON Response' header, and skipping the rename would leave later cells
# without a 'json_response' column.
df = df.rename(columns={df.columns[0]: 'Title', df.columns[1]: 'json_response'})
# Drop any column that contains no values at all.
df = df.dropna(axis=1, how='all')
print(df.head())

                        Title  \
0                      Django   
1  Coding: Intermediate level   
2                      Docker   
3                     ASP.NET   
4          Creating REST APIs   

                                       json_response  
0  {\n    "id": 1250569,\n    "name": "Django",\n...  
1  {\n    "id": 1471468,\n    "name": "Coding: In...  
2  {\n    "id": 170977,\n    "name": "Docker",\n ...  
3  {\n    "id": 895134,\n    "name": "ASP.NET (in...  
4  {\n    "id": 1950355,\n    "name": "Creating R...  


In [11]:
def is_valid_json(json_str):
    """Report whether ``json_str`` can be decoded by ``json.loads``.

    Args:
        json_str: Candidate JSON document.

    Returns:
        True when decoding succeeds; False when ``json.loads`` raises
        ``json.JSONDecodeError`` (malformed text) or ``TypeError``
        (an input type it does not accept, e.g. ``None``).
    """
    try:
        json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        return False
    else:
        return True

# Keep only the rows whose payload decodes as JSON, then renumber the index
# from 0 so it stays contiguous.
df = (
    df.loc[df['json_response'].apply(is_valid_json)]
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          217 non-null    object
 1   json_response  217 non-null    object
dtypes: object(2)
memory usage: 3.5+ KB


In [12]:
# Preview the frame after the JSON-validity filter.
df.head()

Unnamed: 0,Title,json_response
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n..."
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In..."
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ..."
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in..."
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R..."


In [13]:
def extract_type_name(json_str):
    """Return the ``"name"`` of the first entry in the payload's ``"type"`` list.

    Args:
        json_str: JSON text expected to decode to an object with a ``"type"``
            key holding a list of objects.

    Returns:
        The first entry's ``"name"`` value, or ``None`` when the text cannot
        be decoded, ``"type"`` is missing, empty or not a list, or the first
        entry has no ``"name"`` / is not a mapping.
    """
    try:
        payload = json.loads(json_str)
        types = payload.get("type", [])
        # Guard clause: nothing to read unless "type" is a non-empty list.
        if not isinstance(types, list) or not types:
            return None
        return types[0].get("name")
    except (json.JSONDecodeError, TypeError, AttributeError):
        return None
# Derive a "type" column from each row's JSON payload.
df = df.assign(type=df["json_response"].apply(extract_type_name))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          217 non-null    object
 1   json_response  217 non-null    object
 2   type           217 non-null    object
dtypes: object(3)
memory usage: 5.2+ KB


In [14]:
# Preview the frame with the new "type" column.
df.head()

Unnamed: 0,Title,json_response,type
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n...",Programming skills
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In...",Programming skills
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ...",Software skills
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in...",Programming skills
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R...",Programming skills


In [15]:
def extract_questions(json_str):
    """Return the payload's ``"preview_questions"`` list.

    Args:
        json_str: JSON text expected to decode to an object.

    Returns:
        The value stored under ``"preview_questions"`` when it is a list;
        otherwise an empty list (also returned when the text cannot be
        decoded or does not decode to a mapping).
    """
    try:
        payload = json.loads(json_str)
        candidates = payload.get("preview_questions", [])
    except (json.JSONDecodeError, TypeError, AttributeError):
        return []

    # Anything other than a list is treated as "no questions".
    if isinstance(candidates, list):
        return candidates
    return []

# Derive a "questions" column holding each row's list of preview questions.
df = df.assign(questions=df["json_response"].apply(extract_questions))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          217 non-null    object
 1   json_response  217 non-null    object
 2   type           217 non-null    object
 3   questions      217 non-null    object
dtypes: object(4)
memory usage: 6.9+ KB


In [16]:
# Preview the extracted question lists (each cell is a list of dicts).
df["questions"].head()

0    [{'id': 1095950, 'text': '<p>You are working o...
1    [{'id': 1278402, 'text': '<p><span style="back...
2    [{'id': 155197, 'text': '<p>Given the Dockerfi...
3    [{'id': 615710, 'text': '<p>Which code snippet...
4    [{'id': 1561619, 'text': '<p><span style="back...
Name: questions, dtype: object

In [17]:
# Print the first row's full question list to inspect the fields of a question record.
print(df['questions'].iloc[0])

[{'id': 1095950, 'text': '<p>You are working on a Django project for a company that specializes in real estate. You need to choose a field type for the company’s model for storing property information that allows users to enter a price in US dollars. However, the company also operates in other countries and wants to support other currencies.</p><p>&nbsp;</p><p><strong style="color: rgb(0, 0, 0);">Which of the following is the best approach for choosing this field?</strong></p>', 'intro_text': None, 'type': 'multiple-choice', 'shuffle': True, 'answers': [{'id': 9252450, 'text': '<p>Create a field that stores the price as a <code><strong>DecimalField</strong></code> and the currency code as a <code><strong>CharField</strong></code>.</p>', 'score': 5, 'rich_text': False}, {'id': 9252451, 'text': '<p>Create a <code><strong>DecimalField</strong></code> and allow users to enter the currency code as a separate field.</p>', 'score': 0, 'rich_text': False}, {'id': 9252452, 'text': "<p>Use Djang

In [18]:
def clean_html(text):
    """Strip HTML tags from ``text`` and tidy the remaining string.

    Args:
        text: HTML fragment as a string.

    Returns:
        The text with every ``<...>`` tag removed, each ``&nbsp;`` entity
        replaced by a plain space, and leading/trailing whitespace trimmed.
        Other HTML entities (e.g. ``&amp;``) are left untouched.
    """
    # Use the `re` module imported at the top of the notebook; the previous
    # `a.sub(...)` referenced a name that is never defined here.
    clean_text = re.sub(r'<.*?>', '', text)
    clean_text = clean_text.replace('&nbsp;', ' ')
    return clean_text.strip()


# Maps question id -> {"ques_type": numeric code, "correct_answers": [cleaned text, ...]}
# for every supported question found in df['questions'].
distinct_question_types = {}

for question_list in df['questions']:
    for question in question_list:
        source_type = question['type']

        if source_type == 'multiple-choice':
            # Code 2: multiple choice; answers scoring 5 or more count as correct.
            question_type = 2
            correct_answers = [
                clean_html(answer['text'])
                for answer in question['answers']
                if answer['score'] >= 5
            ]

        elif source_type == 'true-false':
            # Code 11: true/false; same scoring rule as multiple choice.
            question_type = 11
            correct_answers = [
                clean_html(answer['text'])
                for answer in question['answers']
                if answer['score'] >= 5
            ]

        elif source_type == 'multiple-select':
            # Code 15: multiple select. This branch previously re-tested
            # 'multiple-choice' and therefore could never run.
            # NOTE(review): the exact-match `== 5` is kept from the original
            # branch — confirm it should not be `>= 5` like the other types.
            question_type = 15
            correct_answers = [
                clean_html(answer['text'])
                for answer in question['answers']
                if answer['score'] == 5
            ]

        else:
            # Any other question type is out of scope and skipped.
            continue

        distinct_question_types[question['id']] = {
            "ques_type": question_type,
            "correct_answers": correct_answers
        }

print(json.dumps(distinct_question_types, indent=2))


{
  "1095950": {
    "ques_type": 2,
    "correct_answers": [
      "Create a field that stores the price as a DecimalField and the currency code as a CharField."
    ]
  },
  "1095951": {
    "ques_type": 2,
    "correct_answers": [
      "self.assertTemplateUsed(response, 'core/posts.html')"
    ]
  },
  "1095952": {
    "ques_type": 2,
    "correct_answers": [
      "Define the form as a subclass of forms.ModelForm with name, email, subject, and message in the fields array."
    ]
  },
  "1095953": {
    "ques_type": 2,
    "correct_answers": [
      "Apply the ```title``` HTML template filter, e.g., {{ value | title }}"
    ]
  },
  "155197": {
    "ques_type": 2,
    "correct_answers": [
      "Combine the RUN commands separated by &amp;&amp;."
    ]
  },
  "155200": {
    "ques_type": 2,
    "correct_answers": [
      "echo $PASSWORD | docker login registry.company.com --username $USERNAME --password-stdin"
    ]
  },
  "155206": {
    "ques_type": 2,
    "correct_answers": [
   

In [19]:
# df["questions"] = df["questions"].apply(lambda x: x if isinstance(x, list) else [])
# question_types = (
#     df["questions"]
#     .explode()
#     .dropna()
#     .apply(lambda q: q.get("type") if isinstance(q, dict) else None)
#     .dropna()
#     .unique()
# )
# print("Distinct question types:", question_types)

In [20]:
# def filter_questions(questions):
#     """Filter questions and assign appropriate ques_type."""
#     valid_types = {
#         "multiple-choice": 2,  # MCQ (Single correct)
#         "true-false": 11,      # True/False
#         "multiple-select": 15  # MCQ (Multiple correct)
#     }

#     cleaned_questions = []
    
#     for q in questions:
#         if not isinstance(q, dict):
#             continue  # Skip invalid entries
        
#         q_type = q.get("type", None)
#         ques_type = valid_types.get(q_type, None)  # Get ques_type or None
        
#         if ques_type is not None:
#             cleaned_question = {
#                 "id": q.get("id", None),
#                 "text": clean_html(q.get("text", "")),  # Clean HTML
#                 "ques_type": ques_type,
#                 "answers": q.get("answers", [])
#             }
#             cleaned_questions.append(cleaned_question)
    
#     return cleaned_questions
# df.head()