In [1]:
import pandas as pd

In [2]:
import json

In [3]:
import re

In [4]:
import os

In [5]:
import xlsxwriter

In [6]:
# Load the raw export; the cells below read its 'Title' and 'JSON Response' columns.
df = pd.read_csv('Testkits2.csv')

In [7]:
# Preview the loaded frame (the display is truncated to the first and last rows).
df

Unnamed: 0,Title,JSON Response
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n..."
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In..."
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ..."
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in..."
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R..."
...,...,...
218,NetSuite Administration,"{\n ""id"": 1258902,\n ""name"": ""NetSuite A..."
219,Advanced Networking in Amazon Web Services (AWS),"{\n ""id"": 1431343,\n ""name"": ""Advanced N..."
220,Linux,"{\n ""id"": 925544,\n ""name"": ""Linux"",\n ..."
221,Wireshark,"{\n ""id"": 996346,\n ""name"": ""Wireshark"",..."


In [8]:
# Count missing values per column before any cleaning.
df.isnull().sum()

Title            4
JSON Response    5
dtype: int64

In [9]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

In [10]:
# Re-count missing values per column after the rows containing NaN were dropped.
df.isnull().sum()

Title            0
JSON Response    0
dtype: int64

In [11]:
# function to clean tags
def cleanText(text):
    """Strip HTML tags and non-breaking-space entities from a piece of text.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            ``None`` is treated as empty text.

    Returns:
        The text with every ``<...>`` span removed, each ``&nbsp;`` entity
        (with or without its trailing semicolon) replaced by a single space,
        and leading/trailing whitespace trimmed.
    """
    if text is None:
        # Avoid turning a missing value into the literal string 'None'.
        return ''

    without_tags = re.sub(r"<.*?>", '', str(text))
    # Only the semicolon that terminates the entity is consumed; semicolons
    # that belong to the content (e.g. in code snippets) are preserved.
    return re.sub(r"&nbsp;?", ' ', without_tags).strip()

In [12]:
# function to extract data from json

def extract_data(json_str):
    """Turn one test's JSON payload into a list of question records.

    Args:
        json_str: JSON text of a single test. Its ``preview_questions`` entry,
            when present, is read as a list of question objects.

    Returns:
        A list with one dict per preview question, each holding:
          * ``title``     - question text passed through ``cleanText``
          * ``ques_type`` - 2 (multiple-choice), 15 (multiple-response),
                            11 (truefalse) or ``None`` for any other type
          * ``options``   - answer texts passed through ``cleanText``
                            (``['true', 'false']`` for truefalse questions,
                            ``[]`` for unsupported types)
          * ``score``     - value returned by ``scorecheck`` for supported
                            types, otherwise ``None``
        An empty list is returned when the payload cannot be parsed or is not
        a JSON object.
    """
    # NOTE(review): these numeric codes presumably match the question-type ids
    # of whatever consumes the export - confirm.
    type_codes = {
        'multiple-choice': 2,
        'multiple-response': 15,
        'truefalse': 11,
    }

    try:
        parsed = json.loads(json_str)

        # Valid JSON that is not an object (a list, a bare string, ...) has no
        # .get(); report it like any other unusable payload instead of crashing.
        if not isinstance(parsed, dict):
            print(f"Error: expected a JSON object, got {type(parsed).__name__}")
            return []

        question_set = []

        for question in parsed.get('preview_questions') or []:
            title = cleanText(question.get('text',''))
            kind = question.get('type')
            ques_type = type_codes.get(kind) if isinstance(kind, str) else None
            options = []
            score = None

            if ques_type is not None:
                if kind == 'truefalse':
                    options = ['true', 'false']
                else:
                    # Clean the options the same way scorecheck cleans the
                    # correct answer, so that `score` matches an entry here.
                    raw_texts = [answer.get('text') for answer in question.get('answers') or []]
                    options = [text if text is None else cleanText(text) for text in raw_texts]
                score = scorecheck(question)

            question_set.append(
                {
                    'title':title,
                    'ques_type':ques_type,
                    'options':options,
                    'score':score
                }
            )

        return question_set

    except (json.JSONDecodeError, TypeError) as e:
        print(f"Error: {e}")
        return []

In [13]:
def scorecheck(parsed):
    """Return the correct answer(s) of a question as cleaned text.

    Args:
        parsed: Question dict with a ``type`` and a list of ``answers``; an
            answer counts as correct when its ``score`` is a number above zero.

    Returns:
        For ``multiple-response`` questions, a list with the cleaned text of
        every correct answer. For any other type, the cleaned text of the first
        correct answer. ``None`` when no correct answer is found.
    """
    is_multi = parsed.get('type') == 'multiple-response'
    correct = []

    for answer in parsed.get('answers') or []:
        if not isinstance(answer, dict):
            continue

        points = answer.get('score', 0)
        # A null or non-numeric score marks this answer as "not correct"; it
        # must not discard the correct answers found in the other entries.
        if not isinstance(points, (int, float)) or points <= 0:
            continue

        correct.append(cleanText(answer.get('text')))
        if not is_multi:
            # Single-answer questions only need the first correct entry.
            break

    if is_multi:
        return correct or None
    return (correct[0] or None) if correct else None


In [14]:
# to check if json is valid or not

def isJsonValid(json_str):
    """Tell whether ``json_str`` can be decoded by ``json.loads``.

    Returns ``False`` when decoding raises ``json.JSONDecodeError`` or
    ``TypeError`` (for example when the value is not a string), ``True``
    otherwise.
    """
    try:
        json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        return False
    else:
        return True

In [15]:
# Keep only the rows whose 'JSON Response' parses as JSON. The explicit copy
# makes the result an independent frame, so the column assignments in later
# cells do not operate on a slice of the previous frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['JSON Response'].apply(isJsonValid)].copy()

In [16]:
# Row/column count after keeping only the rows with parseable JSON.
df.shape

(217, 2)

In [17]:
# Count missing values per column in the filtered frame.
df.isnull().sum()

Title            0
JSON Response    0
dtype: int64

In [18]:
# Build the list of question records for every test from its JSON payload.
df['Questions'] = df['JSON Response'].apply(extract_data)

In [19]:
# Preview the frame with the new 'Questions' column.
df.head()

Unnamed: 0,Title,JSON Response,Questions
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n...",[{'title': 'You are working on a Django projec...
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In...",[{'title': 'The MEX number of a non-negative s...
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ...","[{'title': 'Given the Dockerfile below, what i..."
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in...",[{'title': 'Which code snippet can you use to ...
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R...",[{'title': 'You're building a high-throughput ...


In [20]:
# Spot check: pair each question's recorded answer with its title for the test at index label 6.
# An f-string is used so that a missing score (None) or a multiple-response
# score (a list) is displayed instead of raising TypeError on concatenation.
[[f"score=============> {i['score']}\n\n", i['title']] for i in df['Questions'].loc[6]]

  'Alice wants to securely send a confidential document to Bob via email. What steps should Alice take to ensure that the document is exchanged securely, and how should Bob decrypt and verify the document?'],
  'Your insurance company keeps customer data safe by storing it in encrypted form using AES-256 bit encryption. Now you need to send your employees the secret AES key to allow them to decrypt customer data. How should you send your employees the secret AES key?'],
  'Each employee in your company has a unique public/private key pair using RSA to securely exchange documents. Now you want to incorporate digital signatures into the documents your employees send. True or false: You will have to generate a new set of public/private key pairs for each of your employees.'],
  'You receive a document along with its hash pasted on the back of the document over a trusted, secure channel. You calculate the hash of the document you received, and it matches the hash on the back of the documen

In [21]:
# Count missing values per column, now including 'Questions'.
df.isnull().sum()

Title            0
JSON Response    0
Questions        0
dtype: int64

In [22]:
# Row/column count after adding the 'Questions' column.
df.shape

(217, 3)

In [23]:
# function to extract keys from json
def extractFromJson(json_str,key,subkey):
    """Read one top-level entry from a JSON object given as text.

    Args:
        json_str: JSON text expected to decode to an object.
        key: Top-level key to look up.
        subkey: Key to read from the first element when the value stored under
            ``key`` is a non-empty list whose first element is a dict. When
            falsy, that first element itself is returned.

    Returns:
        * the ``subkey`` entry of the first list element (or the element
          itself when ``subkey`` is falsy) if the value is a non-empty list
          whose first element is a dict;
        * otherwise the value stored under ``key`` unchanged;
        * ``None`` when ``key`` is absent, the text cannot be decoded, or the
          decoded JSON is not an object.
    """
    try:
        parsed = json.loads(json_str)

        # Lists, strings and numbers are valid JSON but have no .get();
        # treat them like any other unusable payload instead of crashing.
        if not isinstance(parsed, dict):
            print(f"Error: expected a JSON object, got {type(parsed).__name__}")
            return None

        value = parsed.get(key,None)

        if isinstance(value,list) and len(value)>0 and isinstance(value[0],dict):
            return value[0].get(subkey,None) if subkey else value[0]
        return value
    except (json.JSONDecodeError,TypeError) as e:
        print(f"Error: {e}")
        return None

In [24]:
# Read the 'name' of the first entry under each payload's 'type' key.
df['Type'] = df['JSON Response'].apply(extractFromJson, args=('type', 'name'))

In [25]:
# Preview the frame with the new 'Type' column.
df.head()

Unnamed: 0,Title,JSON Response,Questions,Type
0,Django,"{\n ""id"": 1250569,\n ""name"": ""Django"",\n...",[{'title': 'You are working on a Django projec...,Programming skills
1,Coding: Intermediate level,"{\n ""id"": 1471468,\n ""name"": ""Coding: In...",[{'title': 'The MEX number of a non-negative s...,Programming skills
2,Docker,"{\n ""id"": 170977,\n ""name"": ""Docker"",\n ...","[{'title': 'Given the Dockerfile below, what i...",Software skills
3,ASP.NET,"{\n ""id"": 895134,\n ""name"": ""ASP.NET (in...",[{'title': 'Which code snippet can you use to ...,Programming skills
4,Creating REST APIs,"{\n ""id"": 1950355,\n ""name"": ""Creating R...",[{'title': 'You're building a high-throughput ...,Programming skills


In [26]:
# List the distinct 'Type' values; the export loop uses them as sub-folder names.
df['Type'].unique()

array(['Programming skills', 'Software skills', 'Role-specific skills',
       'Cognitive ability', 'Situational judgment'], dtype=object)

In [27]:
# Confirm that a 'Questions' cell holds a Python list rather than JSON text.
type(df['Questions'].loc[0])

list

In [35]:
# Export every test's questions to <main_folder>/<Type>/<Title>.xlsx,
# one workbook per test with a single "Questions" sheet.
main_folder = 'Question_Set2'
os.makedirs(main_folder, exist_ok=True)

# Characters removed from folder and file names because they are not allowed
# in paths on common platforms.
illegal_chars = r'[\\/*?:"<>|]'
# Paths written during this run (lower-cased for case-insensitive file
# systems), used to avoid overwriting one test's file with another's.
written_paths = set()

for _,row in df.iterrows():
    # The type is sanitised like the title so that a value containing a path
    # separator cannot create nested or invalid directories.
    type_folder = os.path.join(main_folder, re.sub(illegal_chars, "", str(row['Type'])))
    os.makedirs(type_folder, exist_ok=True)

    base_name = re.sub(illegal_chars, "", str(row['Title']))
    file_name = base_name + ".xlsx"
    file_path = os.path.join(type_folder,file_name)

    # Duplicate titles, or titles that differ only in removed characters,
    # map to the same file name; add a numeric suffix instead of overwriting.
    duplicate_no = 2
    while file_path.lower() in written_paths:
        file_name = f"{base_name} ({duplicate_no}).xlsx"
        file_path = os.path.join(type_folder,file_name)
        duplicate_no += 1
    written_paths.add(file_path.lower())

    # The whole question list is written as one text cell: "questions = [...]".
    questions_data = "questions = "+ cleanText(json.dumps(row['Questions'],indent=4))
    questions_df = pd.DataFrame([questions_data])

    with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
        questions_df.to_excel(writer, index=False, header=False, sheet_name="Questions")
            