# **Libraries**

In [None]:
import pandas as pd
import numpy as np
import json
import string
import gdown
import os
import re

# **Downloading File**

In [None]:
# Google Drive id of the file to fetch ('FILE_ID' is a placeholder value).
file_id = 'FILE_ID'
# Direct-download URL built from the id above.
url = f'https://drive.google.com/uc?export=download&id={file_id}'
# No output path is passed to gdown here.
# NOTE(review): the next cell reads "data 2.json" — confirm the download
# actually lands under that name.
gdown.download(url, quiet=False)

In [None]:
# Load the JSON dump into a DataFrame; later cells read its `model` and
# `fields` columns.
# NOTE(review): presumably this is the file fetched by the download cell — verify.
df = pd.read_json("data 2.json")

# **Preprocessing**

## **Dividing into parts**

In [None]:
# Slice the raw dump by the value of its `model` column.
model_name = df["model"]

# Question rows, one frame per part.
part1_df = df.loc[model_name.eq("api.part1question")]
part2_df = df.loc[model_name.eq("api.part2question")]
part3_df = df.loc[model_name.eq("api.part3question")]

# Answer/result rows, one frame per part.
part1_answers_df = df.loc[model_name.eq("school_api.schoolpart1result")]
part2_answers_df = df.loc[model_name.eq("school_api.schoolpart2result")]
part3_answers_df = df.loc[model_name.eq("school_api.schoolpart3result")]

# Parsed-session rows.
parsed_df = df.loc[model_name.eq("school_api.parsedsession")]

## **Splitting important parts**

In [None]:
def splitter(df):
    """Flatten the ``fields`` column and keep only ``pk`` and ``question_txt``.

    Every entry of ``df['fields']`` is expanded into its own columns (via
    ``pd.Series``) and placed next to the remaining columns of ``df``.

    Returns a new frame with exactly the ``pk`` and ``question_txt`` columns
    and a fresh 0..n-1 index.
    """
    other_columns = df.drop(columns=['fields'])
    field_columns = df['fields'].apply(pd.Series)
    flattened = pd.concat([other_columns, field_columns], axis=1)

    return flattened.loc[:, ['pk', 'question_txt']].reset_index(drop=True)

In [None]:
def splitter_answers(df):
    """Flatten ``fields`` and keep the answered rows.

    Every entry of ``df['fields']`` is expanded into its own columns (via
    ``pd.Series``); only ``session``, ``answer`` and ``question`` are kept,
    and rows whose ``answer`` is missing are discarded.

    Returns a new frame with a fresh 0..n-1 index.
    """
    flattened = pd.concat(
        [df.drop(columns=['fields']), df['fields'].apply(pd.Series)],
        axis=1,
    )
    answered = flattened.loc[:, ["session", 'answer', 'question']].dropna(subset=['answer'])

    return answered.reset_index(drop=True)

In [None]:
def splitter_parsed(df):
    """Flatten ``fields`` and keep the parsed-session columns.

    Every entry of ``df['fields']`` is expanded into its own columns (via
    ``pd.Series``); only ``session``, ``raw_json``, ``parsed_json``,
    ``feedback`` and ``band_score`` are kept.

    Returns a new frame with a fresh 0..n-1 index.
    """
    wanted = ['session', 'raw_json', "parsed_json", "feedback", "band_score"]
    flattened = pd.concat(
        [df.drop(columns=['fields']), df['fields'].apply(pd.Series)],
        axis=1,
    )

    return flattened.loc[:, wanted].reset_index(drop=True)

In [None]:
# Flatten the question frames of the three parts.
df_part1_ques_df, df_part2_ques_df, df_part3_ques_df = (
    splitter(question_frame)
    for question_frame in (part1_df, part2_df, part3_df)
)

# Flatten the answer frames of the three parts.
df_part1_ans_df, df_part2_ans_df, df_part3_ans_df = (
    splitter_answers(answer_frame)
    for answer_frame in (part1_answers_df, part2_answers_df, part3_answers_df)
)

# Flatten the parsed-session frame.
df_parsed = splitter_parsed(parsed_df)

## **Merging Data**

In [None]:
def merged_df(df1, df2):
    """Inner-join ``df1`` with ``df2`` where ``df1['pk']`` equals ``df2['question']``.

    Rows without a match on either side are not part of the result.
    """
    return df1.merge(df2, how='inner', left_on='pk', right_on='question')

# Pair each part's questions with that part's answers.
merged_df_1, merged_df_2, merged_df_3 = (
    merged_df(question_frame, answer_frame)
    for question_frame, answer_frame in (
        (df_part1_ques_df, df_part1_ans_df),
        (df_part2_ques_df, df_part2_ans_df),
        (df_part3_ques_df, df_part3_ans_df),
    )
)

In [None]:
# Stack the three per-part question/answer frames row-wise into one frame
# with a fresh index.
final_ques_ans = pd.concat([merged_df_1, merged_df_2, merged_df_3], axis=0, ignore_index=True)

# **Creating .txt files that store the question–answer pairs in a readable format**

In [None]:
# Folder that receives the per-session, per-part text files written below;
# created if it does not exist yet.
output_directory = 'data_modified_new'
os.makedirs(output_directory, exist_ok=True)

def format_output(data, part_number):
    """Render the rows of one part as a transcript string.

    The text opens with a ``part_<part_number>`` banner line. Each row of
    ``data`` then contributes an ``examiner`` line built from its
    ``question_txt`` value and a ``candidate`` line built from its ``answer``
    value, in row order.
    """
    banner = f"----------------- part_{part_number} ------------------\n"
    exchanges = [
        f"- examiner: {row['question_txt']}.\n candidate: {row['answer']}\n"
        for _, row in data.iterrows()
    ]

    return banner + "".join(exchanges)

# Write one transcript file per (session, part) into `output_directory`.
# All three parts follow the same steps, so a single loop handles them; the
# part number selects both the banner text and the file-name suffix.
for part_number, part_frame in enumerate((merged_df_1, merged_df_2, merged_df_3), start=1):
    for session, data in part_frame.groupby('session'):
        formatted_output = format_output(data, part_number)
        part_path = os.path.join(output_directory, f'session_{session}_part{part_number}.txt')
        with open(part_path, 'w') as file:
            file.write(formatted_output)

# Report the folder that was actually written to rather than a hard-coded name.
print(f"Processing complete. Files saved in the '{output_directory}' directory.")


Processing complete. Files saved in the 'data_modified' directory.


In [None]:
output_directory = 'combined_parts'
os.makedirs(output_directory, exist_ok=True)

# For every session that appears in the part-1 frame, append whichever of its
# per-part files exist (in part order) to a single combined file.
for session in merged_df_1['session'].unique():
    combined_file = os.path.join(output_directory, f'session_{session}.txt')

    with open(combined_file, 'w') as outfile:
        for part_number in (1, 2, 3):
            part_file = os.path.join('data_modified_new', f'session_{session}_part{part_number}.txt')
            if not os.path.exists(part_file):
                continue
            with open(part_file, 'r') as infile:
                outfile.write(infile.read())

print("Combined files saved in the 'combined_parts' directory.")


Combined files saved in the 'combined_parts' directory.


## **Downloading zip file**

In [None]:
from google.colab import files
import shutil

# NOTE(review): '/content/data_new_filtered' is only written by a cell further
# down in this notebook, so on a fresh kernel this cell has to run after it.
archive_source = '/content/data_new_filtered'

# make_archive returns the path of the zip it created. Download that exact
# path instead of a relative name, which would only resolve when the working
# directory happens to be /content.
archive_path = shutil.make_archive(archive_source, 'zip', archive_source)

files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Folder holding the combined per-session transcripts. Kept relative, matching
# how the 'combined_parts' folder is created earlier in this notebook, so the
# lookup does not depend on the working directory being /content.
directory = 'combined_parts'

## **Working with .txt files**

In [None]:
# Collect one record per .txt file in `directory`: the digits found in the
# file name (stored under 'session') and the file's full text ('content').
data = []
for filename in os.listdir(directory):
    if not filename.endswith(".txt"):
        continue
    session_number = ''.join(ch for ch in filename if ch.isdigit())
    with open(os.path.join(directory, filename), 'r') as file:
        content = file.read()
    data.append({'session': session_number, 'content': content})

df_txt = pd.DataFrame(data)

In [None]:
# The session ids were collected as digit strings; cast them to int64 before
# the merge on `session` in the next cell.
df_txt['session'] = df_txt['session'].astype('int64')

In [None]:
# Left join on `session`: every transcript row is kept and the parsed-session
# columns are attached where a match exists.
# NOTE(review): a session with several parsed rows would presumably appear
# several times here — confirm.
merged_df_full = pd.merge(df_txt, df_parsed, on='session', how='left')

In [None]:
output_directory = 'data_new'
os.makedirs(output_directory, exist_ok=True)

# The transcript text is in the 'content' column until a later cell renames it
# to 'transcript'. Use whichever column is present so this cell works both on a
# fresh top-to-bottom run and when re-run after the rename.
transcript_column = 'transcript' if 'transcript' in merged_df_full.columns else 'content'

# Write each row's transcript to session_<id>.txt inside `output_directory`.
for index, row in merged_df_full.iterrows():
    session_number = row['session']
    content = row[transcript_column]
    filename = f'session_{session_number}.txt'
    filepath = os.path.join(output_directory, filename)
    with open(filepath, 'w') as file:
        file.write(content)

print("Transcripts saved to individual files in the 'data_new' directory.")

Transcripts saved to individual files in the 'data_new' directory.


In [None]:
# Give the kept columns their final names, then discard the parsed JSON and
# feedback columns.
merged_df_full = (
    merged_df_full
    .rename(columns={'content': 'transcript', 'raw_json': 'feedback_gpt', "band_score": "band_score_gpt"})
    .drop(columns=['parsed_json', 'feedback'])
)

In [None]:
# Discard the band score column as well.
merged_df_full = merged_df_full.drop(columns='band_score_gpt')

In [None]:
# Keep only the rows in which every remaining column has a value.
merged_df_full = merged_df_full.dropna()

In [None]:
def _longest_feedback_label(feedback):
    """Return the index label of the entry with the greatest length."""
    return feedback.str.len().idxmax()


# For each session, pick the row whose `feedback_gpt` is longest.
idx = merged_df_full.groupby('session')['feedback_gpt'].apply(_longest_feedback_label)

merged_df_full = merged_df_full.loc[idx]

In [None]:
# Keep only the transcripts whose text mentions all three part markers.
has_all_parts = df_txt['content'].str.contains('part_1')
for part_marker in ('part_2', 'part_3'):
    has_all_parts = has_all_parts & df_txt['content'].str.contains(part_marker)

filtered_df = df_txt[has_all_parts]

print("Number of rows before filtering:", len(df_txt))
print("Number of rows after filtering:", len(filtered_df))

df_txt = filtered_df

Number of rows before filtering: 10943
Number of rows after filtering: 9518


In [None]:
output_directory = 'data_new_filtered'
os.makedirs(output_directory, exist_ok=True)

# Write each remaining transcript to session_<id>.txt inside `output_directory`.
for session_number, content in zip(df_txt['session'], df_txt['content']):
    filepath = os.path.join(output_directory, f'session_{session_number}.txt')
    with open(filepath, 'w') as file:
        file.write(content)

print("Transcripts saved to individual files in the 'data_new_filtered' directory.")

Transcripts saved to individual files in the 'data_new_filtered' directory.


In [None]:
# Count the entries listed in the `data_2` directory.
# NOTE(review): no visible cell in this notebook creates `data_2` — confirm
# which folder was meant.
!ls data_2 | wc -l

2000


# **Dividing Dataset into parts**

In [None]:
import os
import shutil

data_folder = 'data_new_filtered'

# Only regular files are distributed. Skipping directories keeps the cell safe
# to re-run: the folder_N directories created below are never picked up and
# moved into themselves.
all_files = sorted(
    name
    for name in os.listdir(data_folder)
    if os.path.isfile(os.path.join(data_folder, name))
)

files_per_folder = 2000
num_folders = 4

for i in range(num_folders):
    start_index = i * files_per_folder
    end_index = start_index + files_per_folder

    # The last folder takes everything that is left, however many files that is.
    if i == num_folders - 1:
        end_index = len(all_files)

    new_folder = os.path.join(data_folder, f'folder_{i+1}')
    os.makedirs(new_folder, exist_ok=True)

    for file in all_files[start_index:end_index]:
        shutil.move(os.path.join(data_folder, file), os.path.join(new_folder, file))

print(f"Files have been successfully distributed into {num_folders} folders.")

Files divided into folders.
