In [None]:
import pandas as pd

# Load the enriched SQuAD export and list every row that lacks a value in at
# least one of the three required columns.
df = pd.read_csv('filtered_enriched_squad.csv')

required_columns = ['question', 'topic', 'text']
has_missing_field = df[required_columns].isnull().any(axis=1)
missing_data = df[has_missing_field]

print("Rows with missing fields:")
print(missing_data)

print("Number of rows with missing fields:", len(missing_data))


In [None]:
import pandas as pd
from transformers import T5Tokenizer

# Remove every row of combined_squad.csv whose "topic + text" input is longer
# than the token budget and save the remainder as adjusted_combined_squad.csv.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

df = pd.read_csv('combined_squad.csv')

def count_exceeding_tokens(topic, text, max_length=510):
    """Report whether "<topic> <text>" is longer than ``max_length`` tokens.

    Args:
        topic: Topic string placed first in the combined input.
        text: Passage string appended after a single space.
        max_length: Largest token count that is still accepted.

    Returns:
        True when ``tokenizer.encode`` yields more than ``max_length`` ids for
        the combined string, otherwise False.
    """
    combined_text = topic + " " + text
    token_count = len(tokenizer.encode(combined_text))
    return token_count > max_length


# Flag each row once so the report and the filter below share one decision.
df['exceeds_limit'] = df.apply(lambda row: count_exceeding_tokens(row['topic'], row['text']), axis=1)

exceeding_entries = df[df['exceeds_limit']]
print("Entries exceeding 510 tokens:")
print(exceeding_entries[['question', 'topic', 'text']])
exceeding_count = df['exceeds_limit'].sum()

print(f"Number of entries where 'topic' + 'text' exceed 510 tokens: {exceeding_count}")

# drop() returns a new frame here. The previous in-place drop operated on a
# boolean-mask slice of `df`, which triggers pandas' SettingWithCopyWarning.
df_filtered = df[~df['exceeds_limit']].drop(columns=['exceeds_limit'])

df_filtered.to_csv('adjusted_combined_squad.csv', index=False)

print(f"Updated dataset saved. Number of entries now: {df_filtered.shape[0]}")


In [None]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer

# Build a two-passage dataset from filtered_enriched_squad.csv.
#
# Each source row is paired with a randomly chosen other row whose text
# differs. The pair is kept only when the two texts joined by a space encode to
# fewer than 500 tokens. Passes over the source repeat until the target number
# of rows has been collected.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

df = pd.read_csv('filtered_enriched_squad.csv')

output_columns = ['question', 'topic', 'text', 'context1', 'context2', 'question2', 'topic2']
target_rows = 10001
output_path = '2newcombined_squad.csv'
# Stop instead of looping forever when the source cannot yield any valid pair
# (for example an empty file, or every pair being too long).
max_idle_passes = 100

np.random.seed(42)

# Accepted rows are collected in a list and converted to a DataFrame once at
# the end; concatenating inside the loop copies the whole frame on every row.
combined_records = []
idle_passes = 0

while len(combined_records) < target_rows:
    rows_before_pass = len(combined_records)

    for index, row in df.iterrows():

        if len(combined_records) >= target_rows:
            break

        # Candidates: any other row with a different passage text.
        other_rows = df[(df.index != index) & (df['text'] != row['text'])]
        if not other_rows.empty:

            other_index = np.random.choice(other_rows.index)
            other_row = df.loc[other_index]

            other_text = other_row['text']

            combined_text = row['text'] + " " + other_text
            token_count = len(tokenizer.encode(combined_text))

            if token_count < 500:
                combined_records.append({
                    'question': row['question'],
                    'topic': row['topic'],
                    'text': combined_text,
                    'context1': row['text'],
                    'context2': other_text,
                    'question2': other_row['question'],
                    'topic2': other_row['topic'],
                })

    if len(combined_records) == rows_before_pass:
        idle_passes += 1
        if idle_passes >= max_idle_passes:
            raise RuntimeError(
                f"No passage pair under the token limit was found in {max_idle_passes} "
                f"consecutive passes; collected {len(combined_records)} of {target_rows} rows."
            )
    else:
        idle_passes = 0

combined_df = pd.DataFrame(combined_records, columns=output_columns)
combined_df.to_csv(output_path, index=False)

# The message is built from the path actually written; it previously named a
# different file ('combined_squad.csv').
print(f"Combined dataset created and saved as '{output_path}'.")
print(f"Total entries in new dataset: {combined_df.shape[0]}")


In [None]:
import pandas as pd

# Report rows of the combined dataset that repeat an earlier row's value in
# the columns listed in `duplicate_subset`.
combined_df = pd.read_csv('/home/qiyu/Dev/ziqing/T5/newcombined_squad.csv')

duplicate_subset = ['text']
subset_label = ", ".join(duplicate_subset)

# `duplicated` returns one boolean per row, so its length is the total row
# count. The number of duplicates is the number of True entries.
duplicates = combined_df.duplicated(subset=duplicate_subset)
duplicate_count = int(duplicates.sum())

print(f"Number of duplicate rows based on {subset_label}:", duplicate_count)
if duplicate_count > 0:
    # Show the duplicated rows themselves rather than the boolean mask.
    print(combined_df[duplicates])
else:
    print(f"No duplicates found based on {subset_label}.")


In [None]:
import pandas as pd
from transformers import T5Tokenizer

# Remove every row of 2newcombined_squad.csv whose "topic2 + text" input is
# longer than the token budget and save the remainder as
# adjusted_combined_squad.csv.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

df = pd.read_csv('/home/qiyu/Dev/ziqing/T5/2newcombined_squad.csv')

def count_exceeding_tokens(topic, text, max_length=510):
    """Report whether "<topic> <text>" is longer than ``max_length`` tokens.

    Args:
        topic: Topic string placed first in the combined input.
        text: Passage string appended after a single space.
        max_length: Largest token count that is still accepted.

    Returns:
        True when ``tokenizer.encode`` yields more than ``max_length`` ids for
        the combined string, otherwise False.
    """
    combined_text = topic + " " + text
    token_count = len(tokenizer.encode(combined_text))
    return token_count > max_length

# The second topic of each pair is the one checked against the budget here.
df['exceeds_limit'] = df.apply(lambda row: count_exceeding_tokens(row['topic2'], row['text']), axis=1)

exceeding_entries = df[df['exceeds_limit']]
print("Entries exceeding 510 tokens:")
print(exceeding_entries[['question', 'topic2', 'text']])
exceeding_count = df['exceeds_limit'].sum()

print(f"Number of entries where 'topic2' + 'text' exceed 510 tokens: {exceeding_count}")

# drop() returns a new frame here. The previous in-place drop operated on a
# boolean-mask slice of `df`, which triggers pandas' SettingWithCopyWarning.
df_filtered = df[~df['exceeds_limit']].drop(columns=['exceeds_limit'])

df_filtered.to_csv('adjusted_combined_squad.csv', index=False)

print(f"Updated dataset saved. Number of entries now: {df_filtered.shape[0]}")


In [None]:
import pandas as pd

# Double the combined dataset: append a second copy of every row in which the
# two contexts are joined in the opposite order (context2 first).
file_path = 'newcombined_squad.csv'
new_file_path = 'doubled_newcombined_squad.csv'

df = pd.read_csv(file_path)

# Only the `text` column differs in the copy; all other columns are unchanged.
reversed_text = df['context2'] + " " + df['context1']
df_copy = df.assign(text=reversed_text)

combined_df = pd.concat([df, df_copy], ignore_index=True)
combined_df.to_csv(new_file_path, index=False)

print(f"Doubled dataset saved as '{new_file_path}'.")
print(f"Total entries in the new dataset: {combined_df.shape[0]}")


In [None]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer

# Build a two-passage dataset from filtered_enriched_KhanQ.csv.
#
# Each source row is paired with a randomly chosen other row. The pair is kept
# only when the two texts joined by a space encode to fewer than 500 tokens.
# Passes over the source repeat until the target number of rows is collected.
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')

df = pd.read_csv('/home/qiyu/Dev/ziqing/T5/filtered_enriched_KhanQ.csv')

output_columns = ['question', 'topic', 'text', 'question2', 'topic2', 'context1', 'context2']
target_rows = 1000
# Stop instead of looping forever when the source cannot yield any valid pair
# (for example an empty file, or every pair being too long).
max_idle_passes = 100

np.random.seed(42)

# Accepted rows are collected in a list and converted to a DataFrame once at
# the end; concatenating inside the loop copies the whole frame on every row.
combined_records = []
idle_passes = 0

while len(combined_records) < target_rows:
    rows_before_pass = len(combined_records)

    for index, row in df.iterrows():

        if len(combined_records) >= target_rows:
            break

        # Filter the index directly; there is no need to copy the whole frame
        # just to list the labels of the other rows.
        other_indexes = df.index[df.index != index].tolist()

        if len(other_indexes) > 0:

            other_index = np.random.choice(other_indexes)
            other_row = df.loc[other_index]

            new_text = row['text'] + " " + other_row['text']

            token_count = len(tokenizer.encode(new_text))

            if token_count < 500:
                combined_records.append({
                    'question': row['question'],
                    'topic': row['topic'],
                    'text': new_text,
                    'question2': other_row['question'],
                    'topic2': other_row['topic'],
                    'context1': row['text'],
                    'context2': other_row['text'],
                })

    if len(combined_records) == rows_before_pass:
        idle_passes += 1
        if idle_passes >= max_idle_passes:
            raise RuntimeError(
                f"No passage pair under the token limit was found in {max_idle_passes} "
                f"consecutive passes; collected {len(combined_records)} of {target_rows} rows."
            )
    else:
        idle_passes = 0

combined_df = pd.DataFrame(combined_records, columns=output_columns)
combined_df.to_csv('unready_combined_KhanQ.csv', index=False)

print("Combined dataset created and saved as 'unready_combined_KhanQ.csv'.")
print(f"Total entries in new dataset: {combined_df.shape[0]}")


In [None]:
import pandas as pd
from transformers import T5Tokenizer

# Report (without filtering) which rows of unready_combined_KhanQ.csv have a
# "topic + text" input longer than the token budget.
tokenizer = T5Tokenizer.from_pretrained('google-t5/t5-small')

df = pd.read_csv('unready_combined_KhanQ.csv')

def count_exceeding_tokens(topic, text, max_length=510):
    """Return True when "<topic> <text>" encodes to more than ``max_length`` ids.

    The two strings are joined with one space and passed to
    ``tokenizer.encode``; the length of the returned id list is compared
    against ``max_length``.
    """
    token_ids = tokenizer.encode(" ".join((topic, text)))
    return len(token_ids) > max_length

df['exceeds_limit'] = df.apply(
    lambda record: count_exceeding_tokens(record['topic'], record['text']),
    axis=1,
)

over_limit = df['exceeds_limit']
exceeding_entries = df[over_limit]
print("Entries exceeding 510 tokens:")
print(exceeding_entries[['question', 'topic', 'text']])
exceeding_count = over_limit.sum()

print(f"Number of entries where 'topic' + 'text' exceed 510 tokens: {exceeding_count}")

In [None]:
import pandas as pd

# Draw a fixed-size, seeded random sample from the combined KhanQ rows and
# save it as combined_KhanQ.csv.
random_seed = 42
sample_size = 653

data = pd.read_csv('unready_combined_KhanQ.csv')
print("original:", len(data))
print(data.head())

sampled_data = data.sample(n=sample_size, random_state=random_seed)
print("after:", len(sampled_data))
print(sampled_data.head())

sampled_data.to_csv('combined_KhanQ.csv', index=False)


In [None]:
import numpy as np
import pandas as pd

# Pick 30 random evaluation items (seeded) and export, for each, the reference
# question and the two generated questions to human_data.csv.
np.random.seed(42)

references_path = '/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_newcs_once/references_T5_combinedsquad_once.npy'
predictions_path = '/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_newcs_once/predictions_T5_combinedsquad_once.npy'
predictions_topic2_path = '/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_newcs_once/predictionstopic2_T5_combinedsquad_once.npy'

# The three arrays are loaded in the same order as the paths are listed.
references, predictions, predictions_topic2 = (
    np.load(array_path)
    for array_path in (references_path, predictions_path, predictions_topic2_path)
)

# Sample positions without replacement; the same positions index all arrays.
indices = np.random.choice(len(references), size=30, replace=False)
selected_references = references[indices]
selected_predictions = predictions[indices]
selected_predictions_topic2 = predictions_topic2[indices]

export_columns = {
    'Index': indices,
    'Label Question': selected_references,
    'GQ1': selected_predictions,
    'GQ2': selected_predictions_topic2,
}
df = pd.DataFrame(export_columns)

output_file = 'human_data.csv'
df.to_csv(output_file, index=False)

print(f"Data extracted and saved as '{output_file}'. Total entries: {len(df)}")


In [None]:
import pandas as pd
import numpy as np

# Prepare the survey sheet: randomly exchange GQ1/GQ2 per row, shuffle the row
# order, and write one text record per row to shuffled_human_data.txt.
file_path = '/home/qiyu/Dev/ziqing/T5/human_data.csv'
output_file = 'shuffled_human_data.txt'

df = pd.read_csv(file_path)

np.random.seed(42)
# One draw per row, in row order; a row is swapped when its draw is > 0.5.
swap_mask = np.array([np.random.rand() > 0.5 for _ in df.index], dtype=bool)

original_gq1 = df['GQ1'].copy()
original_gq2 = df['GQ2'].copy()
# `where` keeps the value on rows that are not swapped and takes the other
# column's value on rows that are.
df['GQ1'] = original_gq1.where(~swap_mask, original_gq2)
df['GQ2'] = original_gq2.where(~swap_mask, original_gq1)

# Shuffle all rows and renumber them from zero.
df = df.sample(frac=1).reset_index(drop=True)

lines = df.apply(
    lambda record: f"Index: {record['Index']}\nLabel Question: {record['Label Question']}\nGQ1: {record['GQ1']}\nGQ2: {record['GQ2']}\n",
    axis=1,
)

with open(output_file, 'w') as survey_file:
    survey_file.writelines(lines)

print(f"Survey data prepared and saved as '{output_file}'.")


In [None]:
import pandas as pd
import numpy as np

# Same random GQ1/GQ2 exchange and row shuffle as the survey text export, but
# saved as CSV with a `Swapped` column (1 = GQ1 and GQ2 were exchanged).
file_path = '/home/qiyu/Dev/ziqing/T5/human_data.csv'
output_file = 'shuffled_with_swap.csv'

df = pd.read_csv(file_path)

np.random.seed(42)
# One draw per row, in row order; a row is swapped when its draw is > 0.5.
swap_mask = np.array([np.random.rand() > 0.5 for _ in df.index], dtype=bool)

original_gq1 = df['GQ1'].copy()
original_gq2 = df['GQ2'].copy()
df['GQ1'] = original_gq1.where(~swap_mask, original_gq2)
df['GQ2'] = original_gq2.where(~swap_mask, original_gq1)
df['Swapped'] = swap_mask.astype('int64')

# Shuffle all rows and renumber them from zero.
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv(output_file, index=False)

print(f"Data with swap information saved as '{output_file}'.")


In [None]:
import pandas as pd

# Join the students' survey answers with the swap sheet and convert every
# answer to 0/1, then save the per-question votes as human_result.csv.
student_columns = ['student1', 'student2', 'student3', 'student4']

shuffled_data = pd.read_csv('shuffled_with_swap.csv')
human_result = pd.read_csv('human_survey.csv')

# Make the join keys the same integer type on both sides.
shuffled_data['Index'] = shuffled_data['Index'].astype(int)
human_result['index'] = human_result['index'].astype(int)

# Duplicate keys are only reported; processing continues either way.
if not shuffled_data['Index'].is_unique:
    print("shuffled_with_swap.csv has duplicate Index")
if not human_result['index'].is_unique:
    print("human_survey.csv has duplicate Index")

data_merged = human_result.merge(shuffled_data, left_on='index', right_on='Index', how='inner')

if not data_merged['index'].is_unique:
    print("has duplicate Index")

not_swapped = data_merged['Swapped'].eq(0)
swapped = data_merged['Swapped'].eq(1)

for student in student_columns:
    answer = data_merged[student]
    # 1 when the answer is 'GQ1' on an unswapped row or 'GQ2' on a swapped row.
    vote = (answer.eq('GQ1') & not_swapped) | (answer.eq('GQ2') & swapped)
    data_merged[student] = vote.astype(int)

result = data_merged[['index'] + student_columns]

result.to_csv('human_result.csv', index=False)


In [None]:
import pandas as pd

# Read the per-question student votes
human_result = pd.read_csv('/home/qiyu/Dev/ziqing/T5/human_result.csv')

# Repeat every index twice in a row: [a, a, b, b, ...]
duplicated_indices = [index for index in human_result['index'] for _ in range(2)]

# Within each pair the first row is 'rel' and the second is 'non-rel'
question_types = ['non-rel' if position % 2 else 'rel' for position in range(len(duplicated_indices))]

# The three score columns start out as empty strings and are filled in later
new_df = pd.DataFrame({
    'ID': duplicated_indices,
    'Question Type': question_types,
    'Human Label': '',
    'Relatedness Score': '',
    'BERTScore': '',
})

# Write the template sheet
new_csv_filename = 'human_individual.csv'
new_df.to_csv(new_csv_filename, index=False)

# Report where the sheet was written
print(f"New CSV file created and saved to {new_csv_filename}")


In [None]:
import pandas as pd

# Read the per-question 0/1 student votes from human_result.csv
human_result_path = '/home/qiyu/Dev/ziqing/T5/human_result.csv'
human_result = pd.read_csv(human_result_path)

# Function to calculate the score based on the student responses
def calculate_score(row):
    """Turn one question's student votes into ``(rel_score, non_rel_score)``.

    ``rel_score`` is the share of votes equal to 1 and ``non_rel_score`` is
    its complement. With four votes this gives 1/0 for a unanimous row and
    0.75/0.25 for a single dissent, exactly as before. Rows with two or more
    0-votes are no longer reported as 0.75/0.25.

    Args:
        row: Iterable of 0/1 votes (for example the student columns of a row).

    Returns:
        Tuple ``(rel_score, non_rel_score)`` of floats that sum to 1. An empty
        ``row`` keeps the previous result of a full score, ``(1.0, 0.0)``.
    """
    votes = list(row)
    if not votes:
        # Preserve the historical outcome for an empty selection.
        return (1.0, 0.0)
    rel_score = sum(1 for vote in votes if vote == 1) / len(votes)
    return (rel_score, 1 - rel_score)

# Score every question from its four student votes, then split the resulting
# (rel, non-rel) tuples into a two-column table keyed by the question index
student_columns = ['student1', 'student2', 'student3', 'student4']
human_result['scores'] = human_result.apply(
    lambda record: calculate_score(record[student_columns]),
    axis=1,
)
scores_df = pd.DataFrame(
    human_result['scores'].tolist(),
    columns=['rel_score', 'non_rel_score'],
    index=human_result['index'],
)

# Read the sheet that holds one 'rel' and one 'non-rel' row per question
new_human_result = pd.read_csv('/home/qiyu/Dev/ziqing/T5/human_individual.csv')

# Function to update the scores based on 'id' and 'Question Type'
def update_scores(row, scores_df):
    """Look up the score that belongs to one row of the individual sheet.

    The row's ``'ID'`` selects the line of ``scores_df``. A ``'Question Type'``
    of ``'rel'`` selects the ``rel_score`` column; any other value selects
    ``non_rel_score``.
    """
    score_column = 'rel_score' if row['Question Type'] == 'rel' else 'non_rel_score'
    return scores_df.loc[row['ID'], score_column]

# Fill 'Human Label' row by row; `scores_df` is forwarded to update_scores
# as a keyword argument
new_human_result['Human Label'] = new_human_result.apply(
    update_scores,
    axis=1,
    scores_df=scores_df,
)

# Write the labelled sheet under a new name
updated_csv_filename = 'human2_individual.csv'
new_human_result.to_csv(updated_csv_filename, index=False)

# Report where the sheet was written
print(f"Updated CSV file has been saved to {updated_csv_filename}")


In [None]:
import pandas as pd

# Load the two BERTScore tables and the human-label sheet
bert_gq1_lq_scores = pd.read_csv('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_newcs_once/BERT30_gq1_lq.csv')
human_individual = pd.read_csv('/home/qiyu/Dev/ziqing/T5/human2_individual.csv')
bert_gq2_lq_scores = pd.read_csv('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_newcs_once/BERT30_gq2_lq.csv')

def _bert_f1_or_existing(row, question_type, score_table):
    """Return the 'F1 Score' for ``row`` from ``score_table``, or its current BERTScore.

    Only rows whose 'Question Type' equals ``question_type`` are looked up.
    ``row['ID'] - 1`` is used as a zero-based row position in ``score_table``;
    when that position is outside the table the existing 'BERTScore' is kept.
    """
    if row['Question Type'] != question_type:
        return row['BERTScore']
    position = row['ID'] - 1
    if not (0 <= position < len(score_table)):
        return row['BERTScore']
    return score_table.iloc[position]['F1 Score']

def update_bert_score1(row):
    """BERTScore for a 'rel' row, taken from the GQ1-vs-label table."""
    return _bert_f1_or_existing(row, 'rel', bert_gq1_lq_scores)

def update_bert_score2(row):
    """BERTScore for a 'non-rel' row, taken from the GQ2-vs-label table."""
    return _bert_f1_or_existing(row, 'non-rel', bert_gq2_lq_scores)

# Two passes: the first fills the 'rel' rows, the second fills the 'non-rel'
# rows while leaving the values written by the first pass untouched.
for fill_scores in (update_bert_score1, update_bert_score2):
    human_individual['BERTScore'] = human_individual.apply(fill_scores, axis=1)

updated_csv_filename = 'human_individual.csv'
human_individual.to_csv(updated_csv_filename, index=False)