In [1]:
# importing libraries
import pandas as pd
import os
from dotenv import load_dotenv

# loading environment variables
load_dotenv()
DATA_PATH = os.getenv('DATA_PATH')

# Fail fast with a readable message: os.getenv returns None when the variable
# is missing, and the later os.path.join(DATA_PATH, ...) calls would otherwise
# die with an opaque TypeError far from the real cause.
if not DATA_PATH:
    raise RuntimeError(
        "DATA_PATH is not set; define it in the environment or in a .env file."
    )

In [2]:
# Read the DSI question/answer pairs and preview the first rows
qa_pairs_path = os.path.join(DATA_PATH, '../processed/dsi-qa-pairs.parquet')
df = pd.read_parquet(qa_pairs_path)
df.head()

Unnamed: 0,question,answer
0,"During the COVID-19 pandemic, ns and judgment ...",stress loss data and experienced the power of ...
1,mmons spent the past five years in the Risk Ma...,Sydney Simmons
2,What is the name of the inaugural recipient of...,Sydney Simmons
3,Women in Technology of Tennessee (WiTT) is exc...,Sydney Simmons the inaugural recipient of the ...
4,What is the name of the program at Vanderbilt?,Data Science Master’s ProgramThe data science ...


In [3]:
from datasets import Dataset

# Format the data
def format_conversation(row, system_prompt="You are an assistant."):
    """Build a chat-style message list from one question/answer record.

    Args:
        row: Mapping-like record (for example a DataFrame row) that can be
            indexed with the keys "question" and "answer".
        system_prompt: Content of the leading system message. The default
            keeps the prompt this notebook has always used, so existing
            single-argument calls (such as ``df.apply(..., axis=1)``) are
            unaffected.

    Returns:
        A list of three dicts, each with "role" and "content" keys, ordered
        system, user, assistant.
    """
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": row["question"]},
        {"role": "assistant", "content": row["answer"]},
    ]

# Build one chat transcript per row and store it next to the source columns
conversation_column = df.apply(format_conversation, axis=1)
df["conversations"] = conversation_column

# Only the transcript column is handed to the Hugging Face Dataset
conversation_frame = df[["conversations"]]
dataset = Dataset.from_pandas(conversation_frame)


In [4]:
# Write the DSI conversations dataset to the processed-data folder
dsi_instruct_dir = os.path.join(DATA_PATH, '../processed/dsi-qa-pairs-instruct.hf')
dataset.save_to_disk(dsi_instruct_dir)

Saving the dataset (0/1 shards):   0%|          | 0/581 [00:00<?, ? examples/s]

### Using ChatGPT-generated data


In [2]:
# Read the ChatGPT-generated pairs, then give the columns fixed names
gpt_pairs_path = os.path.join(DATA_PATH, '../processed/chat-gpt-qa-pairs.parquet')
gpt_data = pd.read_parquet(gpt_pairs_path)
gpt_column_names = ['topic', 'question', 'answer']
gpt_data.columns = gpt_column_names
gpt_data.head()

Unnamed: 0,topic,question,answer
0,WiTT awards first annual 50% tuition scholarsh...,What is the main subject or theme of this topic?,The main theme revolves around the WiTT awards...
1,WiTT awards first annual 50% tuition scholarsh...,Who is the target audience for the WiTT awards...,The target audience includes women pursuing ca...
2,WiTT awards first annual 50% tuition scholarsh...,What is the significance of awarding 50% tuiti...,It helps in reducing financial barriers for wo...
3,WiTT awards first annual 50% tuition scholarsh...,How does this initiative impact the field of t...,It encourages more women to enter and thrive i...
4,WiTT awards first annual 50% tuition scholarsh...,What are potential challenges in implementing ...,Challenges could include securing consistent f...


In [6]:
# Turn every ChatGPT question/answer row into a chat transcript
gpt_conversations = gpt_data.apply(format_conversation, axis=1)
gpt_data['conversations'] = gpt_conversations

# Wrap just the transcript column in a Hugging Face Dataset
gpt_conversation_frame = gpt_data[['conversations']]
gpt_dataset = Dataset.from_pandas(gpt_conversation_frame)

# Write the full (unsplit) dataset to the processed-data folder
gpt_instruct_dir = os.path.join(DATA_PATH, '../processed/chat-gpt-qa-pairs-instruct.hf')
gpt_dataset.save_to_disk(gpt_instruct_dir)

Saving the dataset (0/1 shards):   0%|          | 0/1710 [00:00<?, ? examples/s]

What did the DSI-SRP fellowship grant Jingyi Chen to work in the laboratory of Christopher Candelaria?<br>
The AI model produces novel text that takes form of written correspondence, with style and content consistent with the data source?

In [7]:
# Bare expression: the cell output shows the dataset's features and row count
gpt_dataset

Dataset({
    features: ['conversations'],
    num_rows: 1710
})

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_datasets = gpt_dataset.train_test_split(seed=42, test_size=0.2)

# Unpack both partitions in one step
train_dataset, test_dataset = split_datasets['train'], split_datasets['test']

# Report how many examples landed in each partition
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 1368
Test dataset size: 342


In [9]:
# Save each split to its own directory, train first and then test
split_targets = (
    (train_dataset, '../processed/chat-gpt-qa-pairs-instruct-train.hf'),
    (test_dataset, '../processed/chat-gpt-qa-pairs-instruct-test.hf'),
)
for split_data, relative_dir in split_targets:
    split_data.save_to_disk(os.path.join(DATA_PATH, relative_dir))

Saving the dataset (0/1 shards):   0%|          | 0/1368 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/342 [00:00<?, ? examples/s]

In [11]:
# Read the all-topics question/answer file and preview the first rows
all_topics_path = os.path.join(DATA_PATH, '../processed/all_topics_questions_answers.parquet')
qa_df = pd.read_parquet(all_topics_path)
qa_df.head()

Unnamed: 0,question,answer
0,"What is the WiTT Scholarship, and who was its ...",The WiTT Scholarship is a 50% tuition scholars...
1,What organization partnered with Vanderbilt’s ...,Women in Technology of Tennessee (WiTT) partne...
2,What motivated Sydney Simmons to pursue a Data...,Her experience in Risk Management at Morgan St...
3,What are the key pillars of WiTT's mission tha...,Networking and community outreach are key pill...
4,How has WiTT supported women in technology fie...,"WiTT has awarded over $200,000 in scholarships..."
