### Import the necessary libraries

In [1]:
#Import the necessary libraries

import openai
import typing
from typing import List, Tuple

import tiktoken
import ast

import pandas as pd
import time

### Data ingestion and pre-processing

In [None]:
import pandas as pd

# Read the raw survey export.
data = pd.read_csv('/mnt/data/sample_data.csv')

# Join the two free-text answers into one string per row. A missing answer
# becomes an empty string, so the separating space is always present.
concerns_text = data['concerns'].fillna('')
extra_text = data['anything else'].fillna('')
data['combined_text'] = concerns_text + ' ' + extra_text

# Integer code assigned to each category abbreviation.
label_map = {'AC': 0, 'TC': 1, 'PC': 2, 'NC': 3}

# Swap the abbreviations for their codes; values missing from label_map
# come out as NaN.
# NOTE(review): the CSV printed in the "Creation of text files" cell uses
# space-separated headers ('concerns category'); confirm this file really
# uses the underscore names.
category_columns = ['concerns_category', 'anything_else_category']
for column in category_columns:
    data[column] = data[column].map(label_map)

# Row label = the larger of the two category codes.
# NOTE(review): described originally as the "more critical concern"; that
# only holds if a higher code means more critical -- confirm against the
# ordering chosen in label_map.
data['label'] = data[category_columns].max(axis=1)



### Creation of text files

In [4]:
import pandas as pd

# Step 1: Load the CSV file
file_path = 'merged_datafile.csv'

# Candidate encodings, tried in order until one decodes the file.
# NOTE(review): 'latin1' maps every possible byte to a character, so it can
# never raise UnicodeDecodeError; the entries after it are effectively never
# tried ('iso-8859-1' is also just an alias of 'latin1'). If the file is
# really Windows-1252, consider moving 'cp1252' ahead of 'latin1'.
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

for encoding in encodings:
    try:
        df = pd.read_csv(file_path, encoding=encoding)
    except UnicodeDecodeError:
        print(f"Failed to read file with encoding: {encoding}")
    else:
        print(f"File successfully read with encoding: {encoding}")
        break
else:
    # No candidate worked: stop here rather than letting the next line run
    # against an undefined `df` (or a stale one left over from an earlier
    # notebook run).
    raise ValueError(
        f"Could not decode {file_path!r} with any of: {encodings}"
    )

# Display the first few rows of the dataset
print(df.head())

Failed to read file with encoding: utf-8
File successfully read with encoding: latin1
                                            concerns concerns category  \
0  That all of my knowledge from calc BC escapes ...                AC   
1  My only concern about this course is that I wi...                AC   
2  My only concern is if I'll be able to study/pr...                AC   
3  My only concern is that so far the video lesso...                AC   
4  One thing that was concerning for me last seme...                AC   

                                       anything else anything else category  
0  The sample exams and quizzes during linear alg...                     AC  
1                                                NaN                     NC  
2                                                NaN                     NC  
3                                                NaN                     NC  
4                                                NaN                     NC  


In [5]:
data = df

# Step 2: One bucket of response texts per category label
responses = {label: [] for label in ('AC', 'PC', 'TC', 'NC')}

# Step 3: Walk the rows and file each non-missing answer under its own label.
# Each row carries two (text column, label column) pairs; they are handled in
# this order so the contents of every bucket keep the row-by-row ordering.
text_and_label_columns = (
    ('concerns', 'concerns category'),
    ('anything else', 'anything else category'),
)
for _, row in data.iterrows():
    for text_column, label_column in text_and_label_columns:
        row_label = row[label_column]
        answer = row[text_column]
        # Skip unknown labels (including NaN) and missing answers.
        if row_label in responses and pd.notna(answer):
            responses[row_label].append(answer)

# Step 4: Dump each bucket to '<label>.txt' as the text of a Python list
for label, texts in responses.items():
    with open(f'{label}.txt', 'w') as out_file:
        out_file.write(str(texts))

print("Text files created successfully.")

Text files created successfully.


### Create a shuffled dataset

In [None]:
def read_list_from_file(file_path):
    """Load a Python literal (expected to be a list) stored as text in a file.

    The whole file is read and parsed with ``ast.literal_eval``, which only
    accepts literals and never executes code.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The parsed value, or ``None`` when the file is missing, its content
        is not a valid literal, or any other error occurs. In those cases a
        message describing the problem is printed instead of raising.
    """
    try:
        with open(file_path, 'r') as handle:
            parsed = ast.literal_eval(handle.read())
    except FileNotFoundError:
        print(f"The file '{file_path}' does not exist.")
    except (SyntaxError, ValueError) as e:
        print(f"Error while evaluating the file content: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        return parsed

    # Every failure path above ends up here.
    return None

def df_from_file(file_path):
    """Build a two-column DataFrame from one category file.

    The file is loaded with ``read_list_from_file``. Each element of the
    loaded list becomes one row of the ``response`` column, and every row
    gets the same ``category`` value, taken from the file's base name with a
    trailing ``.txt`` removed (``"../Data/AC.txt"`` -> ``"AC"``).

    Args:
        file_path: Path to a category file such as ``"AC.txt"``.

    Returns:
        DataFrame with columns ``response`` and ``category``.

    Raises:
        TypeError: If the file could not be loaded (the reader returned
            ``None`` after printing its own diagnostic).
    """
    import os

    listy = read_list_from_file(file_path)
    if listy is None:
        # Same exception type that len(None) would raise, but with a message
        # that points at the real cause.
        raise TypeError(f"Could not load a list of responses from {file_path!r}")

    # Use only the file name: a plain str.replace on the full path would keep
    # the directory part in the label (e.g. "../Data/AC").
    category = os.path.basename(file_path)
    if category.endswith(".txt"):
        category = category[:-len(".txt")]

    df = pd.DataFrame({'response': listy, 'category': [category] * len(listy)})
    return df
def multiple_df(files):
    """Combine several category files into a single DataFrame.

    Args:
        files: Iterable of file paths; each one is loaded with
            ``df_from_file``.

    Returns:
        One DataFrame holding the rows of every file, in the order given,
        with a fresh 0..n-1 index.
    """
    per_file_frames = [df_from_file(path) for path in files]
    return pd.concat(per_file_frames, ignore_index=True)


# Load the four per-category files into one DataFrame.
ret = multiple_df([
    "../Data/AC.txt",
    "../Data/PC.txt",
    "../Data/TC.txt",
    "../Data/NC.txt",
])

# frac=1 samples every row, i.e. a random permutation of the data; the old
# index is then discarded so rows are renumbered from 0.
# NOTE(review): no random_state is passed, so the order differs on every run.
shuffled_df = ret.sample(frac=1)
shuffled_df = shuffled_df.reset_index(drop=True)
shuffled_df