In [15]:
import csv 
import spacy
import tkinter as tk
from tkinter import filedialog


nlp = spacy.load("en_core_web_md")

In [16]:
# return list of sentences from dataset with duplicates eliminated 
# input format: [[int1,string],[int2,string],[int3,string]]
# output format: [sentence1,sentence2,sent....]
def process_data_from_csv(csv_reader):
    seen_sentences = set()
    result = []

    def get_sentences(text):
        sentences = text.split('. ')
        if sentences and sentences[-1].endswith('.'):
            sentences[-1] = sentences[-1][:-1]
        return sentences

    next(csv_reader, None)  # Skip header if present

    for row in csv_reader:
        text = row[1]  # second column contains the text

        sentences = get_sentences(text)
        for sentence in sentences:
            if sentence not in seen_sentences:
                seen_sentences.add(sentence)
                result.append(sentence)  

    return result

# Filters sentences by contextual relevance to keyword groups.
# Input - data: list of sentences e.g., ["sentence1", "sentence2", ...]
# Input - keywords: list of keyword groups e.g., [["word1", "word2"], ["word3"]]
# Output: list of lists; each sublist contains sentences matching each keyword group,
#         with non-matching positions filled with an empty string.
def filter_context_related_sentences(data, keyword_groups):
    # Initialize a list to store the final structured data
    filtered_data = []
    target_tokens_groups = [[nlp(keyword) for keyword in group] for group in keyword_groups]

    # Context filtering for sentences
    count = 1
    for sentence in data:
        print("Evaluating Sentence", count)
        count += 1
        doc = nlp(sentence)
        # Temporary storage for current sentence categorization
        sentence_categories = [[] for _ in keyword_groups]
        found_in_any_group = False
        
        for group_index, target_tokens in enumerate(target_tokens_groups):
            found = False
            for token in doc:
                for target_token in target_tokens:
                    if token.similarity(target_token) > 0.8:
                        sentence_categories[group_index].append(sentence)
                        found = True
                        found_in_any_group = True
                        break
                if found:
                    break
        
        # Only add the categorized sentences if it matches any group
        if found_in_any_group:
            combined = []
            for category in sentence_categories:
                if category:
                    combined.extend(category)
                else:
                    combined.append("")  # Append empty string if no match in this category
            filtered_data.append(combined)
    
    return filtered_data


# class to raise custom error
class InvalidInput(Exception):

    def __init__(self, message="Invalid input, restart program and re-enter"):
        self.message = message
        super().__init__(self.message)

# function to choose .csv(dataset file) via a dialog box
def choose_csv_file():
    root = tk.Tk()
    root.withdraw()  

    file_path = filedialog.askopenfilename(
        title="Select a CSV file",
        filetypes=[("CSV files", "*.csv")]
    )
    if file_path: 
        print(f"File selected: {file_path}")
    else:
        print("No file was selected.")

    return file_path

In [17]:
keywords = []
# Input number of categories for data extraction
numCategories = int(input("How many categories would you like your data extracted into? "))

# Input keyword(s) for each category
for x in range(numCategories):
    category = input("Enter keyword(s) for category "+ str(x+1) + " seperated by commas")
    category = category.split(",")
    keywords.append(category)

# Ensure user has left input empty 
if(keywords ==[] or [""] in keywords):
    raise InvalidInput()

# Select input/dataset/.csv file
selected_file = choose_csv_file()

# Create list of column headings for output .csv file
column_headings = []
for category in keywords:
    column_headings.append(category[0])

# read raw data from .csv/dataset
with open(selected_file, 'r') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  
    data = process_data_from_csv(csv_reader) 

filtered_data = filter_context_related_sentences(data,keywords)
print(filtered_data)

with open('clean_dataset.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(column_headings)  
    writer.writerows(filtered_data)

print("Data has been written to clean_dataset.csv successfully.")

File selected: /home/ahsan/Desktop/Jupyter/CoherentComposer/Data Sets/Mistral7B_CME_v1.csv
Evaluating Sentence 1
Evaluating Sentence 2
Evaluating Sentence 3
Evaluating Sentence 4
Evaluating Sentence 5
Evaluating Sentence 6
Evaluating Sentence 7
Evaluating Sentence 8
Evaluating Sentence 9


  if token.similarity(target_token) > 0.8:


Evaluating Sentence 10
Evaluating Sentence 11
Evaluating Sentence 12
Evaluating Sentence 13
Evaluating Sentence 14
Evaluating Sentence 15
Evaluating Sentence 16
Evaluating Sentence 17
Evaluating Sentence 18
Evaluating Sentence 19
Evaluating Sentence 20
Evaluating Sentence 21
Evaluating Sentence 22
Evaluating Sentence 23
Evaluating Sentence 24
Evaluating Sentence 25
Evaluating Sentence 26
Evaluating Sentence 27
Evaluating Sentence 28
Evaluating Sentence 29
Evaluating Sentence 30
Evaluating Sentence 31
Evaluating Sentence 32
Evaluating Sentence 33
Evaluating Sentence 34
Evaluating Sentence 35
Evaluating Sentence 36
Evaluating Sentence 37
Evaluating Sentence 38
Evaluating Sentence 39
Evaluating Sentence 40
Evaluating Sentence 41
Evaluating Sentence 42
Evaluating Sentence 43
Evaluating Sentence 44
Evaluating Sentence 45
Evaluating Sentence 46
Evaluating Sentence 47
Evaluating Sentence 48
Evaluating Sentence 49
Evaluating Sentence 50
Evaluating Sentence 51
Evaluating Sentence 52
Evaluating 

In [14]:
len(filtered_data[-1])

2