In [1]:
# Extend the module search path with a Python 3.11 framework site-packages
# directory so packages installed there can be imported by later cells.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory -- confirm before moving this file.
import sys
sys.path.append("../../Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages")

In [10]:
class Dataset:
    '''
    Wraps the Python Dictionary built-in class with some extra functions
    -Keeps an independent snapshot of the original dataset for reverting
    -Saves a root directory for the dataset for re-use
    -Unique file name saving per dataset
    Dictionary format
    {
        Unique_ID: [TEXT1, TEXT2, ETC],
        Unique_ID: [TEXT1, TEXT2, ETC],
        Unique_ID: [ETC]
    }
    '''
    _data : dict
    _original_data : dict
    _root_dir = "."
    # List of strings to add to file name
    _mutations: list

    def __init__(self, data: dict, mutations: list):
        '''
        data: the working dictionary (kept by reference, edited in place).
        mutations: names of mutations already applied; copied so the
        caller's list is not appended to behind its back.
        '''
        import copy

        self._data = data
        # Deep copy: storing the same object as the snapshot would make every
        # later edit show up in the "original" too, so reverting could never work.
        self._original_data = copy.deepcopy(data)
        self._mutations = list(mutations)

    def resetData(self):
        '''Restore the data captured at construction and forget all mutations.'''
        import copy

        # Hand out a fresh copy so the snapshot itself stays pristine and the
        # dataset can be reset again after further edits.
        self._data = copy.deepcopy(self._original_data)
        self._mutations = []

    def saveDataMutation(self, mutation_append: str):
        '''Record the name of a mutation; it becomes part of the saved file name.'''
        self._mutations.append(mutation_append)

    def _determineFileName(self) -> str:
        '''
        Build "<mutation names joined>.json" inside the root directory,
        appending 1, 2, ... to the base name while the name is already taken.
        '''
        import os

        base_name = "".join(self._mutations)
        taken = set(os.listdir(self._root_dir))
        file_name = base_name + ".json"
        for i in range(1, 10_000):
            if file_name not in taken:
                break
            file_name = base_name + str(i) + ".json"
        # os.path.join inserts the separator; plain concatenation turned
        # "." + "x.json" into the hidden file ".x.json".
        return os.path.join(self._root_dir, file_name)

    def setRootDir(self, root: str):
        '''Set the directory that saveToFile writes into.'''
        self._root_dir = root

    def saveToFile(self):
        '''Dump the current data as JSON to a not-yet-used file name in the root directory.'''
        import json

        with open(self._determineFileName(), "w") as outfile:
            json.dump(self._data, outfile)

    #Behave as a dictionary
    def keys(self):
        '''Return the keys view of the wrapped dictionary.'''
        return self._data.keys()

    def values(self):
        '''Return the values view of the wrapped dictionary.'''
        return self._data.values()

    def items(self):
        '''Return the items view of the wrapped dictionary.'''
        return self._data.items()

    #Operator overloads
    def __str__(self):
        return str(self._data)

    def __setitem__(self, key, value):
        self._data[key] = value

    def __getitem__(self, key):
        return self._data[key]

In [11]:

def replaceSubString(dataset : Dataset, substring, replacement, count=-1):
    '''
    Run str.replace(substring, replacement, count) over every text of every
    entry and store the resulting list back under the same key.
    With the default count of -1 str.replace substitutes every occurrence.
    '''
    for uid, entries in dataset.items():
        dataset[uid] = [
            entry.replace(substring, replacement, count) for entry in entries
        ]
        
import re
def replaceWords(dataset : Dataset, word_list : dict, count=1):
    '''
    Apply the pattern -> replacement pairs of word_list to every text.

    Each text first loses the characters [ ] ' " and is then rewritten with
    re.subn, one dictionary entry after another, until at most `count`
    substitutions have been made in that text. Keys are used as regular
    expression patterns and values as replacement templates.

    count: maximum number of substitutions per text; a value of 0 or less
    leaves the words untouched (only the character stripping happens).

    The rewritten texts are stored back into dataset under the same key.
    '''
    strip_table = str.maketrans("", "", "[]'\"")
    for key, texts in dataset.items():
        textsBuffer = []
        for text in texts:
            text = text.translate(strip_table)
            remaining = int(count)
            for word, replacement in word_list.items():
                # Check the budget BEFORE substituting: re.subn reads a count
                # of 0 as "replace everything", so an exhausted budget must
                # never reach it.
                if remaining <= 0:
                    break
                text, num_replaced = re.subn(word, replacement, text, count=remaining)
                remaining -= num_replaced
            textsBuffer.append(text)
        dataset[key] = textsBuffer

import random
def replaceWordsByDoubleList(dataset : Dataset, word_list : list, count=1):
    '''
    Swap words of each text that appear in word_list for a random member of
    the same list.

    Each text first loses the characters [ ] ' " and is split on single
    spaces. Every token found in word_list is replaced by random.choice(word_list)
    (which may pick the same word again) until at most `count` substitutions
    have been made in that text. A count of 0 or less replaces nothing.

    The rewritten texts are stored back into dataset under the same key.
    '''
    strip_table = str.maketrans("", "", "[]'\"")
    candidates = set(word_list)  # O(1) membership instead of scanning the list per token
    for key, texts in dataset.items():
        textsBuffer = []
        for text in texts:
            text = text.translate(strip_table)
            remaining = int(count)
            for word in text.split(" "):
                # re.subn treats a count of 0 as "replace all", so stop first.
                if remaining <= 0:
                    break
                if word not in candidates:
                    continue
                substitute = random.choice(word_list)
                # Escape the token (it is plain text, not a pattern) and only
                # match it as a whole space-delimited token, so "he" does not
                # rewrite the inside of "the".
                pattern = r"(?<![^ ])" + re.escape(word) + r"(?![^ ])"
                # A callable replacement keeps backslashes in the substitute literal.
                text, num_replaced = re.subn(
                    pattern, lambda _match: substitute, text, count=remaining
                )
                remaining -= num_replaced
            textsBuffer.append(text)
        dataset[key] = textsBuffer

import requests, json
def getSynonymAPI(word) -> str:
    '''
    Return the first synonym of word, or word itself when none is known.

    Looks the word up in ./mutation_data/synonyms.json first; on a miss it
    queries the WordsAPI synonyms endpoint, stores the returned list in that
    file, and answers from the fresh result. The cache file must already
    exist and contain a JSON object.
    '''
    from urllib.parse import quote

    with open("./mutation_data/synonyms.json", "r+") as f:
        local_synonyms = json.load(f)
        if word in local_synonyms:
            cached = local_synonyms[word]
            if len(cached) == 0:
                return word
            return cached[0]

        # Percent-encode the word so characters such as "/" or "?" cannot
        # change the request path.
        url = f"https://wordsapiv1.p.rapidapi.com/words/{quote(str(word), safe='')}/synonyms"

        # NOTE(review): the API key is committed in source; consider moving it
        # to configuration and rotating it.
        headers = {
            "X-RapidAPI-Key": "41b9c2ee17msh295225a18398362p1c732cjsn4afb01c0b61f",
            "X-RapidAPI-Host": "wordsapiv1.p.rapidapi.com"
        }

        response = requests.request("GET", url, headers=headers)
        payload = response.json()
        # A reply without a "synonyms" entry is cached as "no synonyms".
        synonyms = payload.get("synonyms", []) if isinstance(payload, dict) else []
        local_synonyms[word] = synonyms
        # Rewrite the cache in place: rewind, write, then cut off leftovers.
        f.seek(0)
        f.write(json.dumps(local_synonyms))
        f.truncate()
        if len(synonyms) == 0:
            return word
        # Return one synonym (a str), matching the cached path above; the
        # whole list used to leak out here.
        return synonyms[0]

def getAntonymAPI(word) -> str:
    '''
    Return the first antonym of word, or word itself when none is known.

    Answers from ./mutation_data/antonyms.json when the word is cached there;
    otherwise asks the WordsAPI antonyms endpoint, writes the returned list
    into the cache file and answers from it.
    '''
    request_headers = {
        "X-RapidAPI-Key": "41b9c2ee17msh295225a18398362p1c732cjsn4afb01c0b61f",
        "X-RapidAPI-Host": "wordsapiv1.p.rapidapi.com"
    }
    with open("./mutation_data/antonyms.json", "r+") as cache_file:
        cache = json.load(cache_file)

        # Cache hit: an empty list means "looked up before, nothing found".
        if word in cache:
            hit = cache[word]
            return word if len(hit) == 0 else hit[0]

        endpoint = f"https://wordsapiv1.p.rapidapi.com/words/{word}/antonyms"
        reply = requests.request("GET", endpoint, headers=request_headers)
        try:
            found = reply.json()['antonyms']
        except KeyError:
            found = []

        # Persist the result by overwriting the file contents in place.
        cache[word] = found
        cache_file.seek(0)
        cache_file.write(json.dumps(cache))
        cache_file.truncate()

        return word if len(found) == 0 else found[0]



#FromAPI
def getRandomWordAPI() -> str:
    '''
    Request the WordsAPI words endpoint with random=true and return the
    "word" field of the JSON reply.
    '''
    reply = requests.request(
        "GET",
        "https://wordsapiv1.p.rapidapi.com/words/",
        headers={
            "X-RapidAPI-Key": "41b9c2ee17msh295225a18398362p1c732cjsn4afb01c0b61f",
            "X-RapidAPI-Host": "wordsapiv1.p.rapidapi.com",
        },
        params={"random": "true"},
    )
    payload = reply.json()
    return payload["word"]

#From json files
import random
def getRandomWordJSON() -> str:
    '''Return a random element of the "word" list in ./mutation_data/random_word.json.'''
    with open("./mutation_data/random_word.json", "r") as handle:
        contents = dict(json.load(handle))
    pool = contents['word']
    return random.choice(pool)

def getRandomVerbJSON() -> str:
    '''Return a random element of the "verb" list in ./mutation_data/random_verbs.json.'''
    with open("./mutation_data/random_verbs.json", "r") as handle:
        contents = dict(json.load(handle))
    pool = contents['verb']
    return random.choice(pool)


def getRandomAdverbJSON() -> str:
    '''Return a random element of the "adverb" list in ./mutation_data/random_adverbs.json.'''
    with open("./mutation_data/random_adverbs.json", "r") as handle:
        contents = dict(json.load(handle))
    pool = contents['adverb']
    return random.choice(pool)

def getRandomAdjectiveJSON() -> str:
    '''Return a random element of the "adjective" list in ./mutation_data/random_adjectivea.json.'''
    # NOTE(review): "random_adjectivea.json" looks like a typo for
    # "random_adjectives.json" -- confirm against the files on disk.
    with open("./mutation_data/random_adjectivea.json", "r") as handle:
        contents = dict(json.load(handle))
    pool = contents['adjective']
    return random.choice(pool)

#TODO: Might be too large a set of misspellings
def getMisspellListJSON() -> dict:
    '''
    Load ./mutation_data/misspellings.json and return a dictionary mapping
    each word that has at least one listed misspelling to one of them,
    picked at random. Words with an empty list are left out.
    '''
    with open("./mutation_data/misspellings.json", "r") as handle:
        table = dict(json.load(handle))
    return {
        correct: random.choice(variants)
        for correct, variants in table.items()
        if len(variants) > 0
    }

def populateRandomWord(count = 1):
    '''
    Fetch `count` words with getRandomWordAPI and add the ones not yet
    present to the "word" list of ./mutation_data/random_word.json.
    If the file's JSON has no "word" key its content is replaced by
    {"word": []} before the new words are added.
    '''
    with open("./mutation_data/random_word.json", "r+") as store:
        stored = json.load(store)
        if "word" not in stored:
            stored = {"word" : []}
        known = stored["word"]
        for _ in range(count):
            candidate = getRandomWordAPI()
            # Duplicates are dropped, so fewer than `count` words may be added.
            if candidate not in known:
                known.append(candidate)
        # Overwrite the file in place with the updated list.
        store.seek(0)
        store.write(json.dumps(stored))
        store.truncate()
    

ModuleNotFoundError: No module named 'requests'

In [None]:

def replaceFromDictionary(dataset : Dataset, word_list : dict, mutation="misspell", word_change_limit=1):
    '''
    Hand word_list to replaceWords (word_change_limit is passed as its count)
    and then record `mutation` on the dataset via saveDataMutation.
    '''
    replaceWords(dataset, word_list, count=word_change_limit)
    dataset.saveDataMutation(mutation)
def replaceArticles(dataset : Dataset, articles : dict, mutation="articleSub", word_change_limit=1):
    '''
    Takes an article mapping and substitutes it into the dataset texts.

    The `articles` dictionary is handed to replaceWords (word_change_limit is
    passed as its count); afterwards `mutation` is recorded on the dataset.
    NOTE(review): the original note says articles are given with spaces
    around them -- confirm with the callers that build the mapping.
    '''
    replaceWords(dataset, articles, count=word_change_limit)
    dataset.saveDataMutation(mutation)

def replaceLetters(dataset : Dataset, articles : dict, mutation="letterReplace", word_change_limit=1):
    '''
    Hand the `articles` mapping to replaceWords (word_change_limit is passed
    as its count) and then record `mutation` on the dataset.
    '''
    replaceWords(dataset, articles, count=word_change_limit)
    dataset.saveDataMutation(mutation)

def replaceSynonyms(dataset : Dataset, words_to_replace : list, mutation="synonymSub", word_change_limit=1):
    '''
    Look up every word of words_to_replace with getSynonymAPI, hand the
    resulting word -> lookup-result mapping to replaceWords, and record
    `mutation` on the dataset.
    '''
    synonym_map = {target: getSynonymAPI(target) for target in words_to_replace}
    replaceWords(dataset, synonym_map, count=word_change_limit)
    dataset.saveDataMutation(mutation)

def replaceInTextsRandomSynonymAPI(dataset : Dataset, mutation="randSynonym", word_change_limit=1):
    '''
    Build a word -> synonym mapping from the words found in the dataset's own
    texts (one getSynonymAPI lookup per distinct word), hand it to
    replaceWords with word_change_limit as the count, and record `mutation`.
    '''
    # Same characters replaceWords strips before matching, so the tokens
    # collected here agree with the text it will search.
    strip_table = str.maketrans("", "", "[]'\"")
    word_list = {}
    seen = set()
    for _key, texts in dataset.items():
        # Each entry is a list of texts; split the individual strings
        # (calling .split on the list itself raised AttributeError).
        for text in texts:
            for word in text.translate(strip_table).split(" "):
                if not word or word in seen:
                    continue  # skip empty tokens and repeated lookups
                seen.add(word)
                # replaceWords uses the keys as regex patterns, so escape the
                # raw token to match it literally.
                word_list[re.escape(word)] = getSynonymAPI(word)
    replaceWords(dataset, word_list, word_change_limit)
    dataset.saveDataMutation(mutation)

def replaceInTextsAntonymAPI(dataset : Dataset, words_to_replace : list, mutation="antonym", word_change_limit=1):
    '''
    Look up every word of words_to_replace with getAntonymAPI, hand the
    resulting word -> antonym mapping to replaceWords, and record `mutation`
    on the dataset.
    '''
    antonym_map = {target: getAntonymAPI(target) for target in words_to_replace}
    replaceWords(dataset, antonym_map, count=word_change_limit)
    dataset.saveDataMutation(mutation)

def replaceInTextsRandomAntonymAPI(dataset : Dataset, mutation="randAntonym", word_change_limit=1):
    '''
    Build a word -> antonym mapping from the words found in the dataset's own
    texts (one getAntonymAPI lookup per distinct word), hand it to
    replaceWords with word_change_limit as the count, and record `mutation`.
    '''
    # Same characters replaceWords strips before matching, so the tokens
    # collected here agree with the text it will search.
    strip_table = str.maketrans("", "", "[]'\"")
    word_list = {}
    seen = set()
    for _key, texts in dataset.items():
        # Each entry is a list of texts; split the individual strings
        # (calling .split on the list itself raised AttributeError).
        for text in texts:
            for word in text.translate(strip_table).split(" "):
                if not word or word in seen:
                    continue  # skip empty tokens and repeated lookups
                seen.add(word)
                # replaceWords uses the keys as regex patterns, so escape the
                # raw token to match it literally.
                word_list[re.escape(word)] = getAntonymAPI(word)
    replaceWords(dataset, word_list, word_change_limit)
    dataset.saveDataMutation(mutation)

def replaceInTextsRandomAdjective(dataset : Dataset, word_change_limit=1):
    '''
    Replaces an adjective with another.

    Placeholder: the body is empty, so the dataset is left untouched and
    None is returned.
    '''
    return None

def replaceInTextsRandomVerb(dataset : Dataset, word_change_limit=1):
    '''
    Replaces a verb with another.

    Placeholder: the body is empty, so the dataset is left untouched and
    None is returned.
    '''
    return None

def removeStartingArticles(dataset : Dataset):
    '''
    Strip the characters [ ] ' " from every text and drop its first
    space-separated word when that word is "a", "an" or "the" (any case).
    The cleaned texts are stored back under the same key.
    '''
    leading_articles = ("a", "the", "an")
    strip_table = str.maketrans("", "", "[]'\"")
    for uid, entries in dataset.items():
        cleaned = []
        for entry in entries:
            # split(" ") always yields at least one token, so `first` exists.
            first, *rest = entry.translate(strip_table).split(" ")
            kept = rest if first.lower() in leading_articles else [first, *rest]
            cleaned.append(' '.join(kept))
        dataset[uid] = cleaned

def deleteRandomArticle(dataset : Dataset, articles : list, mutation="delArticles", word_change_limit=1):
    '''
    Map every entry of `articles` to a single space, hand that mapping to
    replaceWords (word_change_limit is passed as its count), then run
    removeStartingArticles and record `mutation` on the dataset.
    '''
    blanks = dict.fromkeys(articles, " ")
    replaceWords(dataset, blanks, count=word_change_limit)
    removeStartingArticles(dataset)
    dataset.saveDataMutation(mutation)

def replaceWordListWithRandomSelf(dataset : Dataset, random_words : list, mutation="randWord", word_change_limit=1):
    '''
    Hand random_words to replaceWordsByDoubleList (word_change_limit is
    passed as its count) and then record `mutation` on the dataset.
    '''
    replaceWordsByDoubleList(dataset, random_words, count=word_change_limit)
    dataset.saveDataMutation(mutation)