##  Installing Libraries and Packages

In [None]:
!pip install detoxify



In [None]:
from detoxify import Detoxify
from tqdm import tqdm
import torch
import os
import pandas as pd
import math

## Function to Import Data files in a Folder

In [None]:
def import_data(import_folder:str):
    """Walks through the import folder and yields a list of dicts for each file.
    Will only process json, csv, xlsx and txt files.

    Args:
        import_folder (str, optional): The folder holding the data. Defaults to "Import".

    Yields:
        [list]: list of dictionaries, representing each row from the file
    """
    for _, _, fnames in os.walk(import_folder):
        with tqdm(total=len([x for x in fnames if '.json' in x or '.xlsx' in x or '.csv' in x])) as pbar:
            for fname in fnames:
                pbar.set_description(fname.replace('.json',''))
                if '.json' in fname:
                    df = pd.read_json(f"{import_folder}/{fname}")
                    data = df.T.to_dict().values()
                    yield data, fname
                elif '.xlsx' in fname:
                    df = pd.read_excel(f"{import_folder}/{fname}", engine='openpyxl')
                    data = df.T.to_dict().values()
                    del df
                    yield data, fname
                elif '.csv' in fname:
                    df = pd.read_csv(f"{import_folder}/{fname}")
                    data = df.T.to_dict().values()
                    del df
                    yield data, fname
                elif '.txt' in fname:
                    print("Text FileName is ", f"{import_folder}/{fname}")
                    df = pd.read_csv(f"{import_folder}/{fname}", names=['text'], sep="\n", header=None)
                    data = df.T.to_dict().values()
                    del df
                    # print("Dictionary  ", data)
                    yield data, fname
                else:
                    print("Not supported Format ", f"{import_folder}/{fname}")
                    pass
                pbar.update(1)

## Function to Clean the Text Column - with `@` and `.com`

In [None]:
def clean_text(text:str):
    if type(text) == float:
        return None
    if text and ''.join(text.split()):
        if type(text) == bytes: #Decoding byte strings
            text = text.decode('utf-8')
        #Removing emails + ***.com urls
        text = ' '.join([item for item in text.split() if '@' not in item and '.com' not in item])
        text = ' '.join(text.split()) #removing all multiple spaces
        if text: return text
    return None


## Function to Break the List of Data with Items per Chunk

In [None]:
def chunk_lst(lst:list, items_per_chunk:int):
    """Breaks a list into chunks
    Args:
        lst ([list]): List to chunk
        items_per_chunk ([int]): Number of items per list
    Yields:
        [list]: a chunk of lst, with size 'items_per_chunk'
    """
    for i in range(0, len(lst), items_per_chunk):
        yield lst[i:i + items_per_chunk]


## Function to Calculate Detoxicity of Text

In [None]:
from detoxify import Detoxify
results = Detoxify('unbiased').predict(['Pass it on.','Please do not skip any regulatory steps or licensing requirements'])
print("results Dictionary ", results)
print("results only Toxicity ", results.get('toxicity'))

results Dictionary  {'toxicity': [0.012605313211679459, 0.00037768384208902717], 'severe_toxicity': [7.25553763913922e-06, 1.300658027503232e-06], 'obscene': [0.0002587903290987015, 2.0923429474350996e-05], 'identity_attack': [0.0004194467328488827, 7.918340998003259e-05], 'insult': [0.0007003389182500541, 0.00010092252341564745], 'threat': [0.003826275235041976, 1.8575134163256735e-05], 'sexual_explicit': [6.774796202080324e-05, 1.2177173630334437e-05]}
results only Toxicity  [0.012605313211679459, 0.00037768384208902717]


In [None]:
def get_scores(text, model):
    with torch.no_grad():
        print("Text Size - ", len(text))
        if type(text) == str:
            result = model.predict(text)
            return result.get('toxicity')
        elif type(text) == list:
            results = []
            chunk_size = 100
            for l in tqdm(chunk_lst(text, chunk_size), desc="Processing Scores", total=math.ceil(len(text)/chunk_size)):
                un_formated_results = model.predict(l)
                print("un_formated_results Dictionary ", un_formated_results)
                results += un_formated_results.get('toxicity')
        return results


## Function to Process data and write the Results. 

In [None]:
def process_data(data:list, text_columns:list, model:Detoxify) ->list:
    """Computes the toxicity score for a batch of data.
    The toxicity score will be added into each dictionary as 'text_column + "_toxicity"'

    Args:
        data (list): list of dictionaries
        text_column (list): the id of the text column you want to compute
        model (Detoxify): the detoxify model you wish to use

    Returns:
        list: list of dictionaries
    """
    #Cleaning text
    for text_column in text_columns:
        texts = []
        for row in data:
            cleaned_text = clean_text(row[text_column])
            text =  cleaned_text if cleaned_text else ''
            texts.append(text)
        results = get_scores(texts, model)
        print("results length: ", len(results))
    return results

## Input Parameters for the SimpleToxicity Calcultor 

In [None]:
model = Detoxify("unbiased", device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")) #device='cude'
process_sentiment = True
#Set output format
select_output_format = 3
output_formats = ['json', 'csv', 'xlsx']
#Text column name
text_columns = ["text"]

## Run the code 
1.   Input - import_folder -  The folder holding the data. 
2.   Output - filename_toxicity.csv 



In [46]:
def process_file_toxicity(folder_path: str):
  for data, fname in import_data(folder_path):
      updated_data = process_data(data, text_columns, model)
      print("List Dictionary ", updated_data, fname)
      df = pd.DataFrame(updated_data)
      # Write toxicity score to csv file appending filename + '_toxicity'
      save_fname = fname.split('.')[0]
      df.to_csv(f"{folder_path}/{save_fname}_toxicity.csv")


process_file_toxicity("sample_data/sample")

sample_text.txt: : 0it [00:00, ?it/s]

Text FileName is  sample_data/sample/sample_text.txt
Text Size -  650



Processing Scores:   0%|          | 0/7 [00:00<?, ?it/s][A
Processing Scores:  14%|█▍        | 1/7 [00:38<03:52, 38.79s/it][A

un_formated_results Dictionary  {'toxicity': [0.006176656112074852, 0.09421318024396896, 0.9876813888549805, 0.5534142851829529, 0.0010689786868169904, 0.0006262899842113256, 0.04457143694162369, 0.0019281518179923296, 0.0009448841447010636, 0.003355340799316764, 0.003069218248128891, 0.005304948892444372, 0.9554316997528076, 0.03878629952669144, 0.0016195171047002077, 0.9327911734580994, 0.006107184570282698, 0.005304201040416956, 0.004267391283065081, 0.07490035146474838, 0.021986693143844604, 0.00469698291271925, 0.9801448583602905, 0.7687169313430786, 0.0019492259016260505, 0.07617585361003876, 0.00048348598647862673, 0.000625847838819027, 0.0015811769990250468, 0.0006873845122754574, 0.0007043851655907929, 0.0007474974263459444, 0.0005855009076185524, 0.0011946685845032334, 0.0017894640332087874, 0.0006001638830639422, 0.019388563930988312, 0.6862991452217102, 0.3684613108634949, 0.0012777973897755146, 0.000641746271867305, 0.0007797820726409554, 0.0005577397532761097, 0.891021549


Processing Scores:  29%|██▊       | 2/7 [01:21<03:24, 40.84s/it][A

un_formated_results Dictionary  {'toxicity': [0.0010568317957222462, 0.0010491267312318087, 0.027461417019367218, 0.0016222663689404726, 0.0021731641609221697, 0.0008404463296756148, 0.0009201957145705819, 0.0009201957145705819, 0.001914067193865776, 0.0003736741200555116, 0.006001570262014866, 0.0024392458144575357, 0.0018137159058824182, 0.0004493366286624223, 0.04778405651450157, 0.20412519574165344, 0.0009302176767960191, 0.08664850145578384, 0.9738945960998535, 0.0016029648249968886, 0.0008165144245140254, 0.6994858384132385, 0.0009009598288685083, 0.006407991051673889, 0.0011705379001796246, 0.0005598582210950553, 0.004318682476878166, 0.009050820022821426, 0.010316797532141209, 0.48252108693122864, 0.011582940816879272, 0.43908342719078064, 0.6685967445373535, 0.004297694191336632, 0.0026444534305483103, 0.0005617372808046639, 0.001122973277233541, 0.0008233119733631611, 0.0005721118068322539, 0.0008375814068131149, 0.00452417554333806, 0.004551885649561882, 0.004326388705521822


Processing Scores:  43%|████▎     | 3/7 [02:00<02:41, 40.29s/it][A

un_formated_results Dictionary  {'toxicity': [0.0420452281832695, 0.014418204315006733, 0.0017877162899821997, 0.08564719557762146, 0.0008087638998404145, 0.9648077487945557, 0.0007242288556881249, 0.26829850673675537, 0.009068709798157215, 0.0007344034384004772, 0.0010666664456948638, 0.0010562470415607095, 0.9873326420783997, 0.9830606579780579, 0.009608389809727669, 0.010953138582408428, 0.00044114841148257256, 0.00040187948616221547, 0.010975496843457222, 0.05273199826478958, 0.0010036444291472435, 0.004102607257664204, 0.0010486120590940118, 0.011657201685011387, 0.010518901981413364, 0.0011043872218579054, 0.001454137614928186, 0.0014720605686306953, 0.03146684169769287, 0.04468024894595146, 0.004004013258963823, 0.001992173958569765, 0.0033723851665854454, 0.0037714787758886814, 0.0004760446317959577, 0.00045478733954951167, 0.0006555419531650841, 0.005441403482109308, 0.0010906049283221364, 0.004734528250992298, 0.0004311962111387402, 0.0009757101652212441, 0.003684896742925048


Processing Scores:  57%|█████▋    | 4/7 [02:33<01:51, 37.28s/it][A

un_formated_results Dictionary  {'toxicity': [0.004018821753561497, 0.944233775138855, 0.0013218875974416733, 0.0007771027158014476, 0.9292265772819519, 0.0007077670888975263, 0.0007987204589881003, 0.018700607120990753, 0.8862625360488892, 0.0065567223355174065, 0.0011747326934710145, 0.0011877053184434772, 0.0006243227398954332, 0.006773699074983597, 0.010430989786982536, 0.0008777107577770948, 0.20895689725875854, 0.0008981904829852283, 0.0007750256918370724, 0.3262259066104889, 0.001354553154669702, 0.0029736014548689127, 0.0011608629720285535, 0.0077625117264688015, 0.3534761369228363, 0.0020531504414975643, 0.0015024722088128328, 0.0011580195277929306, 0.010234169661998749, 0.02123863808810711, 0.0013220588443800807, 0.0036354530602693558, 0.0026666740886867046, 0.002361973049119115, 0.23264843225479126, 0.004727765917778015, 0.042401961982250214, 0.11837947368621826, 0.002170204883441329, 0.01528274267911911, 0.006383214145898819, 0.002854942576959729, 0.01006135530769825, 0.016


Processing Scores:  71%|███████▏  | 5/7 [03:39<01:35, 47.55s/it][A

un_formated_results Dictionary  {'toxicity': [0.00264122080989182, 0.0017241511959582567, 0.001609350205399096, 0.0006924783228896558, 0.003010596614331007, 0.0006255663465708494, 0.000786977238021791, 0.0032901151571422815, 0.005383170675486326, 0.0007199427927844226, 0.0015725619159638882, 0.08902355283498764, 0.024215789511799812, 0.00732064712792635, 0.008106092922389507, 0.0006573318969458342, 0.0013945158571004868, 0.19483131170272827, 0.0012334877392277122, 0.054241009056568146, 0.002164045814424753, 0.006075454410165548, 0.3527185022830963, 0.036945559084415436, 0.11847197264432907, 0.0005703667993657291, 0.001483110710978508, 0.000790897523984313, 0.0004297071136534214, 0.04432373121380806, 0.0019082104554399848, 0.013834069482982159, 0.000491725339088589, 0.009566977620124817, 0.0007374932174570858, 0.0009596024174243212, 0.0007870803237892687, 0.44576627016067505, 0.06259220838546753, 0.0015287887072190642, 0.006447039544582367, 0.008579326793551445, 0.021411728113889694, 0.


Processing Scores:  86%|████████▌ | 6/7 [04:10<00:42, 42.06s/it][A

un_formated_results Dictionary  {'toxicity': [0.001334231928922236, 0.0008730939007364213, 0.020565146580338478, 0.04532483220100403, 0.030708536505699158, 0.001577033195644617, 0.0022877489682286978, 0.002332555828616023, 0.26881206035614014, 0.0022074023727327585, 0.0016210675239562988, 0.0018955591367557645, 0.4145672917366028, 0.06701108068227768, 0.011453869752585888, 0.0012598454486578703, 0.07570558786392212, 0.000810476893093437, 0.0007216408266685903, 0.04786790907382965, 0.00143322569783777, 0.4542599618434906, 0.9790076017379761, 0.007183115929365158, 0.000853683624882251, 0.01259012334048748, 0.015126976184546947, 0.0030668466351926327, 0.0016547981649637222, 0.001494147814810276, 0.01738060638308525, 0.10246945917606354, 0.002776416717097163, 0.021237554028630257, 0.029860222712159157, 0.009068709798157215, 0.013326013460755348, 0.0007603423437103629, 0.975531280040741, 0.002516945358365774, 0.9918158650398254, 0.0013118614442646503, 0.016999734565615654, 0.001130874850787


Processing Scores: 100%|██████████| 7/7 [04:29<00:00, 38.54s/it]
sample_text.txt: : 1it [04:29, 269.92s/it]

un_formated_results Dictionary  {'toxicity': [0.011766381561756134, 0.014106383547186852, 0.1688280552625656, 0.0015720183728262782, 0.004991237074136734, 0.001921459217555821, 0.004265221767127514, 0.007135974708944559, 0.2867707312107086, 0.007375153712928295, 0.17124047875404358, 0.04990343004465103, 0.04751679673790932, 0.0025050987023860216, 0.9743623733520508, 0.003568044863641262, 0.000446880905656144, 0.47795984148979187, 0.014500615186989307, 0.01953447051346302, 0.0022588518913835287, 0.00394114013761282, 0.01094344723969698, 0.0013423266354948282, 0.0034708271268755198, 0.854538083076477, 0.015490734949707985, 0.07870922982692719, 0.00436824094504118, 0.9672456383705139, 0.000808783108368516, 0.06233867257833481, 0.005240981467068195, 0.0029085250571370125, 0.0004279481072444469, 0.0011158745037391782, 0.04163290187716484, 0.001471393508836627, 0.4422164857387543, 0.004973712842911482, 0.0006931840907782316, 0.014394182711839676, 0.1561122089624405, 0.4194512367248535, 0.315




## Save toxicity Values to CSV files

In [None]:
save_fname = fname.split('.')[0]
df.to_csv(f"{save_fname}_toxicity.csv")

In [None]:
## cleantext() 
len(updated_data)

650