# Chapter 43 - AutoML NLP
## Building Machine Learning and Deep Learning Models on Google Cloud Platform
### Ekaba Bisong

In [0]:
import numpy as np
import pandas as pd
import re
import pathlib
import os

In [0]:
# Load the labelled training comments from the local data folder.
data = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [0]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [0]:
# add clean column label
# A comment is "clean" (1) when none of the six label columns is set, else 0.
# The label columns are selected by name rather than by position so that
# re-running this cell does not fold the freshly added 'clean' column into the
# sum (which would flip every clean row to 0 on the second run).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
data['clean'] = (data[label_columns].sum(axis=1) == 0).astype(int)

In [0]:
# Collapse the problem to two classes: every comment that is not clean
# is relabelled as toxic.
data.loc[data['clean'] == 0, 'toxic'] = 1

In [0]:
# select dataframe of clean examples
# random_state pins the random draw so a fresh "Restart & Run All" picks the
# same 20,000 clean rows every time.
data_clean = data[data['clean'] == 1].sample(n=20000, random_state=42)

In [0]:
# select dataframe of toxic examples
# random_state pins the random draw so a fresh "Restart & Run All" picks the
# same 16,000 toxic rows every time.
data_toxic = data[data['toxic'] == 1].sample(n=16000, random_state=42)

In [0]:
# Stack the two samples (clean rows first, then toxic rows) into a single
# frame. The original row index is kept, not renumbered.
data = pd.concat(objs=(data_clean, data_toxic))

In [0]:
# remove unused columns
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell is a no-op rather than a KeyError.
data = data.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [0]:
# data.head(30)

In [0]:
# Write every comment to ./file/<class>/text_<index>.txt, where <class> is
# 'toxic' when the row's toxic flag is 1 and 'clean' otherwise.

# Create both class folders once up front instead of once per row.
for class_name in ('toxic', 'clean'):
    pathlib.Path("./file/{}".format(class_name)).mkdir(parents=True, exist_ok=True)

# Iterate over just the two needed columns; this avoids building a Series
# object per row the way iterrows() does.
for index, raw_comment, toxic_flag in zip(data.index, data['comment_text'], data['toxic']):
    # Remove every character that is neither a word character nor whitespace,
    # then trim both ends (a single strip() covers leading and trailing).
    comment_text = re.sub(r'[^\w\s]', '', raw_comment).strip()
    classes = 'toxic' if toxic_flag == 1 else 'clean'

    # Explicit UTF-8: \w keeps non-ASCII letters, which the platform's default
    # encoding is not guaranteed to be able to write.
    with open("./file/{}/text_{}.txt".format(classes, index), "w", encoding="utf-8") as text_file:
        text_file.write(comment_text)

In [0]:
# Local root folder that is scanned for exported text files, and the
# (initially empty) list that collects one entry per file found.
directory = 'file/'
data_path = list()

In [0]:
# create data csv
# Start from an empty list so that re-running this cell does not append every
# file a second time.
data_path = []
for subdir, dirs, files in os.walk(directory):
    # Sorting in place fixes the traversal order, which os.walk otherwise
    # leaves unspecified; this keeps the CSV row order stable between runs.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".txt"):
            continue

        # Cloud Storage object names are '/'-separated, so the path is built
        # with '/' rather than os.sep (which is '\\' on Windows). The rstrip
        # avoids a doubled slash when subdir itself ends with '/'.
        filepath = '{}/{}'.format(subdir.replace(os.sep, '/').rstrip('/'), file)

        # Each entry is [gs:// URI, label]; the label is the name of the
        # folder that contains the file.
        entry = ['{}/{}'.format('gs://quantum-ally-219323-lcm', filepath), os.path.basename(subdir)]
        data_path.append(entry)

In [0]:
# convert to Pandas DataFrame
# Build the frame directly from the list of [uri, label] entries. The NumPy
# round-trip is unnecessary and turns an empty list into a 1-D array, which
# pandas reads as a single empty column.
data_pd = pd.DataFrame(data_path)

In [0]:
# Write the entries to data.csv as bare rows: no header line and no
# index column.
data_pd.to_csv(path_or_buf="data.csv", header=False, index=False)