In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Data Cleaning and Formatting for BERT

BERT has several requirements for its input.  


3 .tsv files are required.
- train.tsv (no header)
- dev.tsv (evaluation, no header)
- test.tsv (header is required)  

For *train.tsv* and *dev.tsv*, 4 columns must exist
1. Column 1: The guid. Any value that is observation unique
2. Column 2: The label. This is a string, but numbers can be used as well
3. Column 3: Untokenized text of second sequence if doing sequence pair tasks (optional; this notebook fills it with the constant placeholder `'a'`)
4. Column 4: Untokenized text of first sequence (required)

In [2]:
# Function to clean html tags
def soup(text):
    """Parse `text` with BeautifulSoup's 'html.parser' and return only its text content."""
    parsed = BeautifulSoup(text, 'html.parser')
    return parsed.get_text()

In [3]:
def clean_dataset(filename, frac=0.1, random_state=7):
    """Load a tab-separated file, sample it, and strip HTML from the post text.

    Params: filename, path (string) to a tab-separated file that has at least
            the columns 'id', 'subreddit', 'title' and 'selftext'
        frac, fraction of rows to keep in the sample (default 0.1)
        random_state, seed used for the row sample (default 7)
    Output: df
        A dataframe with the columns 'id', 'subreddit' and 'clean_text',
        sorted by its original index
    """
    # Import the dataset
    df_full = pd.read_csv(filename, sep='\t')
    print("Original Size:", df_full.shape)

    # Sample first so the string work below only touches the rows we keep.
    # The sampled rows depend only on the frame length and the seed, so this
    # selects the same rows as sampling after the concatenation would.
    sampled = df_full.sample(frac=frac, random_state=random_state)

    # Concatenate title to post body text. Missing values become "" because
    # str + NaN is NaN in pandas, which would discard the title and hand a
    # float (instead of a string) to the HTML cleaner.
    sampled = sampled.assign(
        full_text=sampled["title"].fillna("") + " " + sampled["selftext"].fillna("")
    )
    print("Sampled Size:", sampled.shape)

    # Clean web tags and artifacts
    print("Cleaning text...")
    sampled = sampled.assign(clean_text=sampled["full_text"].apply(soup))

    # Drop unused columns, sort by index to prepare for encode
    return sampled[['id', 'subreddit', 'clean_text']].sort_index()

In [4]:
def encode_labels(column):
    """Integer-encode target labels and build a lookup of the encoding.

    Param: Dataframe column holding the raw labels
        Example: df['column']
    Outputs: labels_encoded, name_mapping
        labels_encoded is the encoded version of `column`, one code per row,
            which can be appended to the dataframe
        name_mapping is a dictionary whose keys are the original labels and
            whose values are the codes the encoder assigned to them
    """
    print("Encoding Labels...")
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(column)

    # Pair every known class with the code the fitted encoder gives it
    classes = label_encoder.classes_
    codes = label_encoder.transform(classes)
    name_mapping = {name: code for name, code in zip(classes, codes)}

    return labels_encoded, name_mapping

In [5]:
def new_datasets(df, labels_encoded, random_state=7):
    """Applies label encodings and returns 3 datasets formatted for BERT

    Params: df, the dataframe to combine with the label encodings; must have
            the columns 'id' and 'clean_text'. It is not modified.
        labels_encoded, the encodings, one per row of df
        random_state, seed for both splits so the output is reproducible
            (default 7)
    Output: train, dev, test
        Three dataframes. train and dev have the columns
        'id', 'labels', 'SPACE', 'clean_text'. test has only the id and the
        text, renamed to 'guid' and 'text'.
    """
    # assign() returns a new frame, so the caller's dataframe does not
    # silently gain the 'labels' and 'SPACE' columns.
    # 'SPACE' is a constant filler for the third column.
    formatted = df.assign(labels=labels_encoded, SPACE='a')
    formatted = formatted[['id', 'labels', 'SPACE', 'clean_text']]

    # Need a train, validate, test split for BERT.
    # dev is 15% of all rows; test is 15% of what remains after dev is removed.
    train, dev = train_test_split(
        formatted,
        test_size=0.15,
        stratify=formatted["labels"],
        random_state=random_state,
    )
    train, test = train_test_split(
        train,
        test_size=0.15,
        stratify=train["labels"],
        random_state=random_state,
    )

    # Fix test: keep only id and text, under the names 'guid' and 'text'
    test = test[['id', 'clean_text']].rename(
        columns={"id": "guid", "clean_text": "text"}
    )

    print("Datasets Created.")

    return train, dev, test

In [6]:
def main(input_file='reddit_posts.tsv', output_dir='dataset'):
    """Run the full pipeline: clean, encode labels, split, and write .tsv files.

    Params: input_file, path to the tab-separated source file
            (default 'reddit_posts.tsv')
        output_dir, directory that receives train.tsv, dev.tsv and test.tsv
            (default 'dataset'); created if it does not exist
    Output: None. Three files are written to output_dir.
    """
    import os  # local import keeps this cell self-contained

    df = clean_dataset(input_file)
    # NOTE(review): name_mapping (label -> code) is not saved or returned here,
    # so the codes in the output files cannot be decoded later without re-fitting.
    labels_encoded, name_mapping = encode_labels(df['subreddit'])
    train, dev, test = new_datasets(df, labels_encoded)

    # to_csv does not create missing directories, so make sure it exists first
    os.makedirs(output_dir, exist_ok=True)

    #output tsv file, no header for train and dev
    train.to_csv(os.path.join(output_dir, 'train.tsv'), sep='\t', index=False, header=False)
    dev.to_csv(os.path.join(output_dir, 'dev.tsv'), sep='\t', index=False, header=False)
    test.to_csv(os.path.join(output_dir, 'test.tsv'), sep='\t', index=False, header=True)

In [7]:
# Run the pipeline: reads 'reddit_posts.tsv' and writes train/dev/test .tsv files under 'dataset/'
main()

Original Size: (1013000, 4)
Sampled Size: (101300, 5)
Cleaning text...
Encoding Labels...
Datasets Created.
