# 4. Data Transformations - POS Tags and Bleaching.ipynb

#### This script takes the raw data csv file and transforms the post to their respective POS tags as well s bleached format

The Consonant vowel bleaching method is used because it will keep the length of the words as well

### Imports and downloads

In [1]:
import pandas as pd
import nltk
from collections import defaultdict
import numpy as np
import math
from IPython.display import clear_output

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/danielcauchi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Definitions

In [2]:
raw_csv_filename = 'data/PostsList.csv'

pos_filename = 'data/POS.csv'
bleaching_filename = 'data/Bleached.csv'

dataframe_columns = ['PostID', 'Post']

### Initialise dataframes

In [3]:
df_pos = pd.DataFrame(columns=dataframe_columns)
df_pos.to_csv(pos_filename, index=False)

df_bleach = pd.DataFrame(columns=dataframe_columns)
df_bleach.to_csv(bleaching_filename, index=False)

### Define function to return POS tag sentence given a post

In [4]:
def get_pos(post):
    tags = nltk.pos_tag(nltk.word_tokenize(post))
    return ' '.join([t[1] for t in tags])

bleach_dic = defaultdict(int)
bleach_dic['a'] = bleach_dic['A'] = 'V'
bleach_dic['b'] = bleach_dic['B'] = 'C'
bleach_dic['c'] = bleach_dic['C'] = 'C'
bleach_dic['d'] = bleach_dic['D'] = 'C'
bleach_dic['e'] = bleach_dic['E'] = 'V'
bleach_dic['f'] = bleach_dic['F'] = 'C'
bleach_dic['g'] = bleach_dic['G'] = 'C'
bleach_dic['h'] = bleach_dic['H'] = 'C'
bleach_dic['i'] = bleach_dic['I'] = 'V'
bleach_dic['j'] = bleach_dic['J'] = 'C'
bleach_dic['k'] = bleach_dic['K'] = 'C'
bleach_dic['l'] = bleach_dic['L'] = 'C'
bleach_dic['m'] = bleach_dic['M'] = 'C'
bleach_dic['n'] = bleach_dic['N'] = 'C'
bleach_dic['o'] = bleach_dic['O'] = 'V'
bleach_dic['p'] = bleach_dic['P'] = 'C'
bleach_dic['q'] = bleach_dic['Q'] = 'C'
bleach_dic['r'] = bleach_dic['R'] = 'C'
bleach_dic['s'] = bleach_dic['S'] = 'C'
bleach_dic['t'] = bleach_dic['T'] = 'C'
bleach_dic['u'] = bleach_dic['U'] = 'V'
bleach_dic['v'] = bleach_dic['V'] = 'C'
bleach_dic['w'] = bleach_dic['W'] = 'C'
bleach_dic['x'] = bleach_dic['X'] = 'C'
bleach_dic['y'] = bleach_dic['Y'] = 'C'
bleach_dic['z'] = bleach_dic['Z'] = 'C'

def bleach_character(c):
    if bleach_dic[c] == 0:
        return c
    return bleach_dic[c]

def bleach(post):
    result=len(post)*[None]
    for i,c in enumerate(post):
        result[i] = bleach_character(c)
    return ''.join(result)

### Open, traverse and convert the posts in the raw data CSV file to POS tags

In [5]:
chunksize = 50

number_of_posts = sum(1 for line in open(raw_csv_filename)) -1

number_of_chunks = math.ceil(number_of_posts / chunksize)

In [6]:
with open(pos_filename, 'a') as posF, open(bleaching_filename, 'a') as bleachF:
    for i,chunk in enumerate(pd.read_csv(raw_csv_filename, chunksize=chunksize)):
        chunk = chunk.dropna()
        
        clear_output()
        print('{0}/{1}'.format((i+1), number_of_chunks), end = '', flush=True)

        # Extract data from each file
        df_pos = df_pos.iloc[0:0]
        df_bleach = df_bleach.iloc[0:0]
        for row in chunk.iterrows():
            entry_pos = {
                'PostID': row[1][0],
                'Post': get_pos(row[1][3])
            }
            entry_bleach = {
                'PostID': row[1][0],
                'Post': bleach(row[1][3])
            }
            
            df_pos = df_pos.append(entry_pos, ignore_index=True)
            df_bleach = df_bleach.append(entry_bleach, ignore_index=True)
            
        df_pos.to_csv(posF, header=False, index=False)
        df_bleach.to_csv(bleachF, header=False, index=False)

56/13838

TypeError: expected string or bytes-like object