# Extracting the features from the csv file containing all extracted raw posts

#### This script takes the csv file containing all posts with train/test label and creates another CSV with the 'normal' features

### Imports

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
from IPython.display import clear_output
import numpy as np
import string
import math
import nltk
from collections import defaultdict

### Definitions

In [2]:
# --- Input / intermediate files --------------------------------------------
# CSV with all extracted raw posts and their train/test label.
raw_csv_filename = 'data/PostsList.csv'
# Copy of the raw CSV with null-containing rows dropped (created on demand).
# Uses the same lowercase 'data/' directory as every other path so the
# notebook does not need two differently-cased folders on case-sensitive
# filesystems.
raw_nonulls_csv_filename = 'data/PostList-NoNulls.csv'

# Lists of the selected bi-grams; each one becomes a feature column.
pos_ngram_list_filename = 'data/POS-ngram-selected.csv'
bleached_ngram_list_filename = 'data/Bleached-ngram-selected.csv'

# Per-post text used for the bi-gram features
# (presumably the POS-tagged / bleached version of every post).
pos_csv_filename = 'data/POS.csv'
bleached_csv_filename = 'data/Bleached.csv'

# Selected function words, one per line.
functionWord_list_filename = 'data/FunctionWords-selected.txt'

# output of this notebook
feature_csv_filename = 'data/MachineLearningData.csv'

# Delimiters used to cut a post into words: all ASCII punctuation plus space.
punctuation = string.punctuation + ' '
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ '

### As a preprocessing step, we will remove the rows containing nulls from the PostsList.csv file

In [3]:
# Build the null-free copy of the raw posts only once: when the file is
# already present the whole step is skipped.
if not os.path.isfile(raw_nonulls_csv_filename):
    # First pass: find the largest PostID; it is only used to estimate the
    # number of chunks for the progress display and for the final message.
    number_of_posts = 0
    chunksize = 10
    for chunk in pd.read_csv(raw_csv_filename, usecols=['PostID'], chunksize=chunksize):
        # keep a running maximum so the result does not depend on the
        # rows being sorted by PostID
        number_of_posts = max(number_of_posts, np.max(chunk.PostID))

    number_of_chunks = math.ceil(number_of_posts / chunksize)

    # Initialise the CSV with its header row
    df = pd.DataFrame(columns=['PostID', 'UserID', 'Gender', 'Post', 'TrainTest'])
    df.to_csv(raw_nonulls_csv_filename, index=False)

    # Second pass: append every chunk, minus the rows that contain a null
    with open(raw_nonulls_csv_filename, 'a') as f:
        for i, chunk in enumerate(pd.read_csv(raw_csv_filename, chunksize=chunksize)):
            chunk = chunk.dropna()

            # refresh the progress display every 100 chunks only
            if i % 100 == 0:
                clear_output()
                print('{0}/{1}'.format(i+1, number_of_chunks), end = '', flush=True)

            chunk.to_csv(f, header=False, index=False)

    clear_output()
    print('DONE - {0} number_of_posts'.format(number_of_posts))

### Define helper functions

In [4]:
def split(string, delimeter_string):
    """Split ``string`` on any of the characters in ``delimeter_string``.

    Every character of ``delimeter_string`` is treated as a separator.
    Empty pieces (produced by leading, trailing or consecutive separators)
    are dropped.

    Parameters
    ----------
    string : str
        Text to split.
    delimeter_string : str
        Characters to split on. When empty, no splitting takes place.

    Returns
    -------
    list of str
        The non-empty pieces, in their original order.
    """
    # Without any delimiter there is nothing to split on.
    if not delimeter_string:
        return [string] if string else []

    # Map every other delimiter onto the first one in a single pass, then
    # split on that single character.
    first_delim = delimeter_string[0]
    other_delims = delimeter_string[1:]
    table = str.maketrans(other_delims, first_delim * len(other_delims))
    pieces = string.translate(table).split(first_delim)

    return [piece for piece in pieces if piece]

### Define the required columns and load the feature lists from the data files

In [5]:
other_columns = [
    #ID to identify the post
    'PostID',
    
    # Basic Features
    'WordCount',
    'SentenceCount',
    'AvgWordLength',
    'AvgSentenceLength',
    'UniqueWordsPercentage',
    'URLCount',
    
    # Words ending with some suffix
    'ableWords',
    'alWords',
    'fulWords',
    'ibleWords',
    'icWords',
    'iveWords',
    'lessWords',
    'lyWords',
    'ousWords',
    
    # Target value
    'Gender',
    
    # Whether it is part of the train or test set
    'TrainTest'
]


def _bigram_column_map(csv_filename, prefix):
    """Map each bi-gram listed in ``csv_filename`` to its feature column name.

    The first two columns of the CSV are taken as the two halves of the
    bi-gram; the column name is '<prefix>_<first>_<second>'.
    """
    ngram_df = pd.read_csv(csv_filename)
    column_map = {}
    for _, row in ngram_df.iterrows():
        # .iloc makes the positional access explicit (plain integer keys on
        # a labelled Series are deprecated)
        first, second = row.iloc[0], row.iloc[1]
        column_map[(first, second)] = '{0}_{1}_{2}'.format(prefix, first, second)
    return column_map


# get pos bi-gram feature list
pos_dict = _bigram_column_map(pos_ngram_list_filename, 'pos')  # bi-gram -> column name
pos_columns = list(pos_dict.values())

# get bleached bi-gram feature list
bleached_dict = _bigram_column_map(bleached_ngram_list_filename, 'bleached')  # bi-gram -> column name
bleached_columns = list(bleached_dict.values())

# get function word feature list
fw_dict = {}  # function word -> column name
with open(functionWord_list_filename, 'r') as f:
    for line in f:
        # strip only the line terminator, so a final line without a trailing
        # newline keeps its last character
        word = line.rstrip('\n')
        fw_dict[word] = 'fw_{0}'.format(word)
fw_columns = list(fw_dict.values())

# Write the header of the output CSV; the (empty) frame is kept in `df` so
# later cells can reuse its column layout.
df = pd.DataFrame(columns=other_columns + pos_columns + bleached_columns + fw_columns)
df.to_csv(feature_csv_filename, index=False)

print(len(df.columns.values))
print(df.columns.values)

168
['PostID' 'WordCount' 'SentenceCount' 'AvgWordLength' 'AvgSentenceLength'
 'UniqueWordsPercentage' 'URLCount' 'ableWords' 'alWords' 'fulWords'
 'ibleWords' 'icWords' 'iveWords' 'lessWords' 'lyWords' 'ousWords'
 'Gender' 'TrainTest' 'pos_VB_VBP' 'pos_._.' 'pos_NNP_NNPS' 'pos_JJ_VBP'
 'pos_NNP_``' 'pos_NNP_TO' 'pos_DT_NNP' 'pos_NNP_MD' 'pos_NNP_)' 'pos_._)'
 'pos_CD_NNP' 'pos_._NN' 'pos_NNP_NNS' 'pos_)_PRP' 'pos_NN_VBP'
 'pos_PRP_CC' 'pos_(_NNP' 'pos_``_DT' 'pos_NNP_(' 'pos_PRP_WRB'
 'pos_NNP_VBZ' 'pos_POS_NNP' 'pos_(_CD' 'pos_``_NNP' 'pos_IN_NNP'
 'pos_NNS_VBZ' 'pos_NNP_CD' 'pos_._VBN' 'pos_JJ_PRP' 'pos_VBG_NNP'
 'pos_CD_)' 'pos_CD_,' 'pos_VBP_VBP' 'pos_NNP_DT' 'pos_VBN_VBN' 'pos_._('
 'pos_NNP_IN' 'pos_NNP_:' 'pos_CD_:' 'pos_:_NNP' 'pos_VBP_PRP'
 'pos_VBP_WP' 'pos_PRP_PRP' 'pos_PRP_NN' 'pos_VBP_WRB' 'pos_PRP_.'
 'pos_WP_NN' 'pos_DT_``' 'pos_PRP_VB' 'pos_VBN_DT' 'bleached_CVV_?'
 'bleached_VCCCVCC_CVCCVC' 'bleached_--_--' 'bleached_VCCCVCC_CVCCVCC'
 'bleached_!_V' 'bleached_VCCCVCC_

### Define main functions to parse a post and return the entry

In [6]:
def extract_data_raw(post):
    """Compute the features that are derived from the raw text of a post.

    The post is cut into words (on every character of ``punctuation``) and
    into sentences (on '.', '?' and '!'). The returned dict holds the basic
    statistics, the suffix features and the function-word frequencies.
    """
    tokens = split(post, punctuation)
    sentence_list = split(post, '.?!')

    # basic statistics
    features = {}
    features['WordCount'] = getWordCount(tokens)
    features['SentenceCount'] = getSentenceCount(sentence_list)
    features['AvgWordLength'] = getAvgWordLength(tokens)
    features['AvgSentenceLength'] = getAvgSentenceLength(sentence_list)
    features['UniqueWordsPercentage'] = UniqueWordsPercentage(tokens)
    features['URLCount'] = getURLCount(tokens)

    # suffix features, then function-word features
    features.update(getWordsEndingDict(tokens))
    features.update(getFunctionWordFrequencies(post))

    return features

def extract_data_pos(post):
    """Return the relative frequency of every selected POS bi-gram in ``post``.

    ``post`` is tokenized with ``nltk.word_tokenize`` and the tokens are
    paired into bi-grams. Each bi-gram present in ``pos_dict`` adds
    ``1 / number_of_bigrams`` to its column.

    Returns a dict with one key per entry of ``pos_columns``; columns whose
    bi-gram never occurs (or all of them, when the post has fewer than two
    tokens) stay at 0.
    """
    entry = dict.fromkeys(pos_columns, 0)

    # tokenize once and reuse the bi-gram list for both the count and the loop
    bigrams = list(nltk.bigrams(nltk.word_tokenize(post)))
    if bigrams:
        inverse_number_of_bigrams = 1 / len(bigrams)

        for bigram in bigrams:
            if bigram in pos_dict:
                entry[pos_dict[bigram]] += inverse_number_of_bigrams
    return entry

def extract_data_bleached(post):
    """Return the relative frequency of every selected bleached bi-gram in ``post``.

    ``post`` is tokenized with ``nltk.word_tokenize`` and the tokens are
    paired into bi-grams. Each bi-gram present in ``bleached_dict`` adds
    ``1 / number_of_bigrams`` to its column.

    Returns a dict with one key per entry of ``bleached_columns``; columns
    whose bi-gram never occurs (or all of them, when the post has fewer than
    two tokens) stay at 0.
    """
    entry = dict.fromkeys(bleached_columns, 0)

    # tokenize once and reuse the bi-gram list for both the count and the loop
    bigrams = list(nltk.bigrams(nltk.word_tokenize(post)))
    if bigrams:
        inverse_number_of_bigrams = 1 / len(bigrams)

        for bigram in bigrams:
            if bigram in bleached_dict:
                entry[bleached_dict[bigram]] += inverse_number_of_bigrams
    return entry

### Define functions to parse blog posts individually

In [7]:
def getWordCount(words):
    """Return how many words the post contains."""
    word_total = len(words)
    return word_total

def getSentenceCount(sentences):
    """Return how many sentences the post contains."""
    sentence_total = len(sentences)
    return sentence_total

def getAvgWordLength(words):
    """Return the mean number of characters per word.

    Returns 0 for an empty word list (same convention as
    ``UniqueWordsPercentage``) instead of NaN with a RuntimeWarning.
    """
    if len(words) == 0:
        return 0
    return np.mean([len(word) for word in words])

def getAvgSentenceLength(sentences):
    """Return the mean number of words per sentence.

    Each sentence is cut into words on every character of ``punctuation``.
    Returns 0 for an empty sentence list (same convention as
    ``UniqueWordsPercentage``) instead of NaN with a RuntimeWarning.
    """
    if len(sentences) == 0:
        return 0
    words_per_sentence = [len(split(sentence, punctuation)) for sentence in sentences]
    return np.mean(words_per_sentence)

def UniqueWordsPercentage(words):
    """Return the share of distinct words among all words (0 when there are none)."""
    total = len(words)
    if total == 0:
        return 0
    distinct = np.unique(words)
    return len(distinct) / total

def getURLCount(words):
    """Return the fraction of words that are the 'urlLink' placeholder.

    Returns 0 for an empty word list (same convention as
    ``UniqueWordsPercentage``) instead of raising ZeroDivisionError.
    """
    if len(words) == 0:
        return 0
    return words.count('urlLink') / len(words)

def getWordsEndingDict(words):
    """Return, for each tracked suffix, the fraction of words ending with it.

    The result has one '<suffix>Words' key per suffix (able, al, ful, ible,
    ic, ive, less, ly, ous). For an empty word list every fraction is 0
    (same convention as ``UniqueWordsPercentage``) instead of raising
    ZeroDivisionError.
    """
    suffixes = ('able', 'al', 'ful', 'ible', 'ic', 'ive', 'less', 'ly', 'ous')

    no_of_words = len(words)
    if no_of_words == 0:
        return {suffix + 'Words': 0 for suffix in suffixes}

    return {
        suffix + 'Words': get_no_of_words_ending_with(suffix, words) / no_of_words
        for suffix in suffixes
    }

def get_no_of_words_ending_with(suffix, words):
    """Count the entries of ``words`` that end with ``suffix``."""
    return sum(1 for candidate in words if candidate.endswith(suffix))

def getFunctionWordFrequencies(post):
    """Return the relative frequency of every selected function word in ``post``.

    ``post`` is tokenized with ``nltk.word_tokenize``; each token that is a
    key of ``fw_dict`` adds ``1 / number_of_tokens`` to its column.

    Returns a dict with one key per entry of ``fw_columns``; columns whose
    word never occurs stay at 0.
    """
    entry = dict.fromkeys(fw_columns, 0)

    words = nltk.word_tokenize(post)
    if len(words) > 0:
        inverse_number_of_words = 1 / len(words)

        for word in words:
            # `entry` is keyed by column name ('fw_<word>'), not by the word
            # itself, so the token has to be translated through fw_dict
            if word in fw_dict:
                entry[fw_dict[word]] += inverse_number_of_words
    return entry

### Extract the features

In [None]:
# Find the largest PostID of the null-free posts; it is only used to estimate
# the number of chunks for the progress display and for the final message.
number_of_posts = 0
chunksize = 10  # also the chunk size used by the extraction loop
for chunk in pd.read_csv(raw_nonulls_csv_filename, usecols=['PostID'], chunksize=chunksize):
    # keep a running maximum so the result does not depend on the rows
    # being sorted by PostID
    number_of_posts = max(number_of_posts, np.max(chunk.PostID))

number_of_chunks = math.ceil(number_of_posts / chunksize)

In [None]:
# open CSV file in append mode
# Walk the three input CSVs in lock-step, chunk by chunk, and append the
# features of every post to the output CSV (whose header already exists).
with open(feature_csv_filename, 'a') as f:
    chunk_triples = zip(
        pd.read_csv(raw_nonulls_csv_filename, chunksize=chunksize),
        pd.read_csv(pos_csv_filename, chunksize=chunksize),
        pd.read_csv(bleached_csv_filename, chunksize=chunksize),
    )
    for i, (raw_chunk, pos_chunk, bleached_chunk) in enumerate(chunk_triples):

        clear_output()
        print('{0}/{1}'.format(i+1, number_of_chunks), end = '', flush=True)

        # Collect the rows of this chunk in a list and build the frame once;
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call.
        entries = []
        for (_, raw_row), (_, pos_row), (_, bleached_row) in zip(
            raw_chunk.iterrows(),
            pos_chunk.iterrows(),
            bleached_chunk.iterrows()
        ):
            # Columns are addressed by position via .iloc; the first column
            # of every file is the PostID and must match across the files.
            post_id = raw_row.iloc[0]
            if post_id != pos_row.iloc[0] or post_id != bleached_row.iloc[0]:
                # NOTE: {0} is the chunk index, not the row index
                raise Exception('Error in data, PostIDs are not equal at row {0}'.format(i))

            entry = {
                'PostID': post_id,
                'Gender': raw_row.iloc[2],
                'TrainTest': raw_row.iloc[4]
            }
            entry.update( extract_data_raw(raw_row.iloc[3]) )
            entry.update( extract_data_pos(pos_row.iloc[1]) )
            entry.update( extract_data_bleached(bleached_row.iloc[1]) )

            entries.append(entry)

        # Reuse the column layout of the header frame so the values line up
        # with the header that was written to the CSV earlier.
        df = pd.DataFrame(entries, columns=df.columns)
        df.to_csv(f, header=False, index=False)

clear_output()
print('DONE - {0} number_of_posts'.format(number_of_posts))