# Extracting the features from blog files

#### This notebook takes each blog file and creates a CSV with the 'normal' features and a JSON with the list of features to be handled only by the RNN part

### Imports

In [147]:
import nltk
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
from IPython.display import clear_output
import numpy as np
import string

### Definitions

In [163]:
# Directory that holds the raw blog files.
data_path = 'data/blogs/'

# Characters treated as word delimiters when a post is tokenised: all ASCII
# punctuation plus every ASCII whitespace character. Using only ' ' here would
# leave words separated by tabs or newlines glued together as a single token.
punctuation = string.punctuation + string.whitespace

PostID = 0 # A global parameter to give unique IDs to posts

### Define helper functions

In [164]:
def split(string, delimeter_string):
    """Split ``string`` on any of the single characters in ``delimeter_string``.

    Every character of ``delimeter_string`` acts as a separator. Empty pieces
    (produced by leading, trailing or consecutive separators) are dropped.

    Parameters
    ----------
    string : str
        Text to split. (The parameter name shadows the ``string`` module
        inside this function only.)
    delimeter_string : str
        Characters to split on. If empty, no splitting is performed.

    Returns
    -------
    list of str
        The non-empty pieces, in their original order.
    """
    # Without any delimiter there is nothing to split on; indexing [0] below
    # would raise IndexError, so return the text as a single piece instead.
    if not delimeter_string:
        return [string] if string else []

    first_delim = delimeter_string[0]
    # Map every delimiter onto the first one in a single pass over the text,
    # rather than one full ``replace`` pass per delimiter character.
    table = str.maketrans(delimeter_string, first_delim * len(delimeter_string))
    pieces = string.translate(table).split(first_delim)

    return [piece for piece in pieces if piece]

def get_no_of_words_ending_with(suffix, words):
    """Count the entries of ``words`` that end with ``suffix`` (case-sensitive)."""
    return sum(1 for candidate in words if candidate.endswith(suffix))

### Initialise CSV file with some basic columns

In [165]:
# Destination of the extracted feature table.
csv_filename = 'data/ExtractedData.csv'

# Column layout of the CSV, assembled from its logical groups.
dataframe_columns = (
    # ID to identify the post
    ['PostID']
    # Basic features
    + [
        'WordCount',
        'SentenceCount',
        'AvgWordLength',
        'AvgSentenceLength',
        'UniqueWordsPercentage',
        'URLCount',
    ]
    # Words ending with some suffix
    + [
        'ableWords',
        'alWords',
        'fulWords',
        'ibleWords',
        'icWords',
        'iveWords',
        'lessWords',
        'lyWords',
        'ousWords',
    ]
    # Target value
    + ['Gender']
)

# Create (or overwrite) the CSV so that it holds only the header row.
df = pd.DataFrame(columns=dataframe_columns)
df.to_csv(path_or_buf=csv_filename, index=False)

### Define main functions to parse a blog file and return a new full dataframe

In [167]:
def extract_data(filename):
    """Parse one blog file and return the features of each of its posts.

    The file is read from ``data_path``, decoded as UTF-8 (bytes that cannot
    be decoded are dropped) and parsed with BeautifulSoup. Every ``<post>``
    element yields one row of features. The gender label is the second
    dot-separated field of ``filename``.

    Side effect: the module-level ``PostID`` counter is incremented once per
    post, so IDs stay unique across successive calls.

    Parameters
    ----------
    filename : str
        Name of a file inside ``data_path``; must contain at least one '.'
        so that the gender field can be read from it.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns of ``dataframe_columns`` (empty if
        the file contains no ``<post>`` element).
    """
    global PostID

    with open(os.path.join(data_path, filename), "rb") as f:
        contents = f.read().decode('utf8', 'ignore')

    # NOTE(review): 'html' lets BeautifulSoup pick whichever HTML parser is
    # installed; pin a concrete parser if results must match across machines.
    soup = BeautifulSoup(contents, 'html')

    meta = filename.split('.')
    gender = meta[1]

    # Collect plain dicts and build the frame once at the end:
    # ``DataFrame.append`` was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []

    for post in soup.find_all('post'):
        post = post.text.strip()
        words = split(post, punctuation)
        sentences = post.split('.')

        entry = {
            'PostID': PostID,

            'WordCount': getWordCount(words),
            'SentenceCount': getSentenceCount(sentences),
            'AvgWordLength': getAvgWordLength(words),
            'AvgSentenceLength': getAvgSentenceLength(sentences),
            'UniqueWordsPercentage': UniqueWordsPercentage(words),
            'URLCount': getURLCount(words),

            'Gender': gender
        }

        entry.update( getWordsEndingDict(words) )
        rows.append(entry)

        PostID += 1

    # ``columns=`` fixes the column order (and keeps the header for empty files).
    return pd.DataFrame(rows, columns=dataframe_columns)

### Define functions to parse blog posts individually

In [168]:
def getWordCount(words):
    """Return the number of entries in ``words``."""
    total_words = len(words)
    return total_words

def getSentenceCount(sentences):
    """Return the number of entries in ``sentences``."""
    total_sentences = len(sentences)
    return total_sentences

def getAvgWordLength(words):
    """Return the mean length (in characters) of the entries of ``words``.

    Returns 0.0 for an empty list, mirroring ``UniqueWordsPercentage``;
    ``np.mean`` of an empty sequence would otherwise emit a RuntimeWarning
    and return NaN.
    """
    if len(words) == 0:
        return 0.0
    return np.mean([len(word) for word in words])

def getAvgSentenceLength(sentences):
    """Return the mean number of words per entry of ``sentences``.

    Each sentence is tokenised with ``split`` using the module-level
    ``punctuation`` delimiters, and the token counts are averaged.

    Returns 0.0 for an empty list; ``np.mean`` of an empty sequence would
    otherwise emit a RuntimeWarning and return NaN.
    """
    if len(sentences) == 0:
        return 0.0
    words_per_sentence = [len(split(sentence, punctuation)) for sentence in sentences]
    return np.mean(words_per_sentence)

def UniqueWordsPercentage(words):
    """Return the share of distinct entries in ``words``.

    Despite the name, the result is a fraction between 0 and 1
    (distinct count divided by total count), not a value out of 100.
    Comparison is case-sensitive. Returns 0 for an empty list.
    """
    if len(words) == 0:
        return 0
    # A set counts distinct words in linear time and avoids building and
    # sorting a fixed-width numpy string array.
    return len(set(words)) / len(words)

def getURLCount(words):
    """Return how many entries of ``words`` are exactly the token 'urlLink'."""
    return sum(1 for token in words if token == 'urlLink')

def getWordsEndingDict(words):
    """Return the suffix-count features for ``words``.

    The result maps each '<suffix>Words' column name to the number of entries
    of ``words`` ending with that suffix, as counted by
    ``get_no_of_words_ending_with``.
    """
    suffix_by_column = (
        ('ableWords', 'able'),
        ('alWords', 'al'),
        ('fulWords', 'ful'),
        ('ibleWords', 'ible'),
        ('icWords', 'ic'),
        ('iveWords', 'ive'),
        ('lessWords', 'less'),
        ('lyWords', 'ly'),
        ('ousWords', 'ous'),
    )
    return {
        column: get_no_of_words_ending_with(suffix, words)
        for column, suffix in suffix_by_column
    }

### Extract the features

In [169]:
# List the blog files once, in sorted order, so the progress total is known up
# front and PostIDs are assigned in the same order on every run.
blog_filenames = sorted(os.listdir(data_path))
number_of_files = len(blog_filenames)

# Restart the ID counter so IDs begin at 0 for this extraction.
PostID = 0
# Open the CSV in write mode and emit the header here: together with the
# counter reset above, appending would add duplicate rows with duplicate
# PostIDs whenever this cell is re-run (e.g. after an interrupted run).
# newline='' is what pandas recommends when passing an open file to to_csv.
with open(csv_filename, 'w', newline='') as f:
    pd.DataFrame(columns=dataframe_columns).to_csv(f, index=False)

    for i, filename in enumerate(blog_filenames, start=1):
        # wait=True replaces the previous counter without flickering
        clear_output(wait=True)
        print('{0}/{1}'.format(i, number_of_files), end = '', flush=True)

        # Extract data from each file
        df = extract_data(filename)
        df.to_csv(f, header=False, index=False)

35/19320['There', 'is', 'a', 'fake', 'commercial', 'on', 'Nickelodeon', 'about', 'Scream', 'in', 'a', 'Box', 'as', 'a', 'device', 'to', 'keep', 'you', 'awake', 'and', 'give', 'you', 'more', 'time', 'to', 'do', 'things', 'Maybe', 'I', 'need', 'that', 'I', 'waste', 'too', 'much', 'time', 'on', 'stuff', 'like', 'Neopets', 'brutal', 'honesty', 'here', 'I', 'm', 'behind', 'transcribing', 'my', 'notes', 'from', 'my', 'Exploring', 'the', 'Visual', 'Arts', 'class', 'I', 've', 'been', 'putting', 'them', 'online', 'so', 'I', 'can', 'include', 'links', 'that', 'I', 've', 'looked', 'up', 'I', 'don', 't', 'know', 'if', 'this', 'is', 'all', 'a', 'waste', 'of', 'time', 'or', 'what', 'Anyway', 'I', 'was', 'up', 'til', 'about', '2', '30', 'this', 'morning', 'and', 'didn', 't', 'finish', 'Monday', 's', 'notes', 'When', 'I', 'came', 'home', 'from', 'picking', 'the', 'girls', 'up', 'from', 'school', 'I', 'was', 'hoping', 'to', 'do', 'today', 's', 'notes', 'Well', 'I', 'don', 't', 'have', 'time', 'I', 'll'

['It', 's', 'nearly', '2', 'a', 'm', 'I', 'haven', 't', 'started', 'on', 'my', 'project', 'for', 'my', 'Advanced', 'Photoshop', 'class', 'because', 'apparently', 'I', 'didn', 't', 'really', 'upload', 'it', 'to', 'my', 'Yahoo', 'briefcase', 'like', 'I', 'thought', 'and', 'I', 'm', 'watching', 'House', 'Hunters', 'on', 'urlLink', 'HGTV', 'This', 'is', 'a', 'semi', 'interesting', 'show', 'you', 'get', 'to', 'see', 'other', 'people', 'look', 'at', 'homes', 'to', 'buy', 'The', 'couple', 'in', 'this', 'episode', 'Alicia', 'and', 'Gentry', 'Patrick', 'have', 'my', 'vote', 'for', 'most', 'overuse', 'of', 'the', 'word', 'nice', 'It', 's', 'starting', 'to', 'get', 'annoying', 'Tomorrow', 'actually', 'later', 'today', 'I', 'have', 'to', 'turn', 'in', 'my', 'days', 'off', 'request', 'for', 'March', 'my', 'timesheet', 'to', 'the', 'college', 'for', 'note', 'taking', 'and', 'my', 'timesheet', 'at', 'work', 'It', 's', 'that', 'time', 'already', 'I', 'was', 'playing', 'on', 'urlLink', 'Neopets', 'but'

['I', 'added', 'a', 'cast', 'of', 'characters', 'to', 'my', 'blog', 'so', 'people', 'can', 'figure', 'out', 'who', 'the', 'heck', 'I', 'm', 'talking', 'about', 'that', 'is', 'if', 'anyone', 'is', 'reading', 'this', 'besides', 'me', 'I', 'went', 'to', 'Relief', 'Society', 'tonight', 'Jessica', 'volunteered', 'to', 'help', 'with', 'the', 'nursery', 'again', 'I', 'feel', 'guilty', 'about', 'relying', 'on', 'her', 'all', 'the', 'time', 'but', 'it', 'does', 'get', 'her', 'out', 'of', 'the', 'house', 'I', 'gave', 'her', 'a', 'scrapbooking', 'book', 'and', 'templates', 'that', 'I', 'had', 'It', 's', 'nearly', 'new', 'and', 'the', 'templates', 'were', 'only', 'used', 'once', 'when', 'Jessica', 'was', 'here', 'watching', 'the', 'girls', 'Anyway', 'I', 'think', 'she', 'likes', 'it', 'And', 'I', 'do', 'appreciate', 'all', 'the', 'help', 'she', 'gives', 'me', 'Enrichment', 'Night', 'was', 'about', 'music', 'how', 'it', 'can', 'help', 'in', 'many', 'ways', 'bonding', 'with', 'family', 'growing', 's

KeyboardInterrupt: 