# Extracting the features from the CSV file containing all extracted raw posts

#### This script reads the CSV file containing all posts with their train/test labels and writes another CSV with the 'normal' features

### Imports

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
from IPython.display import clear_output
import numpy as np
import string
import math

### Definitions

In [2]:
# Word delimiters: every ASCII punctuation character plus the space.
punctuation = ''.join([string.punctuation, ' '])

# CSV that is read (raw posts) and CSV that receives the extracted features.
raw_csv_filename, feature_csv_filename = (
    'data/PostsList.csv',
    'data/ExtractedData-Basic.csv',
)

punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ '

### Define helper functions

In [3]:
def split(string, delimeter_string):
    """Split ``string`` on any of the delimiters in ``delimeter_string``.

    Every delimiter after the first is replaced by the first one, and the
    text is then split on that single delimiter. Empty pieces (caused by
    leading, trailing or adjacent delimiters) are dropped.

    Args:
        string: The text to split.
        delimeter_string: The delimiters, one per element (for a ``str``,
            one per character). May be empty.

    Returns:
        A list of the non-empty pieces, in their original order. With no
        delimiters the whole text is returned as a single piece (or an
        empty list when the text itself is empty).
    """
    # Nothing to split on: indexing [0] below would raise IndexError.
    if not delimeter_string:
        return [string] if string else []

    first_delim = delimeter_string[0]
    for current_delim in delimeter_string[1:]:
        string = string.replace(current_delim, first_delim)

    return [piece for piece in string.split(first_delim) if piece]

def get_no_of_words_ending_with(suffix, words):
    """Return how many entries of ``words`` end with ``suffix``."""
    return sum(1 for candidate in words if candidate.endswith(suffix))

### Initialise CSV file with some basic columns

In [4]:
# Column layout of the feature CSV, assembled group by group.
dataframe_columns = (
    # ID to identify the post
    ['PostID']
    # Basic features
    + [
        'WordCount',
        'SentenceCount',
        'AvgWordLength',
        'AvgSentenceLength',
        'UniqueWordsPercentage',
        'URLCount',
    ]
    # Words ending with some suffix
    + [
        'ableWords',
        'alWords',
        'fulWords',
        'ibleWords',
        'icWords',
        'iveWords',
        'lessWords',
        'lyWords',
        'ousWords',
    ]
    # Target value
    + ['Gender']
    # Whether it is part of the train or test set
    + ['TrainTest']
)

# Write a header-only CSV; the rows are appended to it later on.
df = pd.DataFrame(columns=dataframe_columns)
df.to_csv(feature_csv_filename, index=False)

### Define main functions to parse a post and return the entry

In [5]:
def extract_data(post):
    """Compute the text features of a single post.

    The post is tokenised twice: into words (split on every character in
    ``punctuation``) and into sentences (split on ``.``, ``?`` and ``!``).

    Args:
        post: The raw text of the post.

    Returns:
        A dict mapping feature column names to their values: the basic
        features plus the suffix features from ``getWordsEndingDict``.
        ``PostID``, ``Gender`` and ``TrainTest`` are not included.
    """
    words = split(post, punctuation)
    sentences = split(post, '.?!')

    entry = {
        'WordCount': getWordCount(words),
        'SentenceCount': getSentenceCount(sentences),
        'AvgWordLength': getAvgWordLength(words),
        'AvgSentenceLength': getAvgSentenceLength(sentences),
        'UniqueWordsPercentage': UniqueWordsPercentage(words),
        'URLCount': getURLCount(words),
    }
    entry.update(getWordsEndingDict(words))

    return entry

### Define the functions that compute the individual features of a post

In [6]:
def getWordCount(words):
    """Return the number of entries in ``words``."""
    word_total = len(words)
    return word_total

def getSentenceCount(sentences):
    """Return the number of entries in ``sentences``."""
    sentence_total = len(sentences)
    return sentence_total

def getAvgWordLength(words):
    """Return the mean number of characters per word.

    Args:
        words: The words of a post.

    Returns:
        The arithmetic mean of the word lengths, or 0 when ``words`` is
        empty (taking the mean of nothing would yield NaN and a NumPy
        RuntimeWarning).
    """
    if len(words) == 0:
        return 0
    return np.mean([len(word) for word in words])

def getAvgSentenceLength(sentences):
    """Return the mean number of words per sentence.

    Each sentence is broken into words with ``split(sentence, punctuation)``.

    Args:
        sentences: The sentences of a post.

    Returns:
        The arithmetic mean of the per-sentence word counts, or 0 when
        ``sentences`` is empty (taking the mean of nothing would yield NaN
        and a NumPy RuntimeWarning).
    """
    if len(sentences) == 0:
        return 0
    words_per_sentence = [len(split(s, punctuation)) for s in sentences]
    return np.mean(words_per_sentence)

def UniqueWordsPercentage(words):
    """Return the share of distinct words among ``words``.

    Despite the name the value is a fraction (distinct / total), not a
    percentage. An empty word list yields 0.
    """
    total = len(words)
    if total == 0:
        return 0
    distinct = np.unique(words)
    return len(distinct) / total

def getURLCount(words):
    """Return the fraction of words that are exactly ``'urlLink'``.

    Despite the name the value is a ratio (occurrences / total words), not
    an absolute count.
    # NOTE(review): 'urlLink' is presumably the placeholder the corpus uses
    # for hyperlinks - confirm against the raw data.

    Args:
        words: The words of a post.

    Returns:
        The ratio described above, or 0 when ``words`` is empty (avoids a
        ZeroDivisionError for posts that contain no words).
    """
    if len(words) == 0:
        return 0
    return words.count('urlLink') / len(words)

def getWordsEndingDict(words):
    """Return, per suffix, the fraction of words ending with that suffix.

    Args:
        words: The words of a post.

    Returns:
        A dict with one ``'<suffix>Words'`` key for each of the suffixes
        able, al, ful, ible, ic, ive, less, ly and ous. Each value is the
        number of matching words divided by the total number of words;
        all values are 0 when ``words`` is empty (avoids a
        ZeroDivisionError for posts that contain no words).
    """
    suffixes = ('able', 'al', 'ful', 'ible', 'ic', 'ive', 'less', 'ly', 'ous')
    no_of_words = len(words)

    if no_of_words == 0:
        return {suffix + 'Words': 0 for suffix in suffixes}

    return {
        suffix + 'Words': get_no_of_words_ending_with(suffix, words) / no_of_words
        for suffix in suffixes
    }

### Extract the features

In [7]:
# Count the rows of the raw CSV so the extraction loop can show its progress.
# Rows are counted directly rather than derived from PostID values, so the
# total does not depend on the IDs being sorted, contiguous or 1-based.
chunksize = 10
number_of_posts = sum(
    len(chunk)
    for chunk in pd.read_csv(raw_csv_filename, usecols=['PostID'], chunksize=chunksize)
)

# Number of chunks the extraction loop will iterate over.
number_of_chunks = math.ceil(number_of_posts / chunksize)

In [8]:
# Append the extracted features to the feature CSV, one chunk of posts at a
# time. newline='' leaves line-ending handling to pandas, as its to_csv
# documentation asks for when a file object is passed.
with open(feature_csv_filename, 'a', newline='') as f:
    for i, chunk in enumerate(pd.read_csv(raw_csv_filename, chunksize=chunksize)):
        # Skip rows with any missing value.
        chunk = chunk.dropna()

        # Progress indicator; wait=True replaces the previous output only
        # once the new one is available.
        clear_output(wait=True)
        print('{0}/{1}'.format(i+1, number_of_chunks), end='', flush=True)

        # Collect the entries of this chunk in a plain list and build the
        # frame once (DataFrame.append was removed in pandas 2.0 and copied
        # the whole frame on every call).
        entries = []
        for _, row in chunk.iterrows():
            # Columns are addressed by position: 0 = PostID, 2 = Gender,
            # 3 = post text, 4 = TrainTest.
            entry = {
                'PostID': row.iloc[0],
                'Gender': row.iloc[2],
                'TrainTest': row.iloc[4],
            }
            entry.update(extract_data(row.iloc[3]))
            entries.append(entry)

        # columns= fixes the column order to match the header already written.
        df = pd.DataFrame(entries, columns=dataframe_columns)
        df.to_csv(f, header=False, index=False)

38/3864

KeyboardInterrupt: 