# 1. Feature Extraction 1 - Raw Data

#### This script reads all the blog files and puts them in one CSV file containing the following columns:
   * PostID - As an index of sorts
   * UserID - Identifies the author, so that posts can later be split between training and testing by user
   * Gender - Target value
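   * Post - The raw text of the post, stripped of leading and trailing whitespace
   * TrainTest - The author's train/test assignment, looked up by UserID in `data/AuthorTrainTest.csv`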

### Imports

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from IPython.display import clear_output

### Definitions

In [2]:
# Directory containing the blog files to parse; file names are appended to it directly.
data_path = 'data/blogs/'

# Destination CSV that collects the posts extracted from every file.
csv_filename = 'data/PostsList.csv'

# Lookup table indexed by UserID; its 'TrainTest' column is read for each author.
train_test_df = pd.read_csv('data/AuthorTrainTest.csv', index_col=['UserID'])

### Initialise dataframe and target CSV filename

In [3]:
# Empty frame that fixes the column set and order of the output;
# extract_posts() reuses this global as the template for its result.
df = pd.DataFrame(columns=['PostID', 'UserID', 'Gender', 'Post', 'TrainTest'])
# Writing the empty frame creates (or overwrites) the CSV with only the header row.
df.to_csv(csv_filename, index=False)

### Function to extract and clean the posts obtained

In [4]:
def extract_posts(filename):
    """Parse one blog file and return its posts as a DataFrame.

    The file is read from ``data_path``. The first two dot-separated parts
    of ``filename`` are taken as the author's UserID (an integer) and Gender.
    The author's ``TrainTest`` value is looked up in ``train_test_df``.

    Every ``<post>`` element becomes one row. Each row receives the next
    value of the global ``PostID`` counter, which is advanced as a side
    effect. The global ``df`` is rebound to the returned frame, as before.

    Args:
        filename: Name of the blog file, relative to ``data_path``.

    Returns:
        A DataFrame with the same columns as the global ``df`` and one row
        per post found (empty if the file contains no posts).

    Raises:
        ValueError: If the first part of ``filename`` is not an integer.
        KeyError: If the UserID is not present in ``train_test_df``.
    """
    global PostID, df

    # Read raw bytes and decode leniently: undecodable bytes are dropped
    # instead of aborting the whole run on one badly-encoded file.
    with open(os.path.join(data_path, filename), "rb") as f:
        contents = f.read().decode('utf8', 'ignore')

    soup = BeautifulSoup(contents, 'html')

    meta = filename.split('.')
    UserID = int(meta[0])
    Gender = meta[1]

    TrainTest = train_test_df.loc[UserID, 'TrainTest']

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    for post in soup.find_all('post'):
        Post = post.text.strip()
        # New lines are removed and will not be considered.
        # Instead they are replaced by fullstops since this will work well for our processing.
        # Some users were shown to not end their sentences with
        # fullstops before a newline, so we do it for them.
        Post = Post.replace('\r\n', '.').replace('\n', '.')

        rows.append({
            'PostID': PostID,
            'UserID': UserID,
            'Gender': Gender,
            'Post': Post,
            'TrainTest': TrainTest,
        })

        PostID += 1

    # Reuse the column order of the global template frame so the rows line up
    # with the header already written to the CSV.
    df = pd.DataFrame(rows, columns=df.columns)
    return df

### Populate the CSV file

In [5]:
# Iterate over all the xml files and get number of files (to see how much is left)
# List the directory once so the progress total and the loop see the same files.
# Sorting makes the PostID assignment reproducible between runs, because
# os.listdir returns entries in arbitrary order.
filenames = sorted(os.listdir(data_path))
number_of_files = len(filenames)

PostID = 0
# Open the CSV in append mode, below the header row that is already there.
# encoding='utf-8' matches the encoding pandas used for the header, and
# newline='' is what pandas expects of a file object passed to to_csv
# (otherwise Windows writes a blank line after every row).
with open(csv_filename, 'a', encoding='utf-8', newline='') as f:
    for i, filename in enumerate(filenames, start=1):
        clear_output()
        print('{0}/{1}'.format(i, number_of_files), end = '', flush=True)

        # Extract the posts of this file and append them without a header
        df = extract_posts(filename)
        df.to_csv(f, header=False, index=False)

clear_output()
print('DONE - {0} files'.format(number_of_files))

DONE - 19320 files
