# Fetch input and parse

In [18]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

The code below reads the dataset files, parses the XML for each author, and builds two Python dictionaries:
* author_tweets - maps each author's ID to a list of their tweets.
* author_truths - maps each author's ID to a truth value, 0 or 1.

In [20]:
def __parse_author_tweets(xml_filepaths):
    """Returns a dictionary of authors to a list of their tweets

    Keyword arguments:
    xml_filepaths -- iterable of path strings to the per-author XML files

    The author id used as the dictionary key is the file name without its
    extension. The tweets are the text of each child of the first element
    under the XML root.
    """
    author_tweets = {}
    for filepath in xml_filepaths:
        documents = ET.parse(filepath).getroot()[0]

        # Accept both "/" and "\" as separators so the key is the bare file
        # name on every platform, not only for Windows-style paths.
        filename = filepath.replace("\\", "/").rsplit("/", 1)[-1]
        author = os.path.splitext(filename)[0]

        author_tweets[author] = [document.text for document in documents]

    return author_tweets

def __parse_author_truths(truth_filepath):
    """Returns a dictionary of authors to their truth values

    Keyword arguments:
    truth_filepath -- path to a text file with one "author:::truth" record
                      per line

    The truth values are stored exactly as read from the file, i.e. as
    strings; they are not converted to int. Blank lines are skipped. A
    non-blank line that does not split into exactly two ":::"-separated
    fields raises ValueError.
    """
    author_truths = {}
    with open(truth_filepath, 'r') as fp:
        for line in fp:
            record = line.rstrip()
            if not record:
                # A blank line (e.g. an empty line at the end of the file)
                # holds no record and must not reach the two-field unpack.
                continue
            author, truth = record.split(":::")
            author_truths[author] = truth

    return author_truths

def __filter_files(datasets_path, files, file_type):
    """Returns the paths of the files whose names end with file_type

    Keyword arguments:
    datasets_path -- directory that each kept file name is joined onto
    files -- iterable of file names
    file_type -- suffix to keep, e.g. ".xml"
    """
    return [
        os.path.join(datasets_path, name)
        for name in files
        if name.endswith(file_type)
    ]

def parse_datasets_language(datasets_path, language):
    """
    Loads the dataset of one language into a pandas DataFrame.

    Keyword arguments:
    datasets_path -- path to the datasets directory
    language -- the language dataset to use, either "en" or "es"

    Returns pandas DataFrame with one row per author. Each row holds the
    columns "author_id" and "truth_value", followed by one "tweet_N" column
    per tweet of that author, numbered from 1.
    """
    language_path = os.path.join(datasets_path, language)
    files = os.listdir(language_path)

    # Every .xml file holds one author's tweets; the first .txt file found
    # is taken to be the truth file.
    xml_filepaths = __filter_files(language_path, files, ".xml")
    truth_filepath = __filter_files(language_path, files, ".txt")[0]

    author_tweets = __parse_author_tweets(xml_filepaths)
    author_truths = __parse_author_truths(truth_filepath)

    # Build one record per author, then let pandas line up the columns.
    rows = []
    for author_id, tweets in author_tweets.items():
        row = {
            "author_id": author_id,
            "truth_value": author_truths[author_id],
        }
        row.update(
            ("tweet_" + str(position), tweet)
            for position, tweet in enumerate(tweets, start=1)
        )
        rows.append(row)

    return pd.DataFrame(rows)