# Imports

In [None]:
import datetime
import glob
import math
import os
from calendar import monthrange

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

%matplotlib inline

# Table of Contents
 <p><div class="lev1"><a href="#Task-1.-Compiling-Ebola-Data"><span class="toc-item-num">Task 1.&nbsp;&nbsp;</span>Compiling Ebola Data</a></div>
 <div class="lev1"><a href="#Task-2.-RNA-Sequences"><span class="toc-item-num">Task 2.&nbsp;&nbsp;</span>RNA Sequences</a></div>
 <div class="lev1"><a href="#Task-3.-Class-War-in-Titanic"><span class="toc-item-num">Task 3.&nbsp;&nbsp;</span>Class War in Titanic</a></div></p>

In [None]:
# Root folder of the input data. Use the data folder provided in Tutorial 02 - Intro to Pandas.
DATA_FOLDER = 'Data'

## Task 1. Compiling Ebola Data

The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single `Dataframe`.
Using this `DataFrame`, calculate for *each country*, the *daily average per month* of *new cases* and *deaths*.
Make sure you handle all the different expressions for *new cases* and *deaths* that are used in the reports.

### Constants

In [None]:
# Column names of the standardised dataframe produced for every country.
DEFAULT_NAME_COLUMN_DATE = 'Date'
DEFAULT_NAME_COLUMN_DESCRIPTION = 'Description'
DEFAULT_NAME_COLUMN_TOTAL = 'Totals'
DEFAULT_NAME_COLUMN_COUNTRY = 'Country'

# Values written in the description column to label the two kinds of rows.
DEFAULT_NAME_ROW_DESCRIPTION_NEW_CASES = 'New cases'
DEFAULT_NAME_ROW_DESCRIPTION_NEW_DEATHS = 'New deaths'

# strftime format of the date strings returned by sanitize_date.
DEFAULT_DATA_FORMAT = '%Y-%m-%d'

### Auxiliary functions

In [None]:
def sanitize_date(date, original_formats):
    '''
    Transform a date given as a string into a string with the same date
    in the default format.

    Every format in `original_formats` is tried in order and the first one
    that parses `date` is used.

    Parameters:
        date: the date as a string.
        original_formats: sequence of `strptime` format strings to try.

    Returns:
        The date formatted with `DEFAULT_DATA_FORMAT`.

    Raises:
        ValueError: if none of the formats matches `date` (this includes an
            empty `original_formats`).
    '''
    for original_format in original_formats:
        try:
            date_rep = datetime.datetime.strptime(date, original_format)
        except ValueError:
            # This format does not match: try the next one.
            continue
        return date_rep.strftime(DEFAULT_DATA_FORMAT)

    raise ValueError('Date {!r} does not match any of the formats {}'.format(
        date, list(original_formats)))

In [None]:
def estimate_daily_from_cum(np_list):
    '''
    Given a pandas Series of accumulated data, calculate the data for each day.
    It assumes ordered data.

    A missing (NaN) entry yields 0 and is skipped when computing the next
    difference, so the following known entry carries the whole delta since
    the last known one. The first known entry also yields 0, because there
    is nothing to compare it with.

    Parameters:
        np_list: pandas Series of numeric accumulated values.

    Returns:
        A numpy array with one daily value per entry of `np_list`.
    '''
    # None (rather than a numeric sentinel) marks "no known value yet", so
    # that any real accumulated value, including -1, is handled correctly.
    last_known_n = None
    final_list = list()
    # Series.items() replaces Series.iteritems(), which newer pandas removed.
    for _, elem in np_list.items():
        if math.isnan(elem):
            final_list.append(0)
            continue
        if last_known_n is None:
            final_list.append(0)
        else:
            final_list.append(elem - last_known_n)
        last_known_n = elem
    return np.array(final_list)

The definitions of the basic schema and formats for the given dataset are stored in the following dictionaries.

In [None]:
# Short key of each country, as used in the names of the data folders.
country_keys = ['guinea', 'liberia', 'sl']

# Display name of each country.
countries = {'guinea': 'Guinea', 'liberia': 'Liberia', 'sl': 'Sierra Leone'}

# Folder holding the CSV reports of each country.
paths = {key: DATA_FOLDER + '/ebola/' + key + '_data' for key in country_keys}

# For each country, the columns to keep; the first entry is the date column.
wanted_columns = {
    'guinea': ['Date', 'Description', 'Totals'],
    'liberia': ['Date', 'Variable', 'National'],
    'sl': ['date', 'variable', 'National'],
}

# For each country, the date formats passed to sanitize_date, in the order they are tried.
date_original_formats = {
    'guinea': ['%Y-%m-%d', '%y-%m-%d'],
    'liberia': ['%m/%d/%Y', '%m/%d/%y'],
    'sl': ['%Y-%m-%d', '%y-%m-%d'],
}

# Will hold one dataframe per country key once the files are read.
dataframe_list = {}

### Read files

In [None]:
for country_key in country_keys:
    # Read every CSV report of the country and stack them in one dataframe.
    # The file names are sorted so that the row order is the same on every run.
    report_files = sorted(glob.glob(paths[country_key] + '/*.csv'))
    country_df = pd.concat([pd.read_csv(report_file) for report_file in report_files])

    # Format dates: bring every date string to the default format,
    # then convert the column to datetimes.
    date_column = wanted_columns[country_key][0]
    country_df[date_column] = np.array([sanitize_date(raw_date, date_original_formats[country_key])
                                        for raw_date in country_df[date_column]])
    country_df[date_column] = pd.to_datetime(country_df[date_column])

    # Order the dataframe by date. sort_values returns a new dataframe instead
    # of sorting in place, so the sorted result is the one that must be stored.
    dataframe_list[country_key] = country_df.sort_values(by=date_column)

### Parsing functions

The goal of the parsing process is to obtain a *standard* dataset for each country. We want to extract the data that is useful for our task while reorganizing the schema and data format so that they are coherent across countries.

***N.B.*** The resulting dataframes do not have their index reformatted. Since they are temporary results that will later be concatenated into a single dataframe, we decided to handle the indexing after that operation.

--

In order to parse the Guinea dataset we considered the variables ```New deaths registered``` (or ```New deaths registered today``` when it was used instead of the previous one) and ```Total new cases registered so far```.

In [None]:
def parse_guinea_files():
    '''
    Build the standardised dataframe of new cases and new deaths for Guinea.

    Keeps the wanted columns of the Guinea reports, selects the rows whose
    description is one of the variables of interest, replaces each
    description by the default 'new cases' / 'new deaths' label and adds
    the 'Country' column. The index is left untouched.
    '''
    # Description used in the reports -> default label of the row
    default_labels = {
        'Total new cases registered so far': DEFAULT_NAME_ROW_DESCRIPTION_NEW_CASES,
        'New deaths registered': DEFAULT_NAME_ROW_DESCRIPTION_NEW_DEATHS,
        'New deaths registered today': DEFAULT_NAME_ROW_DESCRIPTION_NEW_DEATHS,
    }

    # Keep the wanted columns, then only the rows with a wanted description
    guinea_df = dataframe_list['guinea'][wanted_columns['guinea']]
    selected_rows = guinea_df[guinea_df['Description'].isin(list(default_labels))]

    # Relabel the descriptions and add the country in a single new dataframe
    return selected_rows.assign(**{
        'Description': selected_rows['Description'].map(default_labels),
        DEFAULT_NAME_COLUMN_COUNTRY: countries['guinea'],
    })

Sample:

In [None]:
# Preview the first rows of the parsed Guinea dataframe.
parse_guinea_files().head()

In order to parse the Liberia dataset we considered the ```Newly reported deaths``` variable. To handle the new cases data we, on the other hand, had to do some manipulations.

The new cases come in three different variables (```New Case/s (Suspected)```, ```New Case/s (Probable)``` and ```New case/s (confirmed)```) that have to be summed up to obtain the total daily value.

The data comes with strangely large new cases values for the last days of the report. We concluded that those numbers couldn't be correct. As we found a correlation between those values and the total cases accumulator of the preceding days, we assumed that the data had been entered in the wrong part of the table. Some missing new cases data were therefore calculated from the daily difference of those total accumulators during the last month of the report.

In [None]:
def parse_liberia_files():
    '''
    Build the standardised dataframe of new cases and new deaths for Liberia.

    New cases are the per-day sum of the suspected, probable and confirmed
    'New Case/s' variables. For December the reported new cases are not used
    directly: the daily values are estimated from the differences of
    accumulated values (the 'Total ... cases' variables up to December 3rd,
    and the 'New Case/s' variables, treated as accumulators, afterwards).
    New deaths come from the 'Newly reported deaths' variable.

    Returns:
        A dataframe with the 'National', 'Date' and 'Variable' columns renamed
        to the default column names, plus the 'Country' column. The index is
        left untouched.
    '''
    # Select certain columns from the CSV formatted files
    liberia_df = dataframe_list['liberia'][wanted_columns['liberia']]

    new_case_variables = ['New Case/s (Suspected)',
                          'New Case/s (Probable)',
                          'New case/s (confirmed)']
    total_case_variables = ['Total suspected cases',
                            'Total probable cases',
                            'Total confirmed cases']
    new_deaths_variable = 'Newly reported deaths'

    is_new_case = liberia_df['Variable'].isin(new_case_variables)
    is_total_case = liberia_df['Variable'].isin(total_case_variables)
    in_december = liberia_df['Date'].dt.month == 12
    until_december_3 = liberia_df['Date'].dt.day <= 3

    # Accumulated cases per day in December. Only 'National' is summed, so the
    # text column 'Variable' never takes part in the aggregation.
    cum_until_december_3 = liberia_df[is_total_case & in_december & until_december_3]
    cum_until_december_3 = cum_until_december_3.groupby('Date')[['National']].sum()

    cum_after_december_3 = liberia_df[is_new_case & in_december & ~until_december_3]
    cum_after_december_3 = cum_after_december_3.groupby('Date')[['National']].sum()

    liberia_df_new_cases_cum = pd.concat([cum_until_december_3, cum_after_december_3])
    liberia_df_new_cases_cum['National'] = estimate_daily_from_cum(liberia_df_new_cases_cum['National'])

    # groupby moved 'Date' into the index. Bring it back as a column: otherwise
    # these rows get an empty 'Date' in the concatenation below and the final
    # groupby('Date') silently drops all the December values.
    liberia_df_new_cases_cum = liberia_df_new_cases_cum.reset_index()

    # New cases as reported, for every month but December
    liberia_df_new_cases = liberia_df[is_new_case & ~in_december]

    liberia_df_new_cases = pd.concat([liberia_df_new_cases[['Date', 'National']],
                                      liberia_df_new_cases_cum])

    # Sum all of the values for 'Suspected', 'Probable' and 'Confirmed' new cases
    liberia_df_new_cases = liberia_df_new_cases.groupby('Date')[['National']].sum()

    liberia_df_new_cases['Date'] = liberia_df_new_cases.index
    liberia_df_new_cases['Variable'] = DEFAULT_NAME_ROW_DESCRIPTION_NEW_CASES

    # Create temporary dataframe for new deaths entries (assign returns a new
    # dataframe, so the selection taken from liberia_df is not modified)
    liberia_df_new_deaths = liberia_df[liberia_df['Variable'] == new_deaths_variable]
    liberia_df_new_deaths = liberia_df_new_deaths.assign(Variable=DEFAULT_NAME_ROW_DESCRIPTION_NEW_DEATHS)

    # Create new dataframe with new deaths and cases (concatenating the two temporary)
    parsed_liberia_df = pd.concat([liberia_df_new_cases, liberia_df_new_deaths])

    parsed_liberia_df = parsed_liberia_df.rename(columns={'Date': DEFAULT_NAME_COLUMN_DATE,
                                                          'Variable': DEFAULT_NAME_COLUMN_DESCRIPTION,
                                                          'National': DEFAULT_NAME_COLUMN_TOTAL})

    # Add 'Country' column to dataframe
    parsed_liberia_df[DEFAULT_NAME_COLUMN_COUNTRY] = countries['liberia']

    return parsed_liberia_df

Sample:

In [None]:
# Preview the first rows of the parsed Liberia dataframe.
parse_liberia_files().head()

In order to parse the Sierra Leone dataset we followed similar steps as for Liberia. The new cases value has been computed as the sum of the different variables of the table (```new_noncase```, ```new_suspected```, ```new_probable``` and ```new_confirmed```).

Since the dataset only had total accumulators for the death variables, we calculated the new deaths as the difference between days, as previously done with Liberia. Possible missing data in this operation have been handled by setting the corresponding days to zero, while keeping the same delta between known points.

In [None]:
def parse_sierra_files():
    '''
    Build the standardised dataframe of new cases and new deaths for Sierra Leone.

    New cases are the per-day sum of the 'new_*' variables. New deaths are
    estimated from the per-day sum of the 'death_*' accumulators, through
    the differences between consecutive days. The columns are renamed to
    the default names and the 'Country' column is added. The index is left
    untouched.
    '''
    sierra_df = dataframe_list['sl'][wanted_columns['sl']]

    new_case_variables = ['new_noncase', 'new_suspected', 'new_probable', 'new_confirmed']
    death_accumulator_variables = ['death_confirmed', 'death_probable', 'death_suspected']

    def national_sum_per_date(variables):
        # Per-day sum of 'National' over the rows of the given variables.
        # Values are converted with float() and summed with numpy, so one
        # missing value makes the sum of that day NaN.
        rows = sierra_df[sierra_df['variable'].isin(variables)]
        summed = rows.groupby('date')['National'].apply(
            lambda values: np.array([float(value) for value in values]).sum()).to_frame()
        summed['date'] = summed.index
        return summed

    # New cases: sum of the 'noncase', 'suspected', 'probable' and 'confirmed' values
    sierra_df_new_cases = national_sum_per_date(new_case_variables)
    sierra_df_new_cases['variable'] = DEFAULT_NAME_ROW_DESCRIPTION_NEW_CASES

    # New deaths: sum the three accumulators, then turn the accumulated
    # values into daily values
    sierra_df_new_deaths = national_sum_per_date(death_accumulator_variables)
    sierra_df_new_deaths['National'] = estimate_daily_from_cum(sierra_df_new_deaths['National'])
    sierra_df_new_deaths['variable'] = DEFAULT_NAME_ROW_DESCRIPTION_NEW_DEATHS

    # Put new cases and new deaths together under the default column names
    default_column_names = {'date': DEFAULT_NAME_COLUMN_DATE,
                            'variable': DEFAULT_NAME_COLUMN_DESCRIPTION,
                            'National': DEFAULT_NAME_COLUMN_TOTAL}
    parsed_sierra_df = pd.concat([sierra_df_new_cases, sierra_df_new_deaths])
    parsed_sierra_df = parsed_sierra_df.rename(columns=default_column_names)

    # Add 'Country' column to dataframe
    parsed_sierra_df[DEFAULT_NAME_COLUMN_COUNTRY] = countries['sl']

    return parsed_sierra_df

Sample:

In [None]:
# Preview the first rows of the parsed Sierra Leone dataframe.
parse_sierra_files().head()

### Joining the parsed country datasets into a single one

In [None]:
# Stack the three parsed country dataframes and replace their indexes
# with a single fresh 0..n-1 index.
complete_df = pd.concat(
    [parse_guinea_files(), parse_liberia_files(), parse_sierra_files()],
    ignore_index=True)

Sample:

In [None]:
# Preview the first rows of the combined dataframe.
complete_df.head()

Even after handling some of the NaN issues during the parsing, we might still have some left to manage.

In [None]:
# Show the first rows whose value in the totals column is missing.
complete_df[complete_df[DEFAULT_NAME_COLUMN_TOTAL].isnull()].head()

### TODO

In [None]:
# Assume NaN values are 0
complete_df = complete_df.fillna(0)
complete_df.head()

We have seen that the difference in the number of deaths between 2014-09-30 and 2014-10-01 is negative. We decided to ignore this negative number of deaths, since it is ....

In [None]:
# TODO: Maybe remove the row with -18 deaths

### Calculate for each country, the daily average per month of new cases and deaths

In [None]:
# Group the rows by country, by kind of figure (new cases / new deaths)
# and by the calendar month of the report date.
report_months = [report_date.month for report_date in complete_df['Date']]
grouped = complete_df.groupby(by=[complete_df['Country'],
                                  complete_df['Description'],
                                  report_months])

In [None]:
# One record per (country, description, month) group. The records are
# collected in a list and the dataframe is built once at the end:
# DataFrame.append is gone from recent pandas and was quadratic in a loop.
average_records = []

for (country, description, month), values in grouped:
    # NOTE(review): the year is fixed to 2014, which assumes that every
    # report date belongs to 2014 - confirm against the data.
    days_in_month = monthrange(2014, month)[1]

    # Daily average of the month: the values of the month are divided by
    # the number of days of the month (not by the number of reports).
    daily_average = values[DEFAULT_NAME_COLUMN_TOTAL].astype(float).div(days_in_month).sum()

    average_records.append({'Country': country,
                            'Description': description,
                            'Month': month,
                            'Average': daily_average})

# Explicit columns keep the schema stable even when there are no groups.
calculated_average_df = pd.DataFrame(average_records,
                                     columns=['Country', 'Description', 'Month', 'Average'])

In [None]:
# Split the averages in two tables, one per kind of figure
is_new_deaths = calculated_average_df['Description'] == DEFAULT_NAME_ROW_DESCRIPTION_NEW_DEATHS
is_new_cases = calculated_average_df['Description'] == DEFAULT_NAME_ROW_DESCRIPTION_NEW_CASES
death_average_df = calculated_average_df[is_new_deaths]
cases_average_df = calculated_average_df[is_new_cases]

# Put deaths and cases side by side for each (country, month): the '_x'
# columns come from the deaths table and the '_y' columns from the cases table
calculated_average_df = (
    pd.merge(death_average_df, cases_average_df, on=['Country', 'Month'])
    .drop(['Description_x', 'Description_y'], axis=1)
    .rename(columns={'Average_x': 'Death monthly average',
                     'Average_y': 'Cases monthly average'})
)

The final result is shown in the following table. For each country we calculated the average for the new deaths entries as well as for the new cases entries.

In [None]:
# Display the table of the averages per country and month.
calculated_average_df

### Plots

We plotted the results for a final visualisation of the obtained data. First by country alone in the following bar charts

In [None]:
# One bar chart per country, with the months on the x axis
for country_name, country_averages in calculated_average_df.groupby('Country'):
    country_averages.plot.bar(x='Month', title=country_name, figsize=(15, 5))

And lastly as a single bar chart with all the results in one plot

In [None]:
# The `x` argument of DataFrame.plot takes a single label or position, not a
# list of columns. The (month, country) pair is therefore moved to the index,
# which the bar plot uses to label each group of bars.
calculated_average_df.set_index(['Month', 'Country']).plot.bar(figsize=(15, 5))

## Task 2. RNA Sequences

In the `DATA_FOLDER/microbiome` subdirectory, there are 9 spreadsheets of microbiome data that was acquired from high-throughput RNA sequencing procedures, along with a 10<sup>th</sup> file that describes the content of each. 

Use pandas to import the first 9 spreadsheets into a single `DataFrame`.
Then, add the metadata information from the 10<sup>th</sup> spreadsheet as columns in the combined `DataFrame`.
Make sure that the final `DataFrame` has a unique index and all the `NaN` values have been replaced by the tag `unknown`.

### Read files

We started by importing the nine tables in one single big dataframe

In [None]:
# The file names are sorted so that the row order is the same on every run.
all_files = sorted(glob.glob(DATA_FOLDER + '/microbiome' + '/MID*.xls'))

# The spreadsheets are collected in a list and concatenated once:
# DataFrame.append is gone from recent pandas and was quadratic in a loop.
sample_frames = []

for file in all_files:
    # The barcode is the file name without its folder and extension.
    # os.path handles both '/' and '\' separators, unlike a search for '/'.
    file_name = os.path.splitext(os.path.basename(file))[0]
    temp_df = pd.read_excel(file, header=None)
    temp_df['BARCODE'] = file_name
    sample_frames.append(temp_df)

# pd.concat rejects an empty list, so fall back to an empty dataframe.
raw_df = pd.concat(sample_frames) if sample_frames else pd.DataFrame()

raw_df.head()

We then imported the metadata table into another dataframe

In [None]:
# Read the metadata spreadsheet and display it. Its BARCODE, GROUP and
# SAMPLE columns are the ones used by the cells below.
metadata_df = pd.read_excel(DATA_FOLDER+'/microbiome'+'/metadata.xls')
metadata_df

### Merging the dataframes

The final goal is to obtain a single dataframe for the given data. We will need to obtain a table of the following format:

| Description | Group | Tissue | Stool | Other |
|-------------|-------|--------|-------|-------|
|             |       |        |       |       |
|             |       |        |       |       |
|             |       |        |       |       |


We will fill in the last three columns with the sample data we have in the given dataframes. The column will be chosen according to what the metadata provides. The same idea will be followed to fill in the Group column with the correct information.

We create three dataframes, each containing the metadata information for a single sample type

In [None]:
# Split the metadata by sample type; rows without a sample type go to a third table
sample_type = metadata_df['SAMPLE']
stool_metadata_df = metadata_df[sample_type.eq('stool')]
tissue_metadata_df = metadata_df[sample_type.eq('tissue')]
na_metadata_df = metadata_df[sample_type.isnull()]

Sample:

In [None]:
# Display the metadata rows of the tissue samples.
tissue_metadata_df

We create three dataframes for the different samples according to what we have in the metadata dataframes obtained before

In [None]:
# Attach the group of each barcode to the raw rows and name the two data columns
raw_df = (pd.merge(raw_df, metadata_df[['BARCODE', 'GROUP']], on=['BARCODE'])
          .rename(columns={0: 'Description', 1: 'Sample'}))


def _samples_of(sample_metadata_df, value_column):
    # Rows of raw_df whose barcode appears in the given metadata table,
    # without the barcode and with 'Sample' renamed to `value_column`.
    barcodes = list(sample_metadata_df['BARCODE'])
    selected = raw_df[raw_df['BARCODE'].isin(barcodes)].drop('BARCODE', axis=1)
    return selected.rename(columns={'Sample': value_column})


stool_df = _samples_of(stool_metadata_df, 'Stool')
tissue_df = _samples_of(tissue_metadata_df, 'Tissue')
na_df = _samples_of(na_metadata_df, 'Other')

Sample:

In [None]:
# Preview the first rows of the stool samples dataframe.
stool_df.head()

We merge those three temporary dataframes into a single one

In [None]:
# Outer merges on the description and the group: a row is kept even when it
# is present in only one of the three tables
merge_keys = ['Description', 'GROUP']
merged_df = (stool_df
             .merge(tissue_df, how='outer', on=merge_keys)
             .merge(na_df, how='outer', on=merge_keys))
merged_df.head()

To complete the task, we fill the NaN values of the dataset with the 'unknown' value and we clean up the general format of the table

In [None]:
# Fill of the 'unknown' value for NaNs
merged_df.fillna('unknown', inplace=True)

# Final cleaning up of the table schema
merged_df.rename(columns={'GROUP':'Group'}, inplace=True)

merged_df.head()

## Task 3. Class War in Titanic

Use pandas to import the data file `Data/titanic.xls`. It contains data on all the passengers that travelled on the Titanic.

In [None]:
# Render the HTML description of the Titanic dataset shipped with the data.
# HTML is imported from IPython.display, the public display API.
from IPython.display import HTML
HTML(filename=DATA_FOLDER + '/titanic.html')

For each of the following questions state clearly your assumptions and discuss your findings:
1. Describe the *type* and the *value range* of each attribute. Indicate and transform the attributes that can be `Categorical`. 
2. Plot histograms for the *travel class*, *embarkation port*, *sex* and *age* attributes. For the latter one, use *discrete decade intervals*. 
3. Calculate the proportion of passengers by *cabin floor*. Present your results in a *pie chart*.
4. For each *travel class*, calculate the proportion of the passengers that survived. Present your results in *pie charts*.
5. Calculate the proportion of the passengers that survived by *travel class* and *sex*. Present your results in *a single histogram*.
6. Create 2 equally populated *age categories* and calculate survival proportions by *age category*, *travel class* and *sex*. Present your results in a `DataFrame` with unique index.

In [None]:
# Write your answer here