# Data Extraction
This notebook is a sandbox for extracting all of the data from the CSV files and converting it to a more useful object, such as a pandas DataFrame or JSON.

## Imports

In [1]:
import pandas as pd
import os
import sys

## Read in the Data

In [2]:
def get_file_list(data_directory):
    '''
    Return a sorted list of the paths of all the csvs within a directory

    The tree rooted at data_directory is walked recursively and every file
    whose name ends in '.csv' (case-sensitive) is collected. Each path is
    built with os.path.join(root, name), so it starts with data_directory
    exactly as it was passed in.

    An empty list is returned when no csv is found; this includes a
    data_directory that does not exist, because os.walk ignores listing
    errors by default.
    '''
    file_list = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(data_directory)
        for name in files
        if name.endswith('.csv')
    ]
    # os.walk yields entries in whatever order the file system lists them;
    # sorting makes the result (and anything built from it) reproducible.
    return sorted(file_list)

In [3]:
def parse_file_name(file_name):
    '''
    Given a csv's file name, extract the building, floor, room and room_type

    The name (any leading directories are ignored) is expected to look like
        <date>_if_us_dc_<building>_<floor>_<room>[-<room_type>].csv
    where <date> is everything before the first underscore.

    Returns a tuple (building, floor, room, room_type). All values are
    strings, except room_type which is None when the room part contains
    no '-'. If the room part contains several '-', only the text between
    the first and the second one is used as room_type.

    Raises ValueError when the name does not split into exactly three
    underscore-separated fields after the date and the '_if_us_dc_' marker
    have been removed.
    '''
    # basename understands the path separator of the running platform; the
    # paths handed in are built with os.path.join, so splitting on "/" alone
    # would keep the directory part on Windows.
    file = os.path.basename(file_name)
    date = file.split("_")[0]
    location_info = file.replace('.csv', '').replace(date, '').replace('_if_us_dc_', '')

    fields = location_info.split("_")
    if len(fields) != 3:
        raise ValueError(
            'Cannot parse building, floor and room from file name {!r}'.format(file_name)
        )
    building, floor, room_and_type = fields

    if '-' in room_and_type:
        room_and_type_split = room_and_type.split("-")
        room = room_and_type_split[0]
        room_type = room_and_type_split[1]
    else:
        room = room_and_type
        room_type = None

    return building, floor, room, room_type



In [17]:
def csv_to_df(file_name):
    '''
    Given the path to a csv, extract location data from the file name
    and return that along with the data as a pandas DataFrame

    The csv is read with its first column parsed as dates and with float
    requested as the dtype for the data columns. The first column is then
    renamed to 'Date Time' when it arrives as 'Unnamed: 0' or 'datetime';
    a column under any other name is left untouched.

    Four columns are added, each holding one constant value taken from
    parse_file_name(file_name): 'building', 'floor', 'room' and
    'room_type' (None when the file name carries no room type).
    '''
    df = pd.read_csv(file_name, parse_dates=[0], dtype='float')

    # Give the timestamp column one common name so frames from different
    # files line up when they are combined.
    df = df.rename(columns={'Unnamed: 0': 'Date Time', 'datetime': 'Date Time'})

    building, floor, room, room_type = parse_file_name(file_name)
    df['building'] = building
    df['floor'] = floor
    df['room'] = room
    df['room_type'] = room_type

    return df
    

In [5]:
def make_final_df(data_directory='data'):
    '''
    Load every csv under data_directory into one combined DataFrame

    data_directory defaults to 'data', the directory this notebook reads
    from. Each file found by get_file_list is loaded with csv_to_df and
    the frames are stacked with a fresh 0..n-1 index (ignore_index=True).
    sort=True lets pandas sort the column axis when the files do not share
    the same columns; values for a column a file lacks are missing (NaN).

    Raises ValueError when no csv is found under data_directory.
    '''
    file_list = get_file_list(data_directory)
    if not file_list:
        # pd.concat would fail with "No objects to concatenate"; say which
        # directory was searched instead.
        raise ValueError('No csv files found under {!r}'.format(data_directory))

    dfs = [csv_to_df(f) for f in file_list]
    final_df = pd.concat(dfs, ignore_index=True, sort=True)

    return final_df

In [18]:
# Build the combined DataFrame from every csv that make_final_df() finds under the 'data' directory
final_df = make_final_df()

In [24]:
# Preview the first rows of the combined DataFrame
final_df.head()

Unnamed: 0,CO2__ppm,CO__ppm,Date Time,Humidity__pct,PN1Plus__counts_per_L,Pressure__mbar,Temperature__deg_C,building,floor,humidity_2_rel_true_pct_avg,light_lux,room,room_type,sound_dba_avg,temperature_2_celsius_avg,vibration_ms2_avg
0,533.073,0.827,2017-02-24 00:00:00,47.583,130.622,1008.922,22.788,1800f,2,49.143,3.679,2462,,46.562,22.755,0.102
1,530.667,0.831,2017-02-24 00:01:00,47.618,,1008.913,22.786,1800f,2,49.162,3.756,2462,,46.587,22.75,0.103
2,522.754,0.83,2017-02-24 00:02:00,47.626,,1008.925,22.788,1800f,2,49.185,4.072,2462,,46.587,22.75,0.104
3,527.022,0.826,2017-02-24 00:03:00,47.656,,1008.947,22.787,1800f,2,49.197,4.065,2462,,46.57,22.747,0.104
4,529.83,0.827,2017-02-24 00:04:00,47.633,,1008.968,22.781,1800f,2,49.193,4.062,2462,,46.571,22.741,0.103


# Transform the Data