In [84]:
#:root { --fs1: 14px; --fs2: 20px; --fs3: 40px}
#:root { --fs1: 80px; --fs2: 40px; --fs3: 30px; --fsp: 25px}

from IPython.core.display import HTML
HTML("""
<style>
:root { --fs1: 80px; --fs2: 40px; --fs3: 30px; --fsp: 25px}
h1 {font-size: var(--fs1) !important; text-align: center; font-family: Palatino Linotype;}
h2 {font-size: var(--fs2) !important; text-align: center; font-family: Palatino Linotype;}
h3 {font-size: var(--fs3) !important; text-align: center; font-family: Palatino Linotype;}
p {font-size: var(--fsp); line-height:125%;}
ul {font-size: var(--fsp); line-height:130%;}
.CodeMirror {font-size: var(--fsp); line-height:125%;}
.output {font-size: var(--fsp); line-height:125%;}
.alert {font-size: var(--fsp); line-height:125%;}
</style>
""") 

<br><br><br><br><br>
<h1> Data Wrangling </h1>

<br><br><br>

<h2> Thomas Donoghue </h2>
<h2> COGS 108 - April 14th, 2017 </h2>
<br><br><br>

<br><br><br><br>

## High Level Learning Goals
<br>
- Notice that data is everywhere, waiting for analysis. 
<br>
- However, dealing with data is mostly 'administrative'.

<br><br><br><br>

## How are we going to do this:
<br>
- How do you organize data into a useable format (for a given project / question)?
- A crash course on file types, data formats, databases, and APIs
<br><br><br><br><br><br>

## Data Sources
<br>
- Files
- Databases
- Web Scraping & APIs
<br>
<hr>

## Friendly File Types:
- csv
- tsv
- json
- txt
- xml

## 'Unfriendly' File Types:
- pdf
- docx
- html
- Anything made to look nice for humans

In [95]:
# Pandas
import pandas as pd

### CSV Files

In [6]:
!cat files/dat.csv

1, 2, 3, 4,
2, 6, 7, 8,
9, 10, 11, 12,

In [109]:
#
import csv

In [110]:
with open('files/dat.csv') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    for row in csv_reader:
        print(', '.join(row))

1,  2,  3,  4
2,  6,  7,  8
9,  10,  11,  12


In [106]:
pd.read_csv?

In [114]:
pd.read_csv(open('files/dat.csv'))

Unnamed: 0,1,2,3,4
0,2,6,7,8
1,9,10,11,12


### JSON

In [11]:
!cat files/dat.json

{
  "firstName": "John",
  "age": 53
}


In [10]:
# Think of json's as similar to dictionaries
d = {'firstName': 'John', 'age': '53'}
print(d)

{'firstName': 'John', 'age': '53'}


In [115]:
import json

In [118]:
# Load a json file
with open('files/dat.json') as dat_file:    
    dat = json.load(dat_file)

In [120]:
print(type(dat))

<class 'dict'>


In [122]:
pd.read_json?

In [126]:
pd.read_json('{ "first": "Alan", "place": "Manchester"}', typ='series')

first          Alan
place    Manchester
dtype: object

In [127]:
pd.read_json(open('files/dat.json'), typ='series')

age            53
firstName    John
dtype: object

### XML

In [139]:
!cat files/dat.xml

<person>
	<who>Claude</who>
	<what>Info</who>
	<when>50s</when>
</person>

In [141]:
# 
with open('files/dat.xml') as dat_file:
    dat = dat_file.read()

In [142]:
dat

'<person>\n\t<who>Claude</who>\n\t<what>Info</who>\n\t<when>50s</when>\n</person>'

In [144]:
from bs4 import BeautifulSoup
nice_dat = BeautifulSoup(dat, 'xml')

In [145]:
nice_dat

<?xml version="1.0" encoding="utf-8"?>
<person>
<who>Claude</who>
<what>Info</what>
<when>50s</when>
</person>

### Text Files

In [22]:
!cat files/dat.txt

This is an unstructured text file.
    It can have all sort
        of
            stuff in it.


Super duper.

### PDFs

In [105]:
!cat files/dat.pdf

%PDF-1.3
%���������
4 0 obj
<< /Length 5 0 R /Filter /FlateDecode >>
stream
x]�=�0����ǭL�h2���ĩحB����"�wܽ�so�#�syc	��~r�E{����k��5�l�QYG���N9��*�jE�l-dS�h���p�l���]��?��â���X���"%�ܩ�[�,��!'�21o6���>/�1�
endstream
endobj
5 0 obj
153
endobj
2 0 obj
<< /Type /Page /Parent 3 0 R /Resources 6 0 R /Contents 4 0 R /MediaBox [0 0 612 792]
>>
endobj
6 0 obj
<< /ProcSet [ /PDF /Text ] /ColorSpace << /Cs1 7 0 R >> /Font << /TT1 8 0 R
>> >>
endobj
9 0 obj
<< /Length 10 0 R /N 3 /Alternate /DeviceRGB /Filter /FlateDecode >>
stream
x��wTS��Ͻ7��" %�z	 �;HQ�I�P��&vDF)VdT�G�"cE��b�	�P��QDE�݌k	�5�ޚ��Y�����g�}׺ P���tX�4�X���\���X��ffG�D���=���HƳ��.�d��,�P&s���"7C$ 
E�6<~&��S��2����)2�12�	��"�įl���+�ɘ�&�Y��4���Pޚ%ᣌ�\�%�g�|e�TI� ��(����L 0�_��&�l�2E�� ��9�r��9h� x�g��Ib�טi���f��S�b1+��M�xL����0��o�E%Ym�h�����Y��h����~S�=�z�U�&�ϞA��Y�l�/� �$Z����U �m@��O�  � �ޜ��l^���'���ls�k.+�7���oʿ�9�����V;�?�#I3eE妧�KD

In [7]:
from IPython.display import IFrame
IFrame("./files/dat.pdf", width=750, height=300)

<br><br><br><br><br><br><br><br><br>

# Databases
<br>
<div class="alert alert-success">
A database is an organized collection of data. More formally, 'database' refers to a set of related data, and the way it is organized. 
</div>



<br>
<img src="img/sql.png" alt="sql" height="400" width="400">

## Application Program Interface (APIs)

Notes on APIs:
- Follow API guidelines! 
    - These guidelines typically specify the number / rate / size of requests

<br>
<img src="img/pubmed.png" alt="sql" height="400" width="400">

In [92]:
# Pubmed

<br>
<img src="img/twitter.png" alt="sql" height="400" width="400">

In [89]:
# Accessing Twitter API from Python

# Import tweepy to access API
import tweepy
from tweepy import OAuthHandler

# Import my API credentials
from stw import *

# Twitter API requires Authentification with OAuth
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

# Create an API object to access Twitter
api = tweepy.API(auth)

for status in tweepy.Cursor(api.home_timeline).items(3):
    # Process a single status
    print(status.user.name)
    print(status.text, '\n') 

Laverne Cox
Congrats @candiscayne. #Amazing https://t.co/ppkpX47mXT 

AsapSCIENCE
For more visualizations and to learn more visit: https://t.co/D2rgDM7bK5 https://t.co/p0vDPirpQL 

Shit Academics Say
RT @AcademicsSay: "What it feels like to start a manuscript from scratch" - @stephenaguilar https://t.co/QpQBBEymbf 



In [87]:
api?

In [121]:
# Authorized Access - OAuth

In [None]:
# Twitter

<br>
<img src="img/github.png" alt="sql" height="400" width="400">

In [19]:
# Github
https://api.github.com/users/tomdonoghue

SyntaxError: invalid syntax (<ipython-input-19-811718d31541>, line 2)

In [26]:
import requests
from bs4 import BeautifulSoup

In [27]:
page = requests.get('https://api.github.com/users/tomdonoghue')

In [28]:
BeautifulSoup?
#page.content

In [2]:
# Web Scraping vs. APIs

In [33]:
# Unstructured Text / Data
!cat files/00000201_eg.txt

CLINICAL HISTORY:  This is a 75-year-old female with history of bilateral MCA stroke who presents with staring spells concerned for complex partial seizures.

MEDICATIONS:  Levetiracetam, gabapentin, metoprolol, atorvastatin.

INTRODUCTION:  Digital video EEG was performed at the bedside using standard 10/20 system of electrode placement with 1 channel EKG.  Hyperventilation and photic stimulation were not performed.

DESCRIPTION OF THE RECORD:  This is a technically limited EEG with muscle artifact throughout the recording.  There was no cerebral waveform activity were detectable.  Heart rate was 60 beats per minute and regular.

IMPRESSION:  This is a technically limited EEG due to diffuse muscle artifact throughout the recording.

CLINICAL IMPRESSION:  The EEG was not readable.  A repeat EEG is recommended.





In [77]:
labels = []
dat = []
ser = pd.Series()
ser

Series([], dtype: float64)

In [78]:
df = pd.DataFrame()
df

In [81]:
#df.loc[]

In [83]:
#good_dat = df.loc[df['IsDataReadable'] == True]
#print(good_dat)

## Data Wrangling: How did we get here?

### Data Science is Ad-Hoc
- It is part of the job description to put things together that were not designed to go together.
- We do not have universal solutions, but haphazard, idiosyncratic systems, for data collection, storage and analysis.
- Data is everywhere. But relatively little of it was collected *as data*.

### Data is Inherently Noisy

- We live in a messy, noisy, world, with messy, noisy, people, using messy, noisy instruments.

- Real world data means you are probing a system with 

### Different Objectives

- Humans and computers are different.
- We interact with '*data*' in different ways.


# So?

- Consider ecological validity (does your data map to the real world)
    - This likely scales with your data wrangling & cleaning issues
- Consider 
- Develop a theory of mind for {computers, python}

## Context
- What is the context of the question?
- What is the contex the data?

<img src="img/data_gov.png" alt="gov_dat" height="500" width="750">

![]()

<img src="img/sd_data.png" alt="sd_dat" height="500" width="750">

In [None]:
# Scraping & Parsing 

Extracting data from pdf's: tabula
- https://github.com/tabulapdf/tabula

In [20]:
# ETL - Extract, Transform, Load

Object `bs4` not found.
