In [1]:
# matplotlib is the basic plotting (graphical) library; it is very old and powerful, but
# sometimes also too complicated.
%matplotlib inline 
### this is a Jupyter "magic" command that lets matplotlib add plots (graphics) to the notebook
### if you want, try `%matplotlib notebook` instead; it will add some interactivity to the plots

In [4]:
import pylab as plt
### actually, pylab is also a part of matplotlib, but this time we import it as an object to work with.

In [2]:
## let's import pandas
## we use "pd" as an alias - you don't have to, but it is a popular convention, and also much faster to write
## you can use one of these instead:
## import pandas
## import pandas as whatever_I_want
import pandas as pd

# by the way, all of the above is common "boilerplate code", by which I mean a standard piece of code
# that you can find everywhere, sometimes even if I don't need those modules

## Reading data from CSV

Usually you start by reading the data into a DataFrame (which represents a table, in "Excel" terms).
Let's try CSV first.

These are the first 30 rows (plus the header) of 201607-citibike-tripdata.csv, which you can find [here](https://s3.amazonaws.com/tripdata/index.html).
By the way, the sample file was created with the following bash command (note that its name says "201601" even though the rows come from the July 2016 file):
`head -n 31 201607-citibike-tripdata.csv >> 201601-citibike-tripdata_first_30.csv`

In [5]:
# Read the 30-row Citi Bike sample CSV into a DataFrame.
df = pd.read_csv('../l2/data/201601-citibike-tripdata_first_30.csv') # the file sits in the l2/data folder

In [6]:
# How many rows does the dataset contain?
# len() of a DataFrame is its number of rows.
print('There are {} rows in the dataset'.format(len(df)))

There are 30 rows in the dataset


In [7]:
# How many columns does the dataset contain?
# df.columns holds the column labels, so its length is the number of columns.
print('There are {} columns in the dataset'.format(len(df.columns)))

There are 15 columns in the dataset


In [8]:
## Can I get both at the same time?
## Yes: df.shape is a (rows, columns) tuple.
df.shape

(30, 15)

In [11]:
# Can I take a look at the dataset?
# head(n) shows the first n rows of the DataFrame.
df.head(2) # number of rows to show

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,704,7/1/2016 00:00:02,7/1/2016 00:11:47,459,W 20 St & 11 Ave,40.746745,-74.007756,347,Greenwich St & W Houston St,40.728846,-74.008591,17431,Customer,,0
1,492,7/1/2016 00:00:18,7/1/2016 00:08:31,293,Lafayette St & E 8 St,40.730287,-73.990765,466,W 25 St & 6 Ave,40.743954,-73.991449,24159,Subscriber,1984.0,1


In [12]:
# And the tail? tail(n) shows the last n rows of the DataFrame.
df.tail(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
28,209,7/1/2016 00:02:31,7/1/2016 00:06:01,410,Suffolk St & Stanton St,40.720664,-73.98518,473,Rivington St & Chrystie St,40.721101,-73.991925,25845,Subscriber,1984.0,1
29,492,7/1/2016 00:02:32,7/1/2016 00:10:44,481,S 3 St & Bedford Ave,40.712605,-73.962644,3109,Banker St & Meserole Ave,40.72606,-73.95621,23648,Subscriber,1991.0,2


## Excel

In [None]:
# you can read Excel files with a similar function:
# df = pd.read_excel('path')

## Reading from the web

In [14]:
# you can also read data from the web - just replace the path with the URL.
# this is useful if you want to get the most recent data.
# for example, you can keep the data in a Google spreadsheet,
# publish it as CSV (File > Publish), and then load it as an ordinary CSV. Take a look:

![](https://www.dropbox.com/s/m8ib8fytvzv3r87/Screenshot%202017-03-25%2020.12.00.png?raw=1)

In [15]:
## Reading from the web: pd.read_csv is given a URL instead of a local file path.
## The link below requests the spreadsheet as CSV (note `output=csv` in the URL).
link = 'https://docs.google.com/spreadsheets/d/1L6pro6_4y5hT7SybHnbnh0YZYFjLLTVANxXY7ws_gf4/pub?gid=0&single=true&output=csv'
cities = pd.read_csv(link)

In [16]:
# Display the table that was just loaded from the web.
cities

Unnamed: 0,City,population,Country,Capital
0,Moscow,11000000.0,Russia,True
1,New York,8000000.0,US,False
2,Washington D.C.,653000.0,US,True
3,New Delhi,18980000.0,India,True


## Reading from Json

Now, you CAN read data from JSON into a DataFrame, but there are limitations:
the DataFrame structure works only with information that can be viewed as a table (think of a list of similarly structured objects; each of them will then be presented as a row in the table).



Let me use a live stream of data from NYC Citibike.

In [31]:
link = 'https://gbfs.citibikenyc.com/gbfs/en/station_status.json'
# Here we have to drill down to "stations" to get table-like data, so we can't
# simply pass the link to pandas.

In [39]:
from urllib import request # reads data from the url
import json # parses and stores json data

In [40]:
# Download the feed and parse it as JSON.
# The `with` block closes the HTTP response once the payload has been read;
# the original one-liner left the connection open.
with request.urlopen(link) as response:
    data = json.load(response)

# Top-level keys of the parsed document.
data.keys()

dict_keys(['last_updated', 'ttl', 'data'])

In [42]:
# The list of station records sits under data -> stations in the parsed JSON.
print('There are {} stations in the dataset'.format(len(data['data']['stations'])))

There are 664 stations in the dataset


In [43]:
# Peek at the first station record to see which fields each one carries.
data['data']['stations'][0]

{'eightd_has_available_keys': False,
 'is_installed': 1,
 'is_renting': 1,
 'is_returning': 1,
 'last_reported': 1490486677,
 'num_bikes_available': 13,
 'num_bikes_disabled': 0,
 'num_docks_available': 26,
 'num_docks_disabled': 0,
 'station_id': '72'}

In [44]:
# Now we already have the data - as a list of dictionaries.
# Let's convert it to a DataFrame: each dictionary becomes one row.
stations = pd.DataFrame(data['data']['stations'])

In [45]:
# Show the first three station rows.
stations.head(3)

Unnamed: 0,eightd_has_available_keys,is_installed,is_renting,is_returning,last_reported,num_bikes_available,num_bikes_disabled,num_docks_available,num_docks_disabled,station_id
0,False,1,1,1,1490486677,13,0,26,0,72
1,False,1,1,1,1490486168,31,1,1,0,79
2,False,1,1,1,1490487357,3,0,24,0,82


## Reading from the database

Now, pandas makes it very easy to get data from a database.
In this case we will use the CartoDB service as a web-based PostgreSQL database, using a special wrapper function.
It is not exactly how you usually access databases, but it is pretty similar.

In [108]:
from urllib.parse import urlencode # that converts dictionary to the set of url-encoded params


def myQueryCartoDB(query='SELECT * FROM nyc_discharge_2013 LIMIT 3', user='casyfill'):
    """Run an SQL query against a CartoDB account and return the result as a DataFrame.

    The query is URL-encoded and appended to the account's SQL API endpoint
    with CSV requested as the output format; the resulting URL is handed to
    ``pd.read_csv``. The dataset needs to be "public".

    Parameters
    ----------
    query : str
        SQL statement to execute.
    user : str
        CartoDB account name; it is used as the subdomain of the API URL.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the response.
    """
    endpoint = 'http://{user}.cartodb.com/api/v2/sql'.format(user=user)
    # we will discuss other formats in a while
    query_string = urlencode({'q': query, 'format': 'CSV'})
    return pd.read_csv(endpoint + "?" + query_string)

In [109]:
# Query the `mosplus01` table of the given CartoDB account: five columns, at most four rows.
# Note that this rebinds `df`, which until now held the Citi Bike sample.
df = myQueryCartoDB(query = 'SELECT adres, admarea, global_id,  soor, vyvad from mosplus01 LIMIT 4', 
                    user='pbk236') # my NYU login

In [110]:
# Display the query result.
df

Unnamed: 0,adres,admarea,global_id,soor,vyvad
0,"2-я улица Синичкина, дом 4, строение 13",Юго-Восточный административный округ,21041147,Строение,адрес утвержден распорядительным документом
1,"улица Лётчика Бабушкина, владение 21А",Северо-Восточный административный округ,20840549,Строение,адрес утвержден распорядительным документом
2,"улица Достоевского, дом 23",Центральный административный округ,21194382,Строение,адрес утвержден распорядительным документом
3,"улица Шумкина, дом 5",Восточный административный округ,20885523,Строение,адрес утвержден распорядительным документом


# Storing the data

In [111]:
# to CSV: write the DataFrame as a text file in the l2/data folder
df.to_csv('../l2/data/3_buildings.csv')

In [113]:
# to Excel: write the DataFrame as an .xlsx workbook in the l2/data folder
df.to_excel('../l2/data/3_buildings.xlsx')

In [114]:
# to a binary pickle file
# (`DataFrame.to_msgpack` was deprecated in pandas 0.25 and removed in pandas 1.0, so it
# raises AttributeError on current versions; pickle is the built-in binary format that
# also stores the types of the variables, including datetime)
# NOTE: read the file back with pd.read_pickle, and only unpickle files from a trusted source.
df.to_pickle('../l2/data/3_buildings.pkl')