# Data Loading and Storage & File Formats 

In [2]:
# Python has become a beloved language for text and file munging due to its simple syntax
# for interacting with files, intuitive data structures, and convenient features like tuple
# packing and unpacking. 
# Function Description
# read_csv >>>>> Load delimited data from a file, URL, or file-like object. Use comma as default delimiter
# read_table >>>>> Load delimited data from a file, URL, or file-like object. Use tab ('\t') as default delimiter
# read_fwf >>>>>> Read data in fixed-width column format (that is, no delimiters)
# read_clipboard >>>>> Version of read_table that reads data from the clipboard. Useful for converting tables from web pages 

In [5]:
import pandas as pd
# df = pd.read_csv('examples/ex1.csv') # read_csv 

In [1]:
# We could also have used read_table and specified the delimiter: 
# df = pd.read_table('examples/ex1.csv', sep=',') # read_table

In [3]:
# A file will not always have a header row. Consider this file: 
# To read this file, you have a couple of options. You can allow pandas to assign default
# column names, or you can specify names yourself:
# df = pd.read_csv('examples/ex2.csv', header=None) # read_csv

In [4]:
# pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message']) # read_csv

In [5]:
# Suppose you wanted the message column to be the index of the returned DataFrame. 
# You can either indicate you want the column at index 4 or the column named 'message' using the index_col argument: 
names = ['a', 'b', 'c', 'd', 'message'] 
# df = pd.read_csv('examples/ex2.csv', names=names, index_col='message') # read_csv with names and index_col

In [6]:
# In the event that you want to form a hierarchical index from multiple columns, pass a list of column numbers or names:
# parsed = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2']) # read_csv with hierarchical index 

In [7]:
# In some cases a table might not have a fixed delimiter, using whitespace or some other pattern to separate fields.
# In these cases, you can pass a regular expression as a delimiter for read_table.
# Consider a text file that looks like this: 
# list(open('examples/ex3.txt')) # read_table with regular expression as delimiter 

In [8]:
# While you could do some munging by hand, the fields are separated by a variable amount of whitespace. 
# In these cases, you can pass a regular expression as a delimiter for read_table. 
# This can be expressed by the regular expression \s+, so we have then: 
# result = pd.read_table('examples/ex3.txt', sep=r'\s+') # read_table with regular expression (raw string) as delimiter 

In [9]:
# Because there was one fewer column name than the number of columns in the data rows, 
# read_table infers that the first column should be the DataFrame's index in this special case. 
# You can also choose the index columns explicitly with the index_col argument; passing a list of
# column positions, as below, builds a hierarchical index from those columns: 
# result = pd.read_table('examples/ex3.txt', sep=r'\s+', index_col=[0, 1]) 
# read_table with regular expression as delimiter and hierarchical index  

In [10]:
# skiprows allows you to skip over rows, either a number of rows at the beginning of the file
# or rows at specific indices (rows at the end of the file are skipped with skipfooter instead).
# For example, we might want to skip the first, third, and fourth rows of a file like this:
# list(open('examples/ex4.csv')) # read_csv with skiprows
# df = pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3]) # read_csv with skiprows 

In [11]:
# Handling missing values is an important and frequently nuanced part of the file parsing process. 
# Missing data is usually either not present (empty string) or marked by some sentinel value.
# Most tabular data formats represent missing data as NA:
# result = pd.read_csv('examples/ex5.csv') # read_csv with missing values 
# result = pd.read_csv('examples/ex5.csv', na_values=['NULL']) # read_csv with missing values and na_values


In [12]:
# Different NA sentinels can be specified for each column in a dict: 
# sentinels = {'message': ['foo', 'NA'], 'something': ['two']} 
# result = pd.read_csv('examples/ex5.csv', na_values=sentinels) # read_csv with missing values and na_values in dict 

In [13]:
# # Argument Description
# path >>>> String indicating filesystem location, URL, or file-like object
# sep or delimiter >>>>> Character sequence or regular expression to use to split fields in each row
# header >>>>> Row number to use as column names. Defaults to 0 (first row), but should be None if there is no header row
# index_col >>>> Column numbers or names to use as the row index in the result. Can be a single name/number or a list 
# of them for a hierarchical index

# names >>>>> List of column names for result, combine with header=None
# skiprows >>>>>> Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip
# na_values >>>>>> Sequence of values to replace with NA
# comment >>>>> Character or characters to split comments off the end of lines
# parse_dates >>> Attempt to parse data to datetime; False by default. If True, will attempt to parse the index. Otherwise
# can specify a list of column numbers or name to parse. If element of list is tuple or list, will combine
# multiple columns together and parse to date (for example if date/time split across two columns)

# keep_date_col >>>> If joining columns to parse date, keep the original (joined) columns. Default False
# converters >>> Dict containing column number or name mapping to functions. For example {'foo': f} would apply
# the function f to all values in the 'foo' column
# dayfirst >>>>> When parsing potentially ambiguous dates, treat as international format (e.g. 7/6/2012 -> June 7,
# 2012). Default False
# date_parser >>>> Function to use to parse dates 
# nrows >>>> Number of rows to read from beginning of file
# iterator >>>>>  Return a TextParser object for reading file piecemeal
# chunksize >>> For iteration, size of file chunks
# skipfooter >>>>> Number of lines to ignore at end of file (named skip_footer in old pandas versions)
# verbose >>>>> Print various parser output information, like the number of missing values placed in non-numeric data
# encoding >>>>> Text encoding for Unicode (e.g. 'utf-8' for UTF-8 encoded text)
# squeeze >>>> If the parsed data only contains one column, return a Series
# thousands >>>>> Separator for thousands (e.g. ',' or '.')
# decimal >>>> Character to recognize as decimal point (e.g. ',' or '.')

# Reading Text Files in Pieces 

In [14]:
# When processing large files or figuring out the right set of arguments to correctly process a large file, 
# you may only want to read in a small piece of a file or iterate through smaller chunks of the file. 
# Before we look at a large file, we make the pandas display settings more compact: 
# pd.options.display.max_rows = 10  # compact display: show at most 10 rows 
# result = pd.read_csv('examples/ex6.csv') # read_csv of the whole file 

In [15]:
# If you want to only read out a small number of rows (avoiding reading the entire file), specify that with nrows:
# result = pd.read_csv('examples/ex6.csv', nrows=5) # read_csv with nrows 

In [16]:
# To read a file in pieces, specify a chunksize as number of rows:
# chunker = pd.read_csv('examples/ex6.csv', chunksize=1000) # read_csv with chunksize 

In [17]:
# The TextFileReader object returned by read_csv allows you to iterate over the parts of the file according to the chunksize.
# For example, we can iterate over ex6.csv, aggregating the value counts in the 'key' column like so: 
# tot = pd.Series([], dtype='int64')
# for piece in chunker:
#     tot = tot.add(piece['key'].value_counts(), fill_value=0)
# tot = tot.sort_values(ascending=False) # read_csv with chunksize


# Writing Data Out to Text Format

In [18]:
# Data can also be exported to a delimited format. Let's  consider one of the CSV files read before: 
# data = pd.read_csv('examples/ex5.csv') # read_csv 
# data.to_csv('examples/out.csv') # to_csv 

In [19]:
# Other delimiters can be used as well (writing to sys.stdout so the text result is printed to the console): 
# import sys
# data.to_csv(sys.stdout, sep='|') # to_csv with sys.stdout 

In [20]:
# Missing values appear as empty strings in the output. You might want to denote them by some other sentinel value:
# data.to_csv(sys.stdout, na_rep='NULL') # to_csv with sys.stdout and na_rep 

In [21]:
# With no other options specified, both the row and column labels are written. Both of these can be disabled:
# data.to_csv(sys.stdout, index=False, header=False) # to_csv with sys.stdout and index=False and header=False 

In [22]:
# You can also write only a subset of the columns, and in an order of your choosing: 
# data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c']) # to_csv with sys.stdout and index=False and columns 

# JSON Data 

In [24]:
# JSON (JavaScript Object Notation) has become one of the standard formats for sending data by HTTP request
# between web browsers and other applications.
# It is a much more free-form data format than a tabular text form like CSV. Here is an example:
obj = """ {
    "name": "Wes",
    "places_lived": ["United States", "Spain", "Germany"],
    "pet": null,
    "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                {"name": "Katie", "age": 33, "pet": "Cisco"}]
} """



In [25]:
# JSON is very nearly valid Python code with the exception of its null value null and some other nuances 
# like disallowing trailing commas at the end of lists. The basic types are objects (dicts), arrays (lists), 
# strings, numbers, booleans, and nulls. All of the keys in an object must be strings. There are several 
# Python libraries for reading and writing JSON data. I'll use json here, as it is built into the Python
# standard library. To convert a JSON string to Python form, use json.loads:
import json
result = json.loads(obj) # json.loads
result 

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [26]:
# json.dumps, on the other hand, converts a Python object back to JSON:
json.dumps(result) # json.dumps 

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

In [27]:
asjson = json.dumps(result) # json.dumps 

In [29]:
# How a JSON object (or a list of objects) becomes a DataFrame or some other analysis structure is up to you.
# Conveniently, a list of dicts parsed from JSON can be handed straight to the DataFrame constructor,
# and the columns argument selects a subset of the data fields:
import pandas as pd  
sibling_records = result['siblings']
siblings = pd.DataFrame(sibling_records, columns=['name', 'age']) # DataFrame constructor on parsed JSON records 


In [30]:
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


# XML and HTML: Web Scraping 

In [4]:
# ! pip install lxml # install lxml

In [8]:
# ! pip install urllib2 # does not work: urllib2 is a Python 2 standard-library module, not a PyPI package; on Python 3 use urllib.request

In [3]:
# ! pip install beautifulsoup4 html5lib # install beautifulsoup4 html5lib 

In [None]:
# urlopen lived in urllib2 on Python 2; on Python 3 import it from urllib.request instead 
# from urllib.request import urlopen 

In [13]:
# ! pip install requests # install requests 

In [24]:
# from lxml.html import parse
# from urllib.request import urlopen
# from urllib.error import HTTPError

# url = 'http://finance.yahoo.com/q/op?s=AAPL+Options'

# try:
#     parsed = parse(urlopen(url))
#     doc = parsed.getroot()
#     # Rest of your code here
# except HTTPError as e:
#     print(f"Error opening the URL: {url}")
#     print(f"HTTP Error code: {e.code}")

In [21]:
# ! pip install requests lxml selenium # install requests lxml selenium 

In [25]:
# from lxml import html
# from selenium import webdriver

# url = 'https://finance.yahoo.com/q/op?s=AAPL+Options'

# driver = None
# try:
#     driver = webdriver.Chrome()
#     driver.get(url)
#     parsed = html.fromstring(driver.page_source)
#     # Rest of your code here
# except Exception as e:
#     print(f"Error accessing the URL: {url}")
#     print(f"Error details: {e}")
# finally:
#     if driver is not None:  # webdriver.Chrome() itself may have failed
#         driver.quit()


In [26]:
# import requests
# from lxml import html

# url = 'https://finance.yahoo.com/quote/AAPL/options?ltr=1'

# try:
#     response = requests.get(url)
#     response.raise_for_status()  # Check if the request was successful
#     parsed = html.fromstring(response.text)
#     # Rest of your code here
# except requests.RequestException as e:
#     print(f"Error accessing the URL: {url}")
#     print(f"Error details: {e}")

# Binary Data Formats

In [27]:
# One of the easiest ways to store data efficiently in binary format is using Python's built-in pickle serialization. 
# pandas objects all have a to_pickle method that writes the data to disk in pickle format:
# frame = pd.read_csv('examples/ex1.csv') # read_csv 
# frame.to_pickle('examples/frame_pickle') # to_pickle 


In [28]:
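# frame.to_pickle('examples/frame_pickle') # to_pickle (the old frame.save method has been removed from pandas) 
# You read the data back into Python with pandas.read_pickle, another pickle convenience function: 
# pd.read_pickle('examples/frame_pickle') # read_pickle (replaces the removed pd.load) 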

# Using HDF5 Format 

In [30]:
# There are a number of tools that facilitate reading and writing large amounts of scientific data in binary format on disk. 
# A popular industry-grade library for this is HDF5, which stands for Hierarchical Data Format.
# While it's possible to directly access HDF5 from Python, there are many libraries available that abstract away 
# many of the details of working directly with HDF5. For the purposes of this book, I'll focus on pandas' tools for HDF5,
# though there are many other options (such as h5py). The HDFStore class works like a dict and handles the low-level 
# details: 
import pandas as pd 
import numpy as np 
frame = pd.DataFrame({'a': np.random.randn(100)}) # DataFrame constructor 

In [31]:
# Each HDF5 file contains an internal file system-like node structure enabling you to store multiple datasets and supporting metadata. 
# Compared with simpler formats, HDF5 supports on-the-fly compression with a variety of compressors, enabling data with repeated
# patterns to be stored more efficiently. 
# HDF5 can be a good choice for working with very large datasets that don't fit into memory,
# as you can efficiently read and write small sections of much larger arrays.  
# There are not one but two interfaces to the HDF5 library in Python, PyTables and h5py, each of which takes a different approach 
# to the problem. h5py provides a direct, but high-level interface to the HDF5 API, while PyTables abstracts many of the details of 
# HDF5 to provide multiple flexible data containers, table indexing, querying capabilities, and some support for out-of-core computations. 

store = pd.HDFStore('mydata.h5') # HDFStore constructor 


In [32]:
store['obj1'] = frame # HDFStore with obj1 

In [33]:
store['obj1_col'] = frame['a'] # HDFStore with obj1_col

In [34]:
store # HDFStore 

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [35]:
# Read the stored frame back, then close the store so the HDF5 file handle is
# released and pending writes are flushed; the frame is still displayed below.
obj1 = store['obj1'] # HDFStore with obj1 
store.close() # release mydata.h5
obj1

Unnamed: 0,a
0,2.505155
1,-1.360989
2,0.316219
3,-0.552582
4,0.379378
...,...
95,-1.126696
96,-1.005149
97,-1.409088
98,-0.030770


In [36]:
# If you work with huge quantities of data, I would encourage you to explore PyTables and h5py to see how they can suit your needs.
# Since many data analysis problems are IO-bound(rather than CPU-bound), using a tool like HDF5 can massively accelerate your applications. 


# Reading Microsoft Excel Files

In [37]:
# xls = pd.ExcelFile('examples/ex1.xlsx') # ExcelFile constructor 
# table = xls.parse('Sheet1') # parse 

# Interacting with  HTML and Web APIs

In [38]:
# Many websites have public APIs providing data feeds via JSON or some other format. 
# There are a number of ways to access these APIs from Python; one easy-to-use method that I recommend is the requests package. 
# http://docs.python-requests.org  

import requests 
url = 'https://api.github.com/repos/pandas-dev/pandas/issues' # url 

In [39]:
# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP error statuses (e.g. API rate limiting) so an error payload is never
# parsed as if it were the list of issues.
resp = requests.get(url, timeout=10) # get 
resp.raise_for_status()

In [40]:
resp 

<Response [200]>

In [41]:
import json 
# Response.json() decodes the body using JSON's own encoding rules rather than
# the guessed text encoding behind resp.text.
data = resp.json() # parsed JSON body (list of issue dicts) 

In [43]:
data[0]['title'] # json.loads with title 

'TYP: Persist typing information for pipe args and kwargs'

In [44]:
data

[{'url': 'https://api.github.com/repos/pandas-dev/pandas/issues/56760',
  'repository_url': 'https://api.github.com/repos/pandas-dev/pandas',
  'labels_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/56760/labels{/name}',
  'comments_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/56760/comments',
  'events_url': 'https://api.github.com/repos/pandas-dev/pandas/issues/56760/events',
  'html_url': 'https://github.com/pandas-dev/pandas/pull/56760',
  'id': 2068990003,
  'node_id': 'PR_kwDOAA0YD85jZhRZ',
  'number': 56760,
  'title': 'TYP: Persist typing information for pipe args and kwargs',
  'user': {'login': 'paw-lu',
   'id': 30049606,
   'node_id': 'MDQ6VXNlcjMwMDQ5NjA2',
   'avatar_url': 'https://avatars.githubusercontent.com/u/30049606?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/paw-lu',
   'html_url': 'https://github.com/paw-lu',
   'followers_url': 'https://api.github.com/users/paw-lu/followers',
   'following_url': 'https://api.g

# Interacting with Databases 

In [45]:
# In many applications data rarely comes from text files, that being a fairly inefficient way to store large amounts of data.
# SQL-based relational databases (such as SQL Server, PostgreSQL, and MySQL) are in wide use, and many alternative 
# non-SQL (so-called NoSQL) databases have become quite popular.
# The choice of database is usually dependent on the performance, data integrity, and scalability needs of an application. 
# Loading data from SQL into a DataFrame is fairly straightforward, and pandas has some functions to simplify the process. 
# As an example, I'll create a SQLite database using Python's built-in sqlite3 driver: 

import sqlite3 
query = """ CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER); """ # query
con = sqlite3.connect('mydata.sqlite') # sqlite3.connect
# mydata.sqlite persists on disk, so a second run of this cell would otherwise fail
# with "table test already exists". Dropping the stale table first makes the example
# repeatable and keeps the row counts shown further down reproducible.
con.execute("DROP TABLE IF EXISTS test") # start from a clean table
con.execute(query) # execute
con.commit() # commit


In [46]:
# Insert a few rows of data: 
data = [('Atlanta', 'Georgia', 1.25, 6), 
        ('Tallahassee', 'Florida', 2.6, 3), 
        ('Sacramento', 'California', 1.7, 5)] # data: one tuple per row, in column order a, b, c, d
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)" # stmt: ? placeholders keep the values out of the SQL text
# Using the connection as a context manager commits when the block succeeds and
# rolls back if any insert raises, so the table never ends up half-populated.
with con:
    con.executemany(stmt, data) # executemany

In [47]:
cursor = con.execute('select * from test') # execute 
rows = cursor.fetchall() # fetchall
rows # rows 

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [48]:
# You can pass the list of tuples to the DataFrame constructor, but you also need the column names,
# contained in the cursor's description attribute:
cursor.description # description 

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

# Storing and Loading Data in MongoDB 

In [53]:
# ! pip install pymongo # install pymongo 

In [56]:
# #NoSQL databases take many different forms. Some are simple dict-like key-value stores
# #like BerkeleyDB or Tokyo Cabinet, while others are document-based, with a dict-like
# #object being the basic unit of storage. I've chosen MongoDB (http://mongodb.org) for
# #my example. I started a MongoDB instance locally on my machine, and connect to it
# #on the default port using pymongo, the official driver for MongoDB:
# import pymongo
# con = pymongo.MongoClient('localhost', port=27017)  # MongoClient replaces the removed pymongo.Connection
# #Documents stored in MongoDB are found in collections inside databases. Each running
# #instance of the MongoDB server can have multiple databases, and each database can
# #have multiple collections. Suppose I wanted to store the Twitter API data from earlier
# #in the chapter. First, I can access the (currently empty) tweets collection:
# tweets = con.db.tweets
# #Then, I load the list of tweets and write each of them to the collection using
# #tweets.insert_one (which writes the Python dict to MongoDB; the older tweets.save was removed in PyMongo 4):
# import requests, json
# url = 'http://search.twitter.com/search.json?q=python%20pandas'
# data = json.loads(requests.get(url).text)
# for tweet in data['results']:
#     tweets.insert_one(tweet)
# #Now, if I wanted to get all of my