# 02. Getting the data

In [1]:
import numpy as np
import pandas as pd

## 02. Reading data from csv files

In [2]:
# Read the CSV with pandas' defaults (first line supplies the column labels)
# and keep the frame as `df` for the cells below.
df = pd.read_csv('data/file.csv')
df

Unnamed: 0,a,b,c,d
0,yellow,10,2,3.2
1,green,2,3,8.1
2,blue,7,1,0.4


In [3]:
# header=None: the file's first line is read as a data row and the columns
# receive integer labels instead.
pd.read_csv('data/file.csv', header=None)

Unnamed: 0,0,1,2,3
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


In [4]:
# Supply explicit column labels; the file's own first line is then read as data.
pd.read_csv(
    'data/file.csv',
    names=['column 1', 'column 2', 'column 3', 'column 4'],
)


Unnamed: 0,column 1,column 2,column 3,column 4
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


In [5]:
# index_col=0: use the first column ('a') as the row index instead of the
# default integer index.
pd.read_csv('data/file.csv', index_col=0)


Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
yellow,10,2,3.2
green,2,3,8.1
blue,7,1,0.4


In [6]:
# usecols: load only the listed columns and skip the rest of the file's columns.
pd.read_csv("data/file.csv", usecols=['a', 'b'])

Unnamed: 0,a,b
0,yellow,10
1,green,2
2,blue,7


In [7]:
# Per-column dtypes that read_csv inferred for the frame loaded above.
df.dtypes

a     object
b      int64
c      int64
d    float64
dtype: object

In [8]:
# Override the inferred dtype of column 'b' at parse time; the remaining
# columns are still inferred from the data.
df2 = pd.read_csv('data/file.csv', dtype={'b': np.float64})
df2.dtypes

a     object
b    float64
c      int64
d    float64
dtype: object

## 03. Reading data from Excel files

In [9]:
# No sheet_name given, so read_excel falls back to its default sheet (the first one).
pd.read_excel('data/data.xls')

Unnamed: 0,varA,varB,varC
0,0.391723,-0.155122,0.381104
1,0.575125,-0.105817,0.232245
2,0.672305,0.424688,-0.694795
3,0.766115,-0.79135,-0.028739
4,0.677259,-0.817543,-0.537088
5,-0.029702,-0.891848,-0.682719
6,-0.161366,-0.6596,-0.727898
7,0.031672,0.016607,-0.940479
8,0.833212,-0.503236,-0.88721
9,0.907753,0.265177,-0.390762


In [10]:
# Read only the first two columns (positions 0 and 1) of the sheet named 'Sheet2'.
pd.read_excel(
    'data/data.xls',
    sheet_name='Sheet2',
    usecols=[0, 1],
)

Unnamed: 0,varD,varE
0,0.907753,0.265177
1,0.755019,-0.768056
2,0.850692,-0.537159
3,0.131663,0.941327
4,0.5744,0.091735
5,0.81663,0.875612
6,0.536732,0.175428
7,-0.084641,-0.042827
8,0.268271,-0.010628
9,0.166792,-0.872579


## 04. JSON data

In [11]:
import json

# json_normalize has been a top-level pandas function since pandas 1.0. The old
# pandas.io.json location is deprecated and absent from newer releases, so it
# is used only as a fallback when the top-level name is unavailable.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

In [12]:
# read_json keeps the top-level 'books' key as a single column whose cells are
# the nested per-book dicts (see the output below).
pd.read_json('data/books.json')

Unnamed: 0,books
0,"{'isbn': '9781593275846', 'title': 'Eloquent J..."
1,"{'isbn': '9781449331818', 'title': 'Learning J..."
2,"{'isbn': '9781449365035', 'title': 'Speaking J..."


In [13]:
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('data/books.json', 'r', encoding='utf-8') as f:
    json_string = f.read()

# Parse once the file has been closed; `json_string` keeps the raw text.
dictionary = json.loads(json_string)

In [14]:
# Inspect the parsed structure: a dict whose 'books' key holds a list of
# per-book dicts.
dictionary

{'books': [{'isbn': '9781593275846',
   'title': 'Eloquent JavaScript, Second Edition',
   'subtitle': 'A Modern Introduction to Programming',
   'author': 'Marijn Haverbeke',
   'published': '2014-12-14T00:00:00.000Z',
   'publisher': 'No Starch Press',
   'pages': 472,
   'description': 'JavaScript lies at the heart of almost every modern web application, from social apps to the newest browser-based games. Though simple for beginners to pick up and play with, JavaScript is a flexible, complex language that you can use to build full-scale applications.',
   'website': 'http://eloquentjavascript.net/'},
  {'isbn': '9781449331818',
   'title': 'Learning JavaScript Design Patterns',
   'subtitle': "A JavaScript and jQuery Developer's Guide",
   'author': 'Addy Osmani',
   'published': '2012-07-01T00:00:00.000Z',
   'publisher': "O'Reilly Media",
   'pages': 254,
   'description': "With Learning JavaScript Design Patterns, you'll learn how to write beautiful, structured, and maintainable 

In [15]:
# Flatten the list stored under 'books' into a table: one row per book, one
# column per field.
json_normalize(dictionary, record_path='books')

Unnamed: 0,author,description,isbn,pages,published,publisher,subtitle,title,website
0,Marijn Haverbeke,JavaScript lies at the heart of almost every m...,9781593275846,472,2014-12-14T00:00:00.000Z,No Starch Press,A Modern Introduction to Programming,"Eloquent JavaScript, Second Edition",http://eloquentjavascript.net/
1,Addy Osmani,"With Learning JavaScript Design Patterns, you'...",9781449331818,254,2012-07-01T00:00:00.000Z,O'Reilly Media,A JavaScript and jQuery Developer's Guide,Learning JavaScript Design Patterns,http://www.addyosmani.com/resources/essentialj...
2,Axel Rauschmayer,"Like it or not, JavaScript is everywhere these...",9781449365035,460,2014-02-01T00:00:00.000Z,O'Reilly Media,An In-Depth Guide for Programmers,Speaking JavaScript,http://speakingjs.com/


## 05. HTML files

* Download HTML code using the requests library
* Create a BeautifulSoup object to contain the parsed HTML code
* Look for patterns identifying the information that you want to extract from the code
* Search for specific tags using the find_all() method
* Iterate over the object returned by find_all() and use the text attribute to extract the text between each set of tags
* Store the strings in a Python list and convert to a DataFrame for further analysis

In [16]:
import requests

In [17]:
# Fetch an archived snapshot of the quotes page. Without a timeout, requests
# waits indefinitely on an unresponsive server and the notebook would hang.
page = requests.get(
    'https://web.archive.org/web/20180908144902/en.proverbia.net/shortfamousquotes.asp',
    timeout=30,
)

In [18]:
# Peek at the first 100 characters of the response body.
page.text[0:100]

'\n<!DOCTYPE html>\n\n<html lang="en" xml:lang="en">\n<head><script src="//archive.org/includes/analytics'

In [19]:
# HTTP status code of the response (200 in the recorded run).
page.status_code

200

## 06. Web scraping

In [20]:
from bs4 import BeautifulSoup

In [21]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(page.text, 'html.parser')

In [22]:
# Collect every <blockquote> element found in the parsed page.
quotes = soup.find_all('blockquote')

In [30]:
# Pull the plain text out of each <blockquote> element, preserving page order.
quote_list = [quote.text for quote in quotes]

In [31]:
# Build a one-column frame from the scraped quote strings.
# NOTE: this rebinds `df`, which held the CSV frame earlier in the notebook.
df = pd.DataFrame(quote_list, columns=['Quote'])
df

Unnamed: 0,Quote
0,There is a natural aristocracy among men. The ...
1,All our words from loose using have lost their...
2,"God couldn't be everywhere, so he created moth..."
3,"Be not afraid of going slowly, be afraid only ..."
4,"Learn from yesterday, live for today, hope for..."
5,Do not confine your children to your own learn...
6,"I hear and I forget, I see and I remember. I d..."
7,In teaching others we teach ourselves.
8,Happiness will never come to those who fail to...
9,"Without His love I can do nothing, with His lo..."


In [24]:
# Author attributions are the <p> elements carrying the CSS class "a".
authors = soup.find_all('p', class_='a')

In [27]:
# Drop the first and last character of the element text, leaving only the
# attribution itself (see the output below).
authors[0].text[1:-1]


'Thomas Jefferson (1743-1826) Third president of the United States.'

In [32]:
# Trim the first and last character from each attribution, then attach the
# results to the quotes frame as a new 'Author' column (paired by position).
author_list = [author.text[1:-1] for author in authors]
df['Author'] = author_list
df

Unnamed: 0,Quote,Author
0,There is a natural aristocracy among men. The ...,Thomas Jefferson (1743-1826) Third president o...
1,All our words from loose using have lost their...,Ernest Hemingway (1898-1961) American Writer.
2,"God couldn't be everywhere, so he created moth...",Jewish proverb
3,"Be not afraid of going slowly, be afraid only ...",Chinese proverb
4,"Learn from yesterday, live for today, hope for...",Unknown Source
5,Do not confine your children to your own learn...,Chinese proverb
6,"I hear and I forget, I see and I remember. I d...",Chinese proverb
7,In teaching others we teach ourselves.,Proverb
8,Happiness will never come to those who fail to...,Unknown Source
9,"Without His love I can do nothing, with His lo...",Unknown Source


In [33]:
# read_html returns a list with one DataFrame per table it finds on the page.
tables = pd.read_html("https://world.openfoodfacts.org/additives")
print(len(tables))  # 1 
print(tables[0].head())

1
                   Additive  Products   * Risk
0        E330 - Citric acid    130608 NaN  NaN
1          E322 - Lecithins     88562 NaN  NaN
2          E322i - Lecithin     80999 NaN  NaN
3  E500 - Sodium carbonates     55590 NaN  NaN
4        E415 - Xanthan gum     49628 NaN  NaN


In [38]:
# The number and order of tables depend on the live Wikipedia page, so both the
# count and the hard-coded index below may need adjusting on a fresh run.
tables = pd.read_html("https://en.wikipedia.org/wiki/World_record_progression_50_metres_freestyle")
print(len(tables))  # 9 in the recorded output below

print(tables[4].head()) 


9
   Pos   Time                   Swimmer              Date          Venue
0    1  20.91         Cesar Cielo (BRA)  17 December 2009         Brazil
1    2  20.94  Frederick Bousquet (FRA)     22 April 2009         France
2    3  21.04      Caeleb Dressel (USA)      27 July 2019    South Korea
3    4  21.11      Benjamin Proud (GBR)     3 August 2018  Great Britain
4    5  21.19       Ashley Callus (AUS)  26 November 2009      Australia


In [39]:
# Second-to-last table on the page. NOTE(review): in the recorded output the
# 'Swimmer' and 'Time' headers appear swapped relative to the values beneath
# them -- presumably a mislabelled source table; verify before using the columns.
print(tables[-2].head())

   Pos  Swimmer                       Time              Date          Venue
0    1    22.93  Ranomi Kromowidjojo (NED)     7 August 2017        Germany
1    2    23.00       Sarah Sjöström (SWE)     7 August 2017        Germany
2    3    23.19        Cate Campbell (AUS)   27 October 2017         Russia
3    4    23.25     Marleen Veldhuis (NED)     13 April 2008  Great Britain
4    5    23.27    Therese Alshammar (SWE)  21 November 2009      Singapore


In [47]:
# match="United States" restricts the result to tables containing that text.
tables = pd.read_html("https://en.wikipedia.org/wiki/World_record_progression_50_metres_freestyle", match="United States")
print(len(tables))  # 7 in the recorded output below
# Left commented out in the original notebook:
#print(tables[0][10:15][['Time', 'Name', 'Nationality']])

7


## 07. Getting data from the web using APIs

In [49]:
import json

# Read the API credentials from a local JSON file instead of hard-coding them.
with open('data/client-credentials.json') as file:
    client_credentials = json.load(file)

# Show only which keys are present -- never the secret values themselves.
print('Credentials:', [*client_credentials])  # ['client_id', 'client_secret']

Credentials: ['client_id', 'client_secret']


In [50]:
# Print only a short prefix: cell outputs are saved with the notebook, so
# echoing the whole value would leak the credential to anyone it is shared with.
print(str(client_credentials['client_id'])[:4] + '...')  # Client ID (masked)

...
