# Importing Data in Python (Part 2)

## 1 Importing flat files from the web

### 1.1 Importing flat files from the web

In [None]:
# Download a CSV file from the web, keep a copy on disk,
# then load that copy into a DataFrame.
import pandas as pd
from urllib.request import urlretrieve

# Remote address of the dataset: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'

# Name of the local copy; used for both the download and the read below
local_path = 'winequality-red.csv'
urlretrieve(url, local_path)

# The file is semicolon-separated, so override the default comma
df = pd.read_csv(local_path, sep=';')
print(df.head())


In [None]:
# Read a remote CSV directly into a DataFrame;
# no local copy is written this time.
import pandas as pd

# `url` is the address assigned in the preceding cell.
# `delimiter` is pandas' alias for `sep`.
df = pd.read_csv(url, delimiter=';')

#### Importing non-flat files from the web (here: an Excel workbook)

In [None]:
# Read every sheet of a remote Excel workbook into a dict of DataFrames.

# Import package
import pandas as pd

# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'

# Read in all sheets of Excel file: xl
# sheet_name=None returns a dict mapping each sheet name to its DataFrame.
# (The keyword is `sheet_name`; the older `sheetname` spelling is no longer
# accepted by current pandas releases.)
xl = pd.read_excel(url, sheet_name=None)

# Print the sheet names to the shell
print(xl.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())


### 1.2 HTTP requests to import files from the web

#### using urllib

In [1]:
# Perform a GET request with urllib and print the raw response body.

# Import packages
from urllib.request import urlopen, Request

# Specify the url
url = "http://www.datacamp.com/teach/documentation"

# This packages the request: request
request = Request(url)

# Send the request and catch the response: response
# The `with` block closes the response even if reading it raises,
# so no explicit response.close() is needed.
with urlopen(request) as response:
    # Extract the response body (bytes): html
    html = response.read()

# Print the html
print(html)


#### using requests
* requests is a library that is even higher-level than urllib
* you don't have to close the connection when using requests!

In [3]:
# Fetch a page with requests and print its body.
import requests

# Page to download: url
url = "http://docs.datacamp.com/teach/"

# requests.get packages the request, sends it and returns the response: r
r = requests.get(url)

# Body of the response as a string: text
text = r.text

# Show the html
print(text)


<!DOCTYPE html>
<link rel="shortcut icon" href="images/favicon.ico" />
<html>

  <head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>Home</title>
  <meta name="description" content="All Documentation on Course Creation">

  <link rel="stylesheet" href="/teach/css/main.css">
  <link rel="canonical" href="/teach/">
  <link rel="alternate" type="application/rss+xml" title="DataCamp Teach Documentation" href="/teach/feed.xml" />
</head>


  <body>

    <header class="site-header">

  <div class="wrapper">

    <a class="site-title" href="/teach/">DataCamp Teach Documentation</a>

  </div>

</header>


    <div class="page-content">
      <div class="wrapper">
        <p>The Teach Documentation has been moved to <a href="https://www.datacamp.com/teach/documentation">https://www.datacamp.com/teach/documentation</a>!</p>

<!-- Everybody can teach on DataCamp. The resources on t

### 1.3 Scraping the web in Python

In [None]:
import requests
from bs4 import BeautifulSoup

# --- connect and fetch data ---
url = 'https://www.python.org/~guido/'
r = requests.get(url)
html_doc = r.text

# --- Create a BeautifulSoup object ---
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed and emits a warning, so results can differ between
# machines. 'html.parser' is the one bundled with Python.
soup = BeautifulSoup(html_doc, 'html.parser')

# --- Prettify the BeautifulSoup object ---
pretty_soup = soup.prettify()

# --- data type --
type(soup)          # get bs4.BeautifulSoup (this obj has several methods)
type(pretty_soup)   # get str

# --- methods of soup (bs4.BeautifulSoup obj) ---
guido_title = soup.title       # Get the title 

guido_text = soup.get_text()   # Get text

a_tags = soup.find_all('a')    # get a list of <a></a> tags with their content
for link in a_tags:
    print(link.get('href'))    # e.g. for <a href="pics.html">...</a>,
                               # get "pics.html"

## 2 Interacting with APIs to import data from the web 

### 2.1 Introduction to APIs and JSONs

In [None]:
# Import json in this cell so it runs on a fresh kernel
# (Restart Kernel -> Run All) without relying on a later import.
import json

# Load JSON: json_data
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)

type(json_data)    # get dictionary

# Print each key together with its value
for k, v in json_data.items():
    print(k + ': ', v)

### 2.2 APIs and interacting with the world wide web

In [7]:
# Quick check of an API call: send a GET request and look at the
# response both as raw text and as parsed JSON.
import requests

# Endpoint with its query string (t=hackers)
url = 'http://www.omdbapi.com/?t=hackers'

r = requests.get(url)

r.text     # the body as one long string
r.json()   # the body parsed into a dictionary, which may be nested

str

## 3 Diving deep into the Twitter API 
Personal note: this section felt far too high-level and left me thoroughly confused.

In [None]:
# ------- API Authentication -------

# Import packages
import os
import tweepy, json

# Read the OAuth credentials from environment variables instead of
# hardcoding them: secrets stored in a notebook are exposed to everyone who
# can read the file.
# NOTE(review): the literal tokens that used to be assigned here should be
# treated as leaked and revoked.
# Each lookup raises KeyError if the corresponding variable is not set.
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)


#### A custom object from the course: [MyStreamListener](https://gist.github.com/hugobowne/18f1c0c0709ed1a52dc5bcd462ac69f4)
A tweet listener that creates a file called 'tweets.txt', collects streaming tweets as JSON, and writes them to that file; once 100 tweets have been streamed, the listener closes the file and stops listening.

In [None]:
# ------- a custom obj MyStreamListener  -------
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that saves incoming tweets to 'tweets.txt'.

    Every received status is written as one JSON document per line. After
    the 100th tweet the file is closed and on_status returns False.
    """

    def __init__(self, api=None):
        """Reset the tweet counter and open the output file for writing.

        Args:
            api: Optional API object, forwarded to the base listener.
        """
        # Forward `api` instead of silently discarding it.
        super(MyStreamListener, self).__init__(api)
        self.num_tweets = 0
        self.file = open("tweets.txt", "w")

    def on_status(self, status):
        """Append one tweet to the file and report whether to keep going.

        Args:
            status: Status object whose `_json` attribute holds the tweet.

        Returns:
            True while fewer than 100 tweets have been written; False once
            the 100th tweet is written and the file has been closed.
        """
        tweet = status._json
        self.file.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1
        if self.num_tweets < 100:
            return True
        # Close the file BEFORE returning. Placed after the return it was
        # unreachable, so the file was never closed.
        self.file.close()
        return False

    def on_error(self, status):
        """Print the error status passed in by the stream."""
        print(status)

In [None]:
# ------- Streaming tweets -------

# Listener that receives each streamed tweet
l = MyStreamListener()

# Create your Stream object from the credentials and the listener
stream = tweepy.Stream(auth=auth, listener=l)

# Keywords to capture from the Twitter stream
keywords = ['clinton', 'trump', 'sanders', 'cruz']
stream.filter(track=keywords)