In [1]:
# GETTING DATA
# To do anything you need data. Let's see how and where we can get data.

In [2]:
import sys, re

# egrep.py
# a small Python script that, when run from the command line, reads lines of
# text from stdin and writes out the ones that match a regular expression.
# sys.argv is the list of command-line arguments:
#   sys.argv[0] is the name of the program itself
#   sys.argv[1] will be the regex specified at the command line
regex = sys.argv[1]
# lazily pick out the stdin lines in which the regex matches somewhere ...
matching_lines = (line for line in sys.stdin if re.search(regex, line))
# ... and echo each of them, unchanged, to stdout
sys.stdout.writelines(matching_lines)

# line_count.py
# another Python script: it counts the lines it receives on stdin
# and then writes out the total
import sys
# every line read from stdin contributes 1 to the total
count = sum(1 for _ in sys.stdin)
# print goes to sys.stdout
print(count)

# you can use both to count how many lines of a file contain numbers:
# type SomeFile.txt | egrep.py "[0-9]" | line_count.py

# or again a script that counts the words in its input and writes out the most common ones:
# most_common_words.py
import sys
from collections import Counter

# the number of words to report is passed in as the first argument
try:
    num_words = int(sys.argv[1])
except (IndexError, ValueError):
    # IndexError: no argument was given; ValueError: it was not an integer.
    # Only these two are caught (instead of a bare `except:`) so that
    # KeyboardInterrupt, SystemExit and genuine bugs are not misreported
    # as a usage error.
    print("usage: most_common_words.py num_words")
    sys.exit(1)   # nonzero exit code indicates error

counter = Counter(word.lower()                # lowercase words
                  for line in sys.stdin
                  for word in line.split())   # split on whitespace; split() with no
                                              # argument never yields empty 'words'

# one "count<TAB>word" line per word, most frequent first
for word, count in counter.most_common(num_words):
    sys.stdout.write(f"{count}\t{word}\n")
""" type the_bible.txt | most_common_words.py 10
36397	the
30031	and
20163	of
7154	to
6484	in
5856	that
5421	he
5226	his
5060	unto
4297	shall """

In [None]:
# to read files we can use "open"
# 'r' means read-only, it's assumed if you leave it out
file_for_reading = open('reading_file.txt', 'r')
file_for_reading2 = open('reading_file.txt')

# 'w' is write -- will destroy the file if it already exists!
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append -- for adding to the end of the file
file_for_appending = open('appending_file.txt', 'a')

# don't forget to close your files when you're done -- all of them:
# every handle opened above stays open until it is closed explicitly
file_for_reading.close()
file_for_reading2.close()
file_for_writing.close()
file_for_appending.close()

# since it is easy to forget to close files, it's better to use "with":
# the file is closed automatically when the block ends
starts_with_hash = 0
with open('input.txt') as f:
    # look at each line in the file and use a regex to see if it starts
    # with '#'; every line that does adds 1 to the count
    starts_with_hash += sum(1 for line in f if re.match("^#", line))

# as an example we can extract the domains from a file with a list of email addresses
def get_domain(email_address: str) -> str:
    """Lowercase the address and return whatever follows its last '@'.

    If the address contains no '@', the whole lowercased address is returned.
    """
    _local_part, _at_sign, domain = email_address.lower().rpartition("@")
    return domain

# a couple of tests
assert get_domain('joelgrus@gmail.com') == 'gmail.com'
assert get_domain('joel@m.datasciencester.com') == 'm.datasciencester.com'

from collections import Counter

# tally how often each domain occurs; lines without an "@" are ignored
with open('email_addresses.txt', 'r') as f:
    domain_counts = Counter()
    for line in f:
        if "@" in line:
            # strip the surrounding whitespace (including the newline) first
            domain_counts[get_domain(line.strip())] += 1

In [None]:
# to handle delimited files like csv, don't do it yourself, use a csv reader
# first, a tab-separated file without headers
"""
6/20/2014   AAPL    90.91
6/20/2014   MSFT    41.68
6/20/2014   FB  64.5
6/19/2014   AAPL    91.86
6/19/2014   MSFT    41.51
6/19/2014   FB  64.34
"""
import csv
# newline='' hands newline handling over to the csv module, which is what its
# documentation asks for whenever a file object is passed to a reader
with open('tab_delimited_stock_prices.txt', newline='') as f:
    tab_reader = csv.reader(f, delimiter='\t')
    for row in tab_reader:
        if not row:
            # a blank line comes back as an empty list, which has no row[0]
            continue
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])   # the price is read as text; convert it to a number
# next, a colon-separated file with headers
"""
date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
"""
# newline='' hands newline handling over to the csv module, which is what its
# documentation asks for whenever a file object is passed to a reader
with open('colon_delimited_stock_prices.txt', newline='') as f:
    # DictReader takes the keys from the first (header) row of the file
    colon_reader = csv.DictReader(f, delimiter=':')
    for dict_row in colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])   # text -> number
# if your file does not have headers you can still use "DictReader" by passing the keys as the fieldnames parameter
# you can write out delimited data as well, using csv.writer
todays_prices = {'AAPL': 90.91, 'MSFT': 41.68, 'FB': 64.5 }
# newline='' matters when writing: the writer emits its own line terminator,
# and without it platforms that translate "\n" would add an extra "\r"
with open('comma_delimited_stock_prices.txt', 'w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',')
    # one "symbol,price" row per stock
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

In [1]:
# another way to get data is by scraping the web. BeautifulSoup is a useful and easy-to-use library for that
# in HTML we have structured data. In an ideal world we could find all the tags starting with <p and use their ids (which are sometimes missing).
"""
<html>
  <head>
    <title>A web page</title>
  </head>
  <body>
    <p id="author">Joel Grus</p>
    <p id="subject">Data Science</p>
  </body>
</html>
"""
from bs4 import BeautifulSoup
import requests

# Example HTML file on GitHub. In order to fit
# the URL in the book I had to split it across two lines.
# Recall that whitespace-separated strings get concatenated.
url = ("https://raw.githubusercontent.com/"
       "joelgrus/data/master/getting-data.html")
# requests has no default timeout, so without one a stalled connection
# would hang this cell indefinitely
page_response = requests.get(url, timeout=10)
# fail here on an HTTP error instead of parsing an error page as if it were the data
page_response.raise_for_status()
html = page_response.text
soup = BeautifulSoup(html, 'html5lib')
# look up the first <p> tag once and reuse it (soup.p is shorthand for this)
first_paragraph = soup.find('p')
first_paragraph_text = first_paragraph.text            # get the text of a tag
first_paragraph_words = first_paragraph_text.split()
first_paragraph_id = first_paragraph['id']             # raises KeyError if no 'id'
first_paragraph_id2 = first_paragraph.get('id')        # returns None if no 'id'

# three ways to find the <p> tags with a specific class
# (calling soup(...) directly is shorthand for soup.find_all(...))
important_paragraphs = soup.find_all('p', {'class' : 'important'})
important_paragraphs2 = soup.find_all('p', 'important')
important_paragraphs3 = [paragraph
                         for paragraph in soup.find_all('p')
                         if 'important' in paragraph.get('class', [])]

# or if you want to find every <span> contained inside a <div>
# Warning: will return the same <span> multiple times
# if it sits inside multiple <div>s.
# Be more clever if that's the case.
spans_inside_divs = [span
                     for div in soup.find_all('div')       # for each <div> on the page
                     for span in div.find_all('span')]     # find each <span> inside it

In [6]:
# many websites provide APIs, which allow you to explicitly request data in a structured format.
# usually they are JSON formatted
import json
# a JSON document is just text ...
serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2019,
                  "topics" : [ "data", "science", "data science"] }"""
# ... and json.loads parses that text into a Python dict
deserialized = json.loads(serialized)

# spot-check two of the decoded fields
assert "data science" in deserialized["topics"]
assert deserialized["publicationYear"] == 2019

# let's start with unauthenticated GitHub APIs
from collections import Counter
from dateutil.parser import parse
github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"
# requests has no default timeout, so without one a stalled connection
# would hang this cell indefinitely
repos_response = requests.get(endpoint, timeout=10)
# fail here on an HTTP error (for example when the request is rejected)
# instead of parsing an error payload as if it were the list of repos
repos_response.raise_for_status()
repos = json.loads(repos_response.text)

# parse the creation timestamp of every repo ...
dates = list(map(parse, (repo["created_at"] for repo in repos)))
# ... then tally the repos by creation month and by creation weekday
month_counts = Counter(created.month for created in dates)
weekday_counts = Counter(created.weekday() for created in dates)

# get the languages of the last 5 repositories:
# order the repos by their "pushed_at" value, largest first, and keep the first five
repos_latest_push_first = sorted(repos, key=lambda r: r["pushed_at"], reverse=True)
last_5_repositories = repos_latest_push_first[:5]

# collect the language of each of those repositories, in the same order
last_5_languages = []
for repo in last_5_repositories:
    last_5_languages.append(repo["language"])
print(last_5_languages)
# APIs are a very powerful and useful tool.
# They can be well written and give you a lot of flexibility, but unfortunately they are sometimes out of date or unreliable and can give you huge headaches.

['JavaScript', 'Python', 'Python', 'Python', 'Python']


In [None]:
"""
For Further Exploration
pandas is the primary library that data science types use for working with—and, in particular, importing—data.

Scrapy is a full-featured library for building complicated web scrapers that do things like follow unknown links.

Kaggle hosts a large collection of datasets.
"""