# Warsztaty Python w Data Science

## API jako źródło danych
## Web Scraping 3 z 3
---

### - JSON
### - REST - co to jest
### - New York Times API
### - Dane w języku naturalnym: analiza
---

## JSON

In [None]:
import json

# Sample records, one tuple per row in the order (year, team, wins, losses).
_records = [
    (2010, 'Bears', 11, 5),
    (2011, 'Bears', 8, 8),
    (2012, 'Bears', 10, 6),
    (2011, 'Packers', 15, 1),
    (2012, 'Packers', 11, 5),
    (2010, 'Lions', 6, 10),
    (2011, 'Lions', 10, 6),
    (2012, 'Lions', 4, 12),
]

# Transpose the rows into a column-oriented dict of plain lists, which is the
# shape serialized to JSON in the following cells.
data = {
    column: list(values)
    for column, values in zip(('year', 'team', 'wins', 'losses'), zip(*_records))
}


In [None]:
s = json.dumps(data)
s

In [None]:
json.loads(s)

In [None]:
data == json.loads(s)

---
## REST

In [None]:
import os

# Prefer a key supplied through the NYT_API_KEY environment variable so the
# real key never has to be saved inside the notebook. Without that variable
# the placeholder below is used and must be replaced by hand.
API_KEY = os.environ.get("NYT_API_KEY", "tu wpisz swoj klucz")

https://developer.nytimes.com/get-started

In [None]:
url = 'https://api.nytimes.com/svc/books/v3/lists/names.json?api-key=' + API_KEY

In [None]:
# `requests` is first needed in this cell, ahead of the notebook's main import
# cell, so import it here to keep a top-to-bottom run from failing.
import requests

# Fetch the URL built above and decode the JSON body into Python objects.
response = requests.get(url).json()
response


---
## New York Times API

https://developer.nytimes.com/get-started


In [None]:
url = 'https://api.nytimes.com/svc/books/v3/lists/names.json?api-key=' + API_KEY

In [None]:
response = requests.get(url).json()

In [None]:
response.keys()

In [None]:
response['results'][:3]

In [None]:
# Pull the 'list_name' field out of every entry in the results.
[entry['list_name'] for entry in response['results']]

In [None]:
url = 'https://api.nytimes.com/svc/books/v3/lists/current/hardcover-fiction.json?api-key=' + API_KEY

In [None]:
response = requests.get(url).json()
response.keys()

In [None]:
response['results'].keys()

In [None]:
response['results']['books'][0].keys()

In [None]:
[ book['title'] for book in response['results']['books'][:15]]

In [None]:
url = 'https://api.nytimes.com/svc/books/v3/lists/2023-01-31/hardcover-fiction.json?api-key=' + API_KEY

In [None]:
response = requests.get(url).json()
[ book['title'] for book in response['results']['books'][:15 ]]

In [None]:
url = 'https://api.nytimes.com/svc/books/v3/reviews.json?author=Stephen+King&api-key=' + API_KEY

In [None]:
response = requests.get(url).json()

In [None]:
response.keys()

In [None]:
response['results'][:3]

In [None]:
# (title, summary) pairs for the first 30 reviews that actually have a
# summary. The truthiness test skips empty strings and also a None summary,
# on which len() would raise.
[(review['book_title'], review['summary'])
 for review in response['results'][:30]
 if review['summary']]

---


In [None]:
import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta

In [None]:
# Archive endpoint layout: <base_url><year>/<month>.json
# base_url already ends with '/', so the path parts are appended without
# another separator (the previous form produced '.../v1//2023/...').
base_url = 'https://api.nytimes.com/svc/archive/v1/'
year, month = 2023, 3

# Use API_KEY, the key variable this notebook defines; no YOUR_API_KEY is
# defined in any cell.
url = f'{base_url}{year}/{month}.json?api-key={API_KEY}'
response = requests.get(url).json()


In [None]:
response.keys()

In [None]:
response['response'].keys()

In [None]:
len(response['response']['docs'])

In [None]:
response['response']['docs'][0]

In [None]:
from porter import PorterStemmer

In [None]:
stemmer = PorterStemmer()

In [None]:
stemmer.stem('programming')

In [None]:
stemmer.stem('errors')

In [None]:
def is_valid(article):
    """Return True when *article* carries a usable main headline.

    An article qualifies when its 'headline' entry is a dict that contains a
    'main' key. A missing 'headline' entry, or one of any other type, yields
    False instead of raising.
    """
    headline = article.get('headline')
    return isinstance(headline, dict) and 'main' in headline


def parse(response):
    """Flatten an archive API response into a pandas DataFrame.

    Reads the article list from ``response['response']['docs']`` and emits one
    row per article accepted by ``is_valid``; other articles are skipped.

    Columns:
        headline:      ``article['headline']['main']``
        date:          calendar date parsed from ``article['pub_date']``
        doc_type:      ``article['document_type']``
        material_type: ``article['type_of_material']``, or None when absent
        section:       ``article['section_name']``, or None when absent
        keywords:      list of keyword values whose ``name`` is 'subject'

    Returns:
        pandas.DataFrame with the columns above (empty when no article
        qualifies).
    """
    # Import the submodule explicitly: a bare `import dateutil` does not by
    # itself guarantee that `dateutil.parser` has been loaded.
    from dateutil import parser as date_parser

    data = {'headline': [],
            'date': [],
            'doc_type': [],
            'material_type': [],
            'section': [],
            'keywords': []}

    for article in response['response']['docs']:
        if not is_valid(article):
            continue

        data['date'].append(date_parser.parse(article['pub_date']).date())
        data['headline'].append(article['headline']['main'])
        # Look up the same key that is read; testing for 'section' while
        # reading 'section_name' either always stored None or raised KeyError.
        data['section'].append(article.get('section_name'))
        data['doc_type'].append(article['document_type'])
        data['material_type'].append(article.get('type_of_material'))
        data['keywords'].append(
            [keyword['value'] for keyword in article['keywords']
             if keyword['name'] == 'subject']
        )

    return pd.DataFrame(data)

In [None]:
df = parse(response)

In [None]:
df

---
## Dane w języku naturalnym: analiza

In [None]:
headlines = [ doc['headline']['main'] for doc in response['response']['docs']]

In [None]:
headlines[:15]

In [None]:
all_headlines = ' '.join(headlines)

In [None]:
## Tokenizer

In [None]:
import re

# Split on runs of non-word characters. Splitting yields an empty string when
# the text starts or ends with a separator, so empty tokens are dropped here
# to keep them out of the word counts.
tokenizer = re.compile(r'[\W]+')
tokenized = [token for token in tokenizer.split(all_headlines) if token]
str(tokenized[:15])

In [None]:
tokenized = [ word.lower() for word in tokenized]
print(f"Mamy {len(tokenized)} wyrazów")

In [None]:
from collections import Counter

# Tally how often each token occurs. Counter is a dict subclass, so the
# result still supports .items(), .get() and len() like a plain dict.
counter = Counter(tokenized)

print(f"Mamy {len(counter)} RÓŻNYCH wyrazów")

In [None]:
# Materialize the (word, count) pairs as a list so they can be sorted in place.
counted_words = list(counter.items())
counted_words[:4]

In [None]:
from operator import itemgetter

counted_words.sort(key=itemgetter(1), reverse=True)
counted_words[:20]

In [None]:
counted_words[-20:]

In [None]:
counts = [ x[1] for x in counted_words ]
len(counts)

In [None]:
sum(counts)

In [None]:
sum(counts[:107])

In [None]:
counted_words[97:117]

In [None]:
# Frame of the first 120 counts: naming the column at construction and then
# resetting the index yields the columns ['index', 'count'].
count_df = pd.DataFrame({'count': counts[:120]}).reset_index()
columns = list(count_df.columns)
count_df

In [None]:
columns = list(count_df.columns)
columns[1] = 'count'
count_df.columns = columns
count_df

In [None]:
# NOTE(review): `plt` and `sns` are not imported in any cell of this notebook
# as shown — presumably matplotlib.pyplot and seaborn; confirm and import them
# before this cell runs.
plt.figure(figsize=(20,10))
# NOTE(review): the style is switched after the figure has been created —
# confirm the dark background is applied to this figure as intended.
plt.style.use("dark_background")

# Line chart of count_df: x is the 'index' column, y is the 'count' column.
# NOTE(review): `palette` is passed without `hue` — verify it has any effect.
chart = sns.lineplot(x='index',
                     y='count',
                     linestyle='-', marker='o',
                     palette='bright',  markeredgecolor="black",   
                     data=count_df
                    )
chart.set_ylabel('Ilość wystąpień')
chart.set_xlabel('Rank wyrazu')
# Show plain tick labels: no offset notation on the axes.
chart.ticklabel_format(useOffset=False, style='plain')
chart.set_title('Wyrazy w artykułach NYT')
plt.show();

In [None]:
# Same chart as the previous cell, but with a logarithmic y axis.
# NOTE(review): `plt` and `sns` are not imported in any cell of this notebook
# as shown — presumably matplotlib.pyplot and seaborn; confirm and import them
# before this cell runs.
plt.figure(figsize=(20,10))
# NOTE(review): the style is switched after the figure has been created —
# confirm the dark background is applied to this figure as intended.
plt.style.use("dark_background")

# Line chart of count_df: x is the 'index' column, y is the 'count' column.
# NOTE(review): `palette` is passed without `hue` — verify it has any effect.
chart = sns.lineplot(x='index',
                     y='count',
                     linestyle='-', marker='o',
                     palette='bright',  markeredgecolor="black",   
                     data=count_df
                    )
# Switch the y axis to a log scale.
chart.set(yscale='log')
chart.set_ylabel('Ilość wystąpień (logarytmicznie)')
chart.set_xlabel('Rank wyrazu')
chart.set_title('Wyrazy w artykułach NYT')
plt.show();