In [1]:
from bs4 import BeautifulSoup
import requests
import datetime
import time


# Функция возвращает list of dict по каждому фильму
def get_films(n: int, time_delay: float) -> list:
    if (type(n) is not int) or (type(time_delay) is not float):
        raise TypeError('Wrong type in parameters')
    if time_delay < 0:
        raise ValueError('Time delay should be positive')
    if n > 500 or n <= 0:
        raise ValueError('Фильмы из топ 500, 0 < n <= 500')
    req_n = 0
    films_arr = []
    if n < 101:
        req_n = 1
    elif 100 < n < 201:
        req_n = 2
    elif 200 < n < 301:
        req_n = 3
    elif 300 < n < 401:
        req_n = 4
    elif 400 < n < 501:
        req_n = 5
    count = 0
    for i in range(req_n):
        time.sleep(time_delay)
        adr = 'https://www.kinoafisha.info/rating/movies/?page=' + str(i)
        req = requests.request('GET', adr)
        soup = BeautifulSoup(req.text, 'html.parser')
        films = soup.find_all(attrs={'class': "films_content"})
        for j in range(min(100, n-100*i)):
            film = films[j]
            name = film.find('a', class_="films_name ref").text
            rating = float(film.find('span', class_="rating_num").text)
            year = film.find('span', class_="films_info").text.split(', ')[0]
            url = 'https://www.kinoafisha.info/' + film.find('a').attrs['href']
            _utc_timestamp = str(datetime.datetime.utcnow()).split('.')[0]
#             films_arr.append({'name': name,
#                                'rating': rating,
#                                'year': year,
#                                'url': url,
#                                '_utc_timestamp': _utc_timestamp})
            yield {'name': name,
                   'rating': rating,
                   'year': year,
                   'url': url,
                   '_utc_timestamp': _utc_timestamp}
    # return films_arr

In [2]:
# выкачиваем данные по 150 фильмам с задержкой 0.7с
# films_all = get_films(150, 0.7)
films_all = list(get_films(17, 0.7))
# отсортируем список по рейтингу фильма
films_all = sorted(films_all, key=lambda x: x['rating'], reverse=True)
if len(films_all) >= 3:
    [print(i) for i in films_all[:3]]

{'name': 'FORD против FERRARI', 'rating': 9.1, 'year': '2019', 'url': 'https://www.kinoafisha.info//movies/8354457/', '_utc_timestamp': '2020-06-22 12:58:58'}
{'name': 'Зеленая миля', 'rating': 9.0, 'year': '1999', 'url': 'https://www.kinoafisha.info//movies/4982409/', '_utc_timestamp': '2020-06-22 12:58:58'}
{'name': 'Побег из Шоушенка', 'rating': 9.0, 'year': '1994', 'url': 'https://www.kinoafisha.info//movies/7731571/', '_utc_timestamp': '2020-06-22 12:58:58'}


In [3]:
from collections import Counter
from collections import OrderedDict


# Собираем словарь вида:
# "год выхода фильма -> количество фильмов из топ-500, вышедших в этот год"
y_min = int(films_all[0].get('year'))
y_max = int(films_all[0].get('year'))
y_popular = int(films_all[0].get('year'))
y_popular_c = 0

od = {}
my_dict = Counter()

for film in films_all:
    curr_year = int(film.get("year"))
    if curr_year < y_min:
        y_min = curr_year
    elif curr_year > y_max:
        y_max = curr_year
    if str(curr_year) in my_dict.keys():
        my_dict[str(curr_year)] += 1
    else:
        my_dict[str(curr_year)] = 1
    if my_dict[str(curr_year)] > y_popular:
        y_popular = curr_year
        y_popular_c = my_dict[str(y)]

# print(my_dict)

# Выводим самый популярный год, а также минимальный и максимальный
print('Most popular:', y_popular)
print('Year min:', y_min)
print('Year max:', y_max)

for key, value in od.items():
    print(key, value)

Most popular: 2019
Year min: 1961
Year max: 2020


In [4]:
import csv
import json
from typing import List, Dict, Union, Any


def dump_json(film_list: List[Dict[str, Any]], path: str) -> None:

    with open(path, 'w', encoding='utf-8') as outfile:
        for l in film_list:
            json_record = json.dumps(l, ensure_ascii=False)
            outfile.write(json_record)


def dump_jsonl(film_list: List[Dict[str, Any]], path: str) -> None:

    with open(path, 'w', encoding='utf-8') as outfile:
        for l in film_list:
            print(l)
            json_record = json.dumps(l, ensure_ascii=False)
            outfile.write(json_record + '\n')


def dump_csv(film_list: List[Dict[str, Any]], path: str) -> None:

    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['name', 'rating', 'year', 'url', '_utc_timestamp']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for l in film_list:
            writer.writerow(l)


def dump_tsv(film_list: List[Dict[str, Any]], path: str) -> None:

    with open(path, 'wt', encoding='utf-8') as tsvfile:
        fieldnames = ['name', 'rating', 'year', 'url', '_utc_timestamp']
        tsv_writer = csv.DictWriter(tsvfile,
                                    fieldnames=fieldnames,
                                    delimiter='\t')

        for l in film_list:
            tsv_writer.writerow(l)


# Записываем данные в файлы
# Функция должна иметь следующую сигнатуру (набор и тип аргументов)
def dump(film_list: List[Dict[str, Any]],
         path: str,
         filetype: str,
         order_by: str,  # поле словарей, по которому будем сортировать
         ascending: bool) -> None:
    if filetype not in ['json', 'jsonl', 'csv', 'tsv']:
        raise TypeError('Wrong filetype in parameters')
    if order_by not in ['name', 'rating', 'year', 'url', '_utc_timestamp']:
        raise TypeError('Wrong order_by in parameters')
    rvrs = not ascending
    # сортируем по выбранному полю
    films_sorted = sorted(film_list, key=lambda i: i[order_by], reverse=rvrs)
    if filetype is 'json':
        dump_json(films_sorted, path)
    elif filetype is 'jsonl':
        dump_jsonl(films_sorted, path)
    elif filetype is 'csv':
        dump_csv(films_sorted, path)
    elif filetype is 'tsv':
        dump_tsv(films_sorted, path)


# Записываем tab-separated файл, отсортированный по убыванию времени выгрузки.
dump(films_all,
     r"myfile.json",
     filetype='json',
     order_by='rating',
     ascending=True)

## 2 задание

In [5]:
from random import randrange
# Class A 1.0.0.1 to 126.255.255.254
# Class B 128.1.0.1 to 191.255.255.254
# Class C 192.0.1.1 to 223.255.254.254

MAX_VAL = 255


def random_ip() -> str:
    p1 = randrange(0, MAX_VAL)
    p2 = randrange(0, MAX_VAL)
    p3 = randrange(0, MAX_VAL)
    p4 = randrange(0, MAX_VAL)
    ip = str(p1)+'.'+str(p2)+'.'+str(p3)+'.'+str(p4)
    return ip


random_ip()

'40.221.83.211'

In [6]:
import requests
import time
from random import uniform

list_time_zones = []


def ip_list_to_json(local_list: List[str],
                    path: str):
    with open(path, 'w', encoding='utf-8') as outfile:
        for l in local_list:
            url = "https://freegeoip.app/json/" + l
            headers = {
                'accept': "application/json",
                'content-type': "application/json"
            }
            response = requests.request("GET", url, headers=headers)
            ip_dict = response.json()
            ip_str = response.text
            tz = ip_dict.get('time_zone')
            if len(tz) > 0 and tz not in list_time_zones:
                list_time_zones.append(tz)

            # print(tz)
            json_record = json.dumps(ip_str, ensure_ascii=False)
            # print(ip_str)

            outfile.write(ip_str)
            time.sleep(uniform(0.5, 1))

In [7]:
ip_list = []

for i in range(100):
    ip = random_ip()
    while ip in ip_list:
        ip = random_ip()
    ip_list.append(ip)

ip_list_to_json(ip_list, r'ip_data.json')

In [8]:
import datetime
import time
import pytz

# random time moment in 2019
random_moment = randrange(1546300800, 1577836799)
d_local = datetime.datetime.utcfromtimestamp(random_moment).isoformat()

timestring = d_local
d = datetime.datetime.strptime(d_local, "%Y-%m-%dT%H:%M:%S")
d = pytz.timezone('UTC').localize(d)

for tz_local in list_time_zones:
    d = d.astimezone(pytz.timezone(tz_local))
    print(d.tzinfo) # Return time zone info
    print(d.strftime("%Y-%m-%d %H:%M:%S"))
    print()


America/Chicago
2019-03-04 08:12:05

Asia/Riyadh
2019-03-04 17:12:05

Europe/Berlin
2019-03-04 15:12:05

Asia/Shanghai
2019-03-04 22:12:05

America/New_York
2019-03-04 09:12:05

Asia/Nicosia
2019-03-04 16:12:05

Europe/London
2019-03-04 14:12:05

Europe/Helsinki
2019-03-04 16:12:05

Asia/Tokyo
2019-03-04 23:12:05

America/Santiago
2019-03-04 11:12:05

Asia/Seoul
2019-03-04 23:12:05

Europe/Oslo
2019-03-04 15:12:05

America/Sao_Paulo
2019-03-04 11:12:05

Europe/Istanbul
2019-03-04 17:12:05

Europe/Budapest
2019-03-04 15:12:05

Australia/Sydney
2019-03-05 01:12:05

America/Phoenix
2019-03-04 07:12:05

Europe/Bratislava
2019-03-04 15:12:05

Europe/Paris
2019-03-04 15:12:05

Africa/Cairo
2019-03-04 16:12:05

Europe/Stockholm
2019-03-04 15:12:05

America/Denver
2019-03-04 07:12:05

Europe/Lisbon
2019-03-04 14:12:05

Europe/Moscow
2019-03-04 17:12:05

Asia/Taipei
2019-03-04 22:12:05

Europe/Madrid
2019-03-04 15:12:05

Europe/Amsterdam
2019-03-04 15:12:05

America/Los_Angeles
2019-03-04 06:12