In [1]:
import vk_api
import requests
from bs4 import BeautifulSoup
import re
import sqlite3
# Open the SQLite database file "cinemas.db" and create the cursor that
# every later cell uses to run SQL statements.
conn = sqlite3.connect("cinemas.db")
cursor = conn.cursor()

# Создание таблицы брендов

In [2]:
# DDL for the lookup table of cinema chains (brands).
brand_ddl = '''CREATE TABLE brand(
                    id integer PRIMARY KEY,
                    name text NOT NULL)'''

try:
    cursor.execute(brand_ddl)
except sqlite3.OperationalError:
    # Any OperationalError is reported as "table already exists".
    print('Таблица уже создана!')

In [3]:
# Register the KARO chain under brand id 1; a repeated run violates the
# primary key and is reported instead of raising.
try:
    cursor.execute("insert into brand values (1, 'КАРО')")
except sqlite3.IntegrityError:
    print('Каро уже добавлен!')
else:
    conn.commit()

# Часть1. Создание таблицы кинозалов

In [4]:
# DDL for the table of cinema halls; brand_id references brand(id).
cinema_halls_ddl = """CREATE TABLE cinema_halls(
                id integer PRIMARY KEY,
                brand_id integer Not NULL,
                site_id integer Not NULL,
                name text NOT NULL,
                address text NOT NULL,
                metro text NULL,
                phone text NOT NULL,
                FOREIGN KEY (brand_id) REFERENCES brand(id)
                )"""

try:
    cursor.execute(cinema_halls_ddl)
except sqlite3.OperationalError:
    # Any OperationalError is reported as "table already exists".
    print('Таблица уже создана!')

In [5]:
def remove_all(string):
    """Return the first run of Cyrillic letters, digits and spaces in ``string``.

    The run is stripped of leading/trailing whitespace before it is returned.

    Raises:
        IndexError: if ``string`` contains no character from that set.
    """
    allowed_runs = re.findall(r'[А-Яа-яёЁ0-9 ]+', string)
    first_run = allowed_runs[0]
    return first_run.strip()

In [6]:
def find_all_theaters_KARO(theatres):
    """Build a ``{theatre name: details}`` mapping from KARO theatre list items.

    Args:
        theatres: iterable of parsed list items; each must support
            ``findAll`` and ``item['data-id']``.

    Returns:
        dict keyed by the stripped text of the item's first ``<h4>``. Each value
        holds ``'metro'`` (list of cleaned station names), ``'address'`` (text of
        the first ``<p>`` before the first ``+``), ``'phone'`` (``'+'`` followed by
        the text after the last ``+``) and ``'data-id'``.
    """
    metro_class = 'cinemalist__cinema-item__metro__station-list__station-item'
    result = {}
    for node in theatres:
        stations = [remove_all(li.text) for li in node.findAll('li', class_=metro_class)]
        # The first paragraph holds "<address> +<phone>"; split it once.
        contact_parts = node.findAll('p')[0].text.split('+')
        details = {
            'metro': stations,
            'address': contact_parts[0].strip(),
            'phone': '+' + contact_parts[-1],
            'data-id': node['data-id'],
        }
        title = node.findAll('h4')[0].text.strip()
        result[title] = details
    return result

In [7]:
# Base address of the KARO site and the page that lists its theatres.
url = "https://karofilm.ru"
url_theaters = f"{url}/theatres"

In [8]:
# Download the theatre list page and parse every theatre entry on it.
r = requests.get(url_theaters)
if r.status_code != 200:
    print("Страница не найдена")
else:
    soup = BeautifulSoup(r.text, "html.parser")
    theatres = soup.findAll('li', class_='cinemalist__cinema-item')
    karo_theatres = find_all_theaters_KARO(theatres)

Создаем базу данных кинотеатров

In [9]:
# Store every scraped KARO theatre in `cinema_halls` (brand id 1); ids are
# assigned 1..N in scrape order.
# The values come from a web page, so they are bound as parameters instead of
# being pasted into the SQL text: a quote in a name or address can no longer
# break (or alter) the statement.
karo_hall_rows = [
    (
        hall_id,
        1,
        info['data-id'],
        name,
        info['address'],
        ", ".join(info['metro']),
        info['phone'],
    )
    for hall_id, (name, info) in enumerate(karo_theatres.items(), start=1)
]
with conn:  # commit once on success, roll back if any insert fails
    cursor.executemany(
        "insert into cinema_halls values (?, ?, ?, ?, ?, ?, ?)",
        karo_hall_rows,
    )

# Часть2. Создание таблицы фильмов

In [10]:
# DDL for the table of films.
cinemas_ddl = """CREATE TABLE cinemas(
                id integer PRIMARY KEY,
                site_id integer NOT NULL,
                name text NOT NULL,
                duration integer NOT NULL,
                language text NOT NULL,
                genres text NOT NULL
                )"""

try:
    cursor.execute(cinemas_ddl)
except sqlite3.OperationalError:
    # Any OperationalError is reported as "table already exists".
    print('Таблица уже создана!')

In [11]:
def age(cinema_href):
    """Fetch a film page and return the text of its first ``span.fp_header-age``.

    Args:
        cinema_href: film identifier appended to ``<url>/film/``.

    Raises:
        IndexError: if the page has no ``span`` with class ``fp_header-age``.
    """
    film_url = "/".join((url, "film", cinema_href))
    page = BeautifulSoup(requests.get(film_url).text, "html.parser")
    age_spans = page.findAll('span', class_='fp_header-age')
    return age_spans[0].text

In [12]:
def find_all_cinemas_KARO(cinemas):
    """Build a ``{film title: details}`` mapping from KARO poster items.

    Args:
        cinemas: iterable of parsed poster items; each must support ``findAll``
            and expose the ``data-cinemas`` and ``data-id`` attributes.

    Returns:
        dict keyed by the stripped title. Each value holds ``'age'`` (fetched
        from the film's own page via ``age``), ``'duration'``, ``'language'``
        (always ``'undefined'``), ``'genres'`` (comma-joined, ``'undefined'``
        when the genre paragraph is missing), ``'halls'`` (list of ids from
        ``data-cinemas``) and ``'href'`` (the ``data-id`` attribute).
    """
    films = {}
    for poster in cinemas:
        title = poster.findAll('h3', class_='afisha-title')[0].text.strip()

        genre_tags = poster.findAll('p', class_='afisha-genre')
        if genre_tags:
            genres = ", ".join(genre_tags[0].text.split(' / '))
        else:
            genres = 'undefined'

        raw_halls = poster['data-cinemas']
        halls = raw_halls.split(',') if raw_halls else []

        href = poster['data-id']
        duration = poster.findAll('span', class_="afisha-duration-time")[0].text
        films[title] = {
            'age': age(href),  # one extra HTTP request per film
            'duration': duration,
            'language': 'undefined',
            'genres': genres,
            'halls': halls,
            'href': href,
        }
    return films

In [13]:
# Download the KARO front page and parse every film poster on it.
r = requests.get(url)
if r.status_code != 200:
    print("Страница не найдена")
else:
    soup2 = BeautifulSoup(r.text, "html.parser")
    cinemas = soup2.findAll('div', class_='afisha-item')
    karo_cinemas = find_all_cinemas_KARO(cinemas)

In [14]:
# Store every film scraped from the KARO front page in `cinemas`; ids are
# assigned 1..N in scrape order.
# Titles and genres are scraped text, so they are bound as parameters: a title
# containing an apostrophe used to break the string-built statement.
karo_film_rows = [
    (
        film_id,
        info['href'],
        title,
        info['duration'],
        info['language'],
        info['genres'],
    )
    for film_id, (title, info) in enumerate(karo_cinemas.items(), start=1)
]
with conn:  # commit once on success, roll back if any insert fails
    cursor.executemany(
        "insert into cinemas values (?, ?, ?, ?, ?, ?)",
        karo_film_rows,
    )

# Часть 3. Создание таблицы сеансов

In [15]:
# DDL for the table of sessions; it references both films and halls.
sessions_ddl = """CREATE TABLE sessions(
                id integer PRIMARY KEY,
                cinema_id integer Not NULL,
                hall_id integer Not NULL,
                date date NOT NULL,
                form text NOT NULL,
                time time NOT NULL,
                price integer NOT NULL,
                FOREIGN KEY (cinema_id) REFERENCES cinemas(id),
                FOREIGN KEY (hall_id) REFERENCES cinema_halls(id)
                )"""

try:
    cursor.execute(sessions_ddl)
except sqlite3.OperationalError:
    # Any OperationalError is reported as "table already exists".
    print('Таблица уже создана!')

In [16]:
def get_id(name, base):
    """Find the id of the row in table ``base`` whose ``name`` matches ``name``.

    A row matches when either string contains the other (equal strings
    included). Rows are checked in the order SQLite returns them and the first
    match wins.

    Args:
        name: title to look for.
        base: table name; the table must have ``id`` and ``name`` columns
            (``cinemas`` and ``cinema_halls`` both do).

    Returns:
        The matching row's ``id``, or ``name`` itself when nothing matches.
        Callers use the returned type to tell a hit from a miss.

    Raises:
        sqlite3.OperationalError: if ``base`` does not exist or lacks the
            ``id`` / ``name`` columns.
    """
    # An empty string is a substring of everything, so it must never "match".
    if not name:
        return name
    # Table names cannot be bound as parameters; quote the identifier instead.
    quoted_table = '"' + base.replace('"', '""') + '"'
    # Selecting the columns by name removes the per-table column index, which
    # was left unbound for any table other than the two hard-coded ones.
    rows = cursor.execute(f'select id, name from {quoted_table}').fetchall()
    for row_id, stored_name in rows:
        if not stored_name:
            continue
        if name in stored_name or stored_name in name:
            return row_id
    return name

In [17]:
# Scrape the schedule of every KARO theatre for every date the site offers and
# store one `sessions` row per (film, hall, date, format).

# CSS classes of the two halves of a schedule row: the format label (left)
# and the list of session times (right).
left = "cinema-page-item__schedule__row__board-row__left"
right = "cinema-page-item__schedule__row__board-row__right"
# Age-rating suffixes cut off the film titles on the schedule page.
ogran = [", 0+", ", 6+", ", 12+", ", 16+", ", 18+"]
i = 1  # id of the next row written to `sessions`
for theater in karo_theatres:
    dates_dicti = {}  # date -> film title -> format -> list of session times
    url_sessions = url_theaters + "?id=" + karo_theatres[theater]['data-id']
    r = requests.get(url_sessions)
    if r.status_code == 200:
        soup3 = BeautifulSoup(r.text, "html.parser")
        d = soup3.findAll('select', class_='widget-select')[0]
        dates = [option['data-id'] for option in d.findAll('option')]
        for date in dates:
            url_sessions_date = url_sessions + "&date=" + date
            r2 = requests.get(url_sessions_date)
            dates_dicti[date] = {}
            if r2.status_code == 200:
                films_on_date = BeautifulSoup(r2.text, 'html.parser')
                films_list = films_on_date.findAll('div', class_='cinema-page-item__schedule__row')
                for film in films_list:
                    title = film.findAll('h3')[0].text
                    for org in ogran:
                        if org in title:
                            title = title.split(org)[0].strip()
                    dates_dicti[date][title] = {}
                    for types in film.findAll('div', class_='cinema-page-item__schedule__row__board-row'):
                        vision = types.findAll('div', class_=left)[0].text.strip()
                        session_times = [a.text for a in types.findAll('div', class_=right)[0].findAll('a')]
                        dates_dicti[date][title][vision] = session_times
            else:
                print('Нет даты с url=', url_sessions_date)

    else:
        print('Нет кинотеатра с url=', url_sessions)
    # The hall is the same for every film of this theatre: look it up once.
    hall_id = get_id(theater, "cinema_halls")
    for date, item in dates_dicti.items():
        for name, sess in item.items():
            cinema_id = get_id(name, "cinemas")
            for form, session_times in sess.items():
                # Scraped text is bound as parameters so that quotes in a title
                # or format label cannot break the statement.
                cursor.execute(
                    "insert into sessions values (?, ?, ?, ?, ?, ?, ?)",
                    (i, cinema_id, hall_id, date, form, ", ".join(session_times), 0),
                )
                i += 1
    conn.commit()

# Mori cinema

In [18]:
# Register the Mori cinema chain under brand id 2; a repeated run violates the
# primary key and is reported instead of raising.
try:
    cursor.execute("insert into brand values (2, 'Mori cinema')")
except sqlite3.IntegrityError:
    print('Mori уже добавлен!')
else:
    conn.commit()

Кинозалы Mori

In [19]:
def find_all_theaters_Mori(theatres):
    """Build a ``{theatre name: details}`` mapping from Mori cinema list items.

    Args:
        theatres: iterable of parsed list items supporting ``findAll``.

    Returns:
        dict keyed by the stripped text of the item's first ``<h2>``. Each value
        holds ``'metro'`` and ``'phone'`` (both always ``None``), ``'address'``
        (stripped text of the first ``<p>``) and ``'data-id'`` (the ``href`` of
        the first ``a.btn_cinema`` link).
    """
    result = {}
    for item in theatres:
        address = item.findAll('p')[0].text.strip()
        link = item.findAll('a', class_="btn_cinema", href=True)[0]
        details = {'metro': None, 'address': address, 'phone': None}
        details['data-id'] = link["href"]
        title = item.findAll('h2')[0].text.strip()
        result[title] = details
    return result

In [20]:
# Base address of the Mori cinema site and the page that lists its cinemas.
url = "https://mori-cinema.ru"
url_theaters = f"{url}/cinema"

In [21]:
# Download the Mori cinema list and parse every entry of the first
# `ul.list_cinema` on the page.
r = requests.get(url_theaters)
if r.status_code != 200:
    print("Страница не найдена")
else:
    soup = BeautifulSoup(r.text, "html.parser")
    cinema_list = soup.findAll('ul', class_ ="list_cinema")[0]
    theatres = cinema_list.findAll('li')
    mori_theatres = find_all_theaters_Mori(theatres)

In [22]:
# Append the scraped Mori cinemas to `cinema_halls` (brand id 2).
# Continue numbering after the highest existing id; a row count would collide
# with existing ids as soon as the table has a gap.
n = cursor.execute('select coalesce(max(id), 0) from cinema_halls').fetchone()[0]
with conn:  # commit once on success, roll back if any insert fails
    for theater, info in mori_theatres.items():
        n += 1
        # Values are bound as parameters, so quotes in scraped text cannot
        # break the statement and a missing metro is stored as NULL rather
        # than the text 'None'. `phone` is declared NOT NULL, so a missing
        # phone is stored as an empty string.
        cursor.execute(
            "insert into cinema_halls values (?, ?, ?, ?, ?, ?, ?)",
            (
                n,
                2,
                info['data-id'],
                theater,
                info['address'],
                info['metro'],
                info['phone'] or '',
            ),
        )

Фильмы в Mori

In [23]:
def age_Mori(cinema_href):
    """Fetch a Mori film page and return the last word of its first data line.

    The value is the last whitespace-separated token of the first ``<p>``
    inside the first ``div.films_data``; callers store it as the film's age.

    Args:
        cinema_href: site-relative path of the film page, appended to ``url``.

    Raises:
        IndexError: if the expected ``div``/``<p>`` is missing or empty.
    """
    response = requests.get("".join((url, cinema_href)))
    page = BeautifulSoup(response.text, "html.parser")
    data_block = page.findAll('div', class_='films_data')[0]
    tokens = data_block.findAll('p')[0].text.strip().split()
    return tokens[-1]

In [24]:
def find_all_cinemas_Mori(cinemas):
    """Build a ``{film title: details}`` mapping from Mori film blocks.

    Blocks without a ``span.title`` are skipped.

    Args:
        cinemas: iterable of parsed film blocks supporting ``findAll``.

    Returns:
        dict keyed by the stripped title. Each value holds ``'age'`` (fetched
        via ``age_Mori``), ``'duration'`` (second-to-last word of ``span.name``),
        ``'genres'`` (the words before it, comma-joined), ``'language'``
        (always ``'undefined'``), ``'halls'`` (always empty) and ``'href'``.
        Genres and duration are ``'undefined'`` when ``span.name`` is missing
        or empty.
    """
    films = {}
    for block in cinemas:
        title_spans = block.findAll('span', class_='title')
        if not title_spans:
            continue
        title = title_spans[0].text.strip()

        genres = duration = 'undefined'
        name_spans = block.findAll('span', class_='name')
        if name_spans:
            words = name_spans[0].text.strip().split()
            if words:
                # Index is computed from the length, so a one-word line yields
                # that word as the duration and an empty genre string.
                split_at = len(words) - 2
                genres = ", ".join(words[:split_at])
                duration = words[split_at]

        href = block.findAll('a', href=True)[0]['href']
        films[title] = {
            'age': age_Mori(href),  # one extra HTTP request per film
            'duration': duration,
            'language': 'undefined',
            'genres': genres,
            'halls': [],
            'href': href,
        }
    return films

In [25]:
# Download the Mori film list and parse every `div.films` block on it.
url_films = f"{url}/films"
r = requests.get(url_films)
if r.status_code != 200:
    print("Страница не найдена")
else:
    soup2 = BeautifulSoup(r.text, "html.parser")
    cinemas = soup2.findAll('div', class_='films')
    mori_cinemas = find_all_cinemas_Mori(cinemas)

In [26]:
# Append Mori films to `cinemas`, skipping titles that are already stored.
# Continue numbering after the highest existing id; a row count would collide
# with existing ids as soon as the table has a gap.
n = cursor.execute('select coalesce(max(id), 0) from cinemas').fetchone()[0]
# Read the stored titles once instead of re-reading the table for every film.
known_titles = {row[0] for row in cursor.execute('select name from cinemas').fetchall()}
with conn:  # commit once on success, roll back if any insert fails
    for cinema, info in mori_cinemas.items():
        if cinema in known_titles:
            continue
        n += 1
        # Scraped text is bound as parameters so that quotes in a title cannot
        # break the statement.
        cursor.execute(
            "insert into cinemas values (?, ?, ?, ?, ?, ?)",
            (n, info['href'], cinema, info['duration'], info['language'], info['genres']),
        )
        known_titles.add(cinema)

In [27]:
def str_with_zero(n):
    """Return ``str(n)``, prefixed with ``"0"`` when ``n`` is less than 10."""
    text = str(n)
    return "0" + text if n < 10 else text

In [28]:
def dates_my(line_data, base_year=2020):
    """Convert a scraped ``[day, month abbreviation]`` pair to ``YYYY-MM-DD``.

    The schedule page shows no year. December dates are assigned to the year
    before ``base_year`` and every other month to ``base_year``, which covers
    a schedule spanning a New Year.

    Args:
        line_data: sequence whose first item is the day number and second item
            the Russian month abbreviation (e.g. ``["22", "дек."]``);
            surrounding whitespace is ignored.
        base_year: year of the non-December dates. Defaults to 2020, the value
            that used to be hard-coded.

    Returns:
        The date as a zero-padded ISO string.

    Raises:
        KeyError: if the month abbreviation is unknown.
        ValueError: if the day is not a number.
    """
    month_numbers = {
        "янв.": 1, "фев.": 2, "мар.": 3, "апр.": 4, "май": 5, "июн.": 6,
        "июл.": 7, "авг.": 8, "сен.": 9, "окт.": 10, "ноя.": 11, "дек.": 12,
    }
    month = month_numbers[line_data[1].strip()]
    day = int(line_data[0].strip())
    year = base_year - 1 if month == 12 else base_year
    return f"{year}-{month:02d}-{day:02d}"

In [29]:
def back(date):
    """Return the two calendar days preceding ``date``.

    Args:
        date: ISO date string ``YYYY-MM-DD``.

    Returns:
        ``(two days before, one day before)``, both as zero-padded
        ``YYYY-MM-DD`` strings.

    Raises:
        ValueError: if ``date`` is not a valid calendar date.
    """
    # Local import keeps the cell self-contained.
    import datetime

    # Calendar arithmetic replaces the hand-written month table, which
    # returned unpadded days ("2019-12-3"), dropped a dash for early January
    # ("2019-1231") and used the wrong month lengths around October-December.
    current = datetime.date(int(date[:4]), int(date[5:7]), int(date[8:10]))
    one_day = datetime.timedelta(days=1)
    return (current - 2 * one_day).isoformat(), (current - one_day).isoformat()

In [31]:
# Scrape the schedule of every Mori cinema and store one `sessions` row per
# (film, hall, date, format), skipping rows that are already stored.

# Next free session id: continue after the highest existing id.
iii = cursor.execute('select coalesce(max(id), 0) from sessions').fetchone()[0] + 1
# Sessions already stored, loaded once. The previous per-row check compared a
# tuple of one-element sets against each row and therefore never matched.
known_sessions = set(cursor.execute(
    'select cinema_id, hall_id, date, form, time, price from sessions').fetchall())
for k, v in mori_theatres.items():
    url_sessions = url + v['data-id'] + "schedule.php"
    r = requests.get(url_sessions)
    if r.status_code == 200:
        dic_dates = dict()  # table key -> {"data_num": ISO date, "film": [film dicts]}
        soup = BeautifulSoup(r.text, "html.parser")
        first = soup.find('li', class_='notactivebtn')
        second = soup.find('li', class_='activebtn')
        if first and second:
            # The page labels only the later days; "today" and "tomorrow" are
            # derived as the two days before the earliest labelled date.
            f = dates_my(first.text.strip().split())
            s = dates_my(second.text.strip().split())
            today, tomorrow = back(min(f, s))
            dic_dates["1"] = {"data_num": today}
            dic_dates["2"] = {"data_num": tomorrow}
        else:
            # Without both labels the two dates cannot be derived; their
            # tables are skipped instead of reusing values from a previous
            # cinema (or failing with NameError on the first one).
            print('Не удалось определить даты "сегодня"/"завтра" для', url_sessions)
        dates = soup.findAll('li', class_='activebtn')
        for day_item in dates:
            dic_dates[day_item["data-date"]] = dict()
            dic_dates[day_item["data-date"]]["data_num"] = dates_my(day_item.text.strip().split())
        tables = soup.findAll('table', class_='tbl_timetable')
        for table_my in tables:
            try:
                da_now = table_my["data-table"]
            except KeyError:
                if table_my["data-day"] == "today":
                    da_now = "1"
                else:
                    da_now = "2"
            if da_now in dic_dates.keys():
                sess_trs = table_my.findAll('tr')
                notchange = False  # True while the rows of one film are being read
                dic_film = None  # film currently being assembled for this table
                dic_dates[da_now]["film"] = list()
                for trs_my in sess_trs:
                    kvdmf = trs_my.findAll("th")
                    if (len(kvdmf) == 0) and (trs_my["class"] == [] or trs_my["class"][0] != "first"):
                        if trs_my.text.strip().split() == []:
                            # A blank row closes the current film.
                            notchange = False
                            if dic_film is not None:
                                dic_dates[da_now]["film"].append(dic_film)
                            continue
                        if notchange == False:
                            film_t = trs_my.find("td")
                            if film_t is not None:
                                link = film_t.find("a")
                                title = link.text.strip() if link is not None else "No title"
                                dic_film = dict()
                                dic_film["title"] = title
                            notchange = True
                        if dic_film is None:
                            # No film has been started in this table yet.
                            continue
                        film_all = trs_my.findAll("td")
                        for idx, td in enumerate(film_all):
                            if len(film_all) == 3:
                                # First row of a film: title | format | times
                                if idx == 1:
                                    forma = td.text.strip()
                                    dic_film["format"] = dict()
                                if idx == 2:
                                    links = td.findAll("a", class_="aActualShedule")
                                    dic_film["format"][forma] = [j.text.strip()[:5] for j in links]
                            else:
                                # Follow-up row of the same film: format | times
                                if idx == 0:
                                    forma = td.text.strip()
                                if idx == 1:
                                    links = td.findAll("a", class_="aActualShedule")
                                    dic_film.setdefault("format", dict())[forma] = [j.text.strip()[:5] for j in links]
        # The hall is the same for every film of this cinema: look it up once.
        hall_id = get_id(k, "cinema_halls")
        for jj, item in dic_dates.items():
            # A listed date may have no timetable on the page.
            for fil in item.get('film', []):
                cinema_id = get_id(fil['title'], "cinemas")
                # get_id returns the title itself when the film is unknown.
                # Checking the type (not isdigit) keeps numeric titles such as
                # "1917" from being stored as if they were ids.
                if not isinstance(cinema_id, int):
                    continue
                for form, session_times in fil.get('format', {}).items():
                    t = ", ".join(session_times)
                    row = (cinema_id, hall_id, item['data_num'], form, t, 0)
                    if row in known_sessions:
                        continue
                    # Scraped text is bound as parameters, never pasted into SQL.
                    cursor.execute(
                        "insert into sessions values (?, ?, ?, ?, ?, ?, ?)",
                        (iii,) + row,
                    )
                    known_sessions.add(row)
                    iii += 1
        conn.commit()
    else:
        print("Страница не найдена")

# Викисинема

In [133]:
# Register the WikiCinema chain under brand id 3; a repeated run violates the
# primary key and is reported instead of raising.
try:
    cursor.execute("insert into brand values (3, 'WikiCinema')")
except sqlite3.IntegrityError:
    print('Wiki уже добавлен!')
else:
    conn.commit()

In [134]:
def find_all_theaters_viki(theatres):
    """Build a ``{"name(city)": details}`` mapping from WikiCinema cinema cards.

    For each card the cinema's ``kontakty/`` page is downloaded to read the
    address and phone.

    Args:
        theatres: iterable of parsed cards supporting ``find`` / ``findAll``.

    Returns:
        dict keyed by ``"<name>(<city>)"``. When the contacts page loads, the
        value holds ``'metro'`` (always ``None``), ``'address'``, ``'phone'``
        (with all whitespace removed) and ``'data-id'`` (the card's first link).
        When it does not, the value holds only ``'data-id'``.
    """
    result = {}
    for card in theatres:
        trc_name = card.find('div', class_="cinema-item-trc").text.strip()
        city = card.find('div', class_="cinema-item-city").text.strip()
        href = card.findAll('a')[0]["href"]
        key = trc_name + "(" + city + ")"
        # Placeholder entry that survives if the contacts page is unavailable.
        result[key] = {'data-id': href}

        response = requests.get(href + "kontakty/")
        if response.status_code != 200:
            print("Страница не найдена")
            continue

        contacts = BeautifulSoup(response.text, "html.parser")
        address = contacts.find('div', class_ ="adress").text.strip()
        phone_words = contacts.find('div', class_ ="phone").text.strip().split()
        result[key] = {
            'metro': None,
            'address': address,
            'phone': "".join(phone_words),
            'data-id': href,
        }
    return result

In [64]:
# Download the WikiCinema front page and parse every cinema card on it.
url = "https://wikicinema.ru/"
r = requests.get(url)
if r.status_code != 200:
    print("Страница не найдена")
else:
    soup = BeautifulSoup(r.text, "html.parser")
    theatres = soup.findAll('div', class_ ="cinema-item")
    viki_theatres = find_all_theaters_viki(theatres)

{'Малибу(Липецк)': {'metro': None, 'address': 'Киноплекс «Малибу», г. Липецк, ул. Терешковой, д. 35 б', 'phone': '(4742)51-76-38', 'data-id': 'http://malibu.wikicinema.ru/'}, 'Парк Плаза(Электросталь)': {'metro': None, 'address': 'ул. Корешкова, 3, Электросталь, Московская обл.', 'phone': '', 'data-id': 'http://plaza.wikicinema.ru/'}, 'Выходной(Люберцы)': {'metro': None, 'address': 'г. Люберцы, Московская область, Октябрьский проспект, 112', 'phone': '+7(499)500-49-25', 'data-id': 'http://lubercy.wikicinema.ru/'}, 'ЗигЗаг(Москва)': {'metro': None, 'address': 'г. Москва, ул. Лобненская, д. 4а, ТРЦ «Зиг-Заг»', 'phone': '+7499290-37-09', 'data-id': 'http://zigzag.wikicinema.ru/'}}


In [137]:
# Append the scraped WikiCinema cinemas to `cinema_halls` (brand id 3).
# Continue numbering after the highest existing id; a row count would collide
# with existing ids as soon as the table has a gap.
n = cursor.execute('select coalesce(max(id), 0) from cinema_halls').fetchone()[0]
with conn:  # commit once on success, roll back if any insert fails
    for name, theater in viki_theatres.items():
        n += 1
        # Values are bound as parameters, so quotes in scraped text (addresses
        # here contain «» and may contain apostrophes) cannot break the
        # statement and a missing metro is stored as NULL, not the text 'None'.
        # An entry whose contacts page failed has only 'data-id'; address and
        # phone are NOT NULL columns and fall back to an empty string.
        cursor.execute(
            "insert into cinema_halls values (?, ?, ?, ?, ?, ?, ?)",
            (
                n,
                3,
                theater['data-id'],
                name,
                theater.get('address') or '',
                theater.get('metro'),
                theater.get('phone') or '',
            ),
        )

In [138]:
# Scrape the schedule of every WikiCinema cinema and store one `sessions` row
# per (film, hall, date, format), skipping rows that are already stored.


def _viki_iso_date(raw):
    """Convert the site's ``DD.MM.YYYY`` date attribute to ``YYYY-MM-DD``."""
    return raw[6:] + "-" + raw[3:5] + "-" + raw[:2]


# Next free session id: continue after the highest existing id.
jjj = cursor.execute('select coalesce(max(id), 0) from sessions').fetchone()[0] + 1
# Sessions already stored, loaded once. The previous per-row check compared a
# tuple of one-element sets (holding the row id instead of the date) against
# each row and therefore never matched.
known_sessions = set(cursor.execute(
    'select cinema_id, hall_id, date, form, time, price from sessions').fetchall())
for k, v in viki_theatres.items():
    url_sess = v['data-id']
    r = requests.get(url_sess)
    if r.status_code == 200:
        dic_dates = dict()  # ISO date -> list of {title: film info} dicts
        soup = BeautifulSoup(r.text, "html.parser")
        days = soup.findAll('a', class_ ="afisha-day")
        for day in days:
            if "disabled" not in day["class"]:
                dic_dates[_viki_iso_date(day["data-date"])] = list()
        d_sessions = soup.findAll('div', class_ ="afisha-seance-wrapper")
        for d_session in d_sessions:
            date = _viki_iso_date(d_session["data-date"])
            if date not in dic_dates:
                # Schedule block for a day that is disabled (or absent) in the
                # day selector: skip it instead of failing with KeyError.
                continue
            films_dic = dict()
            for film in d_session.findAll('div', class_ ="afisha-list-item"):
                title = film.find("h3").text
                # The genre line ends with the age rating, e.g. "комедия 12+".
                genre, _, age_mark = film.find("p", class_="film-genre").text.rpartition(" ")
                formats = dict()  # format label -> list of session times
                for seance in film.findAll("div", class_="film-seances-item"):
                    form = seance.find("li")["data-format"].strip()
                    formats.setdefault(form, list()).append(seance.find("a", class_="seance").text)
                films_dic[title] = {"genre": genre, "age": age_mark, "format": formats}
            dic_dates[date].append(films_dic)
        # Store the rows once, after the whole page has been parsed. Doing it
        # inside the per-day loop re-inserted every earlier date again on each
        # iteration.
        hall_id = get_id(k, "cinema_halls")
        for jj, item in dic_dates.items():
            for film in item:
                for name, info in film.items():
                    cinema_id = get_id(name, "cinemas")
                    # get_id returns the title itself when the film is unknown.
                    # Checking the type (not isdigit) keeps numeric titles such
                    # as "1917" from being stored as if they were ids.
                    if not isinstance(cinema_id, int):
                        continue
                    for form, session_times in info['format'].items():
                        t = ", ".join(session_times)
                        row = (cinema_id, hall_id, jj, form, t, 0)
                        if row in known_sessions:
                            continue
                        # Scraped text is bound as parameters, never pasted into SQL.
                        cursor.execute(
                            "insert into sessions values (?, ?, ?, ?, ?, ?, ?)",
                            (jjj,) + row,
                        )
                        known_sessions.add(row)
                        jjj += 1
        conn.commit()
    else:
        print("Страница не найдена")

{'2019-12-22': [{'Полицейский с Рублевки. Новогодний беспредел 2': {'genre': 'комедия', 'age': '12+', 'format': {'2D': ['09:10', '11:05', '13:40', '15:30', '17:20', '18:05', '19:10', '20:00', '21:00', '21:55', '22:50', '23:50']}}, 'Звёздные Войны: Скайуокер. Восход': {'genre': 'фантастика, экшн', 'age': '16+', 'format': {'2D': ['09:00', '10:50', '11:30', '14:10', '16:50', '17:40', '19:30', '21:05', '22:00'], '3D': ['15:25']}}, 'Джуманджи: Новый уровень': {'genre': 'фэнтези, боевик, комедия, приключения', 'age': '12+', 'format': {'2D': ['09:10', '11:30', '15:20', '17:20', '19:40'], '3D-Atmos': ['13:00']}}, 'Холодное сердце 2': {'genre': 'анимация, приключения, семейный', 'age': '6+', 'format': {'2D': ['11:05', '13:05', '15:05', '17:05', '19:05']}}, 'Чёрное рождество': {'genre': 'ужасы, триллер, детектив', 'age': '18+', 'format': {'2D': ['09:00', '13:30', '20:20', '22:20']}}, 'Фиксики против кработов': {'genre': 'анимация, приключения, комедия, музыкальный, семейный', 'age': '6+', 'forma