# Investigando a evolução do IBOPE das emissoras brasileiras

Neste trabalho, buscamos determinar como evoluiu a audiência das emissoras de televisão aberta nos últimos anos. Para isso, utilizaremos os dados disponíveis na página da [Kantar Ibope Media](https://www.kantaribopemedia.com/).

pseudo-código:

In [None]:
import time
import urllib
import urllib.request
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were removed afterwards, so use whichever
# name this installation actually provides (newer name first).
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

In [None]:
# Get_URLs:
# Base address; the scraper builds each article URL as `domain + link`.
domain = 'https://www.kantaribopemedia.com'

# One entry per listing page; each entry is the list of links found on it.
link_list = []

# Accumulator that the scraping loop extends with the rows of every article.
# It must exist before that loop runs (it used to be commented out, which
# raised NameError on a fresh run). Rows gathered by an earlier run of this
# cell are kept, so different page ranges can be scraped in several passes.
complete_data = globals().get('complete_data', [])

def get_link(link):
    """Return the article links found on one listing page.

    Downloads ``link``, parses it with BeautifulSoup and, for every
    ``<article>`` element, takes the ``href`` of the anchor inside its
    ``<h2>`` heading.

    Parameters
    ----------
    link : str
        URL of the listing page to download.

    Returns
    -------
    list
        The ``href`` values in document order (``None`` for an anchor that
        has no ``href`` attribute).

    Raises
    ------
    AttributeError
        If an ``<article>`` has no ``<h2>`` containing an ``<a>``.
    """
    # The document is read while the soup is built, so the HTTP response can
    # be closed right after instead of being left open.
    with urllib.request.urlopen(link) as page:
        soup = BeautifulSoup(page)

    return [article.h2.a.get('href') for article in soup.find_all('article')]


# Scrapper:
def scrapper(link, domain):
    """Download one ranking article and extract its audience rows.

    Parameters
    ----------
    link : str
        Article path; it is appended to ``domain`` to form the full URL.
    domain : str
        Site base address.

    Returns
    -------
    tuple
        ``(data, cols)`` exactly as returned by ``channel_finder`` for the
        channels Band, Globo, Record and SBT.

    Raises
    ------
    ValueError
        If the date taken from the page title does not match ``%d/%m/%Y``.
    """
    complete_link = domain + link

    # The document is read while the soup is built, so the HTTP response can
    # be closed right after instead of being left open.
    with urllib.request.urlopen(complete_link) as page:
        soup = BeautifulSoup(page)

    # Last 10 characters of the <h1> title: presumably either a complete
    # 'dd/mm/YYYY' date or the tail of a 'dd/mm a dd/mm' range, which
    # test_fmt_data completes with a year.
    day_raw = soup.h1.contents[0][-10:]
    day_raw = test_fmt_data(day_raw, soup)

    day = datetime.strptime(day_raw, '%d/%m/%Y')
    print(day)  # progress feedback while scraping

    tables = soup.find_all('table')
    channels = ['Band', 'Globo', 'Record', 'SBT']

    return channel_finder(channels, tables, day)


# Get the information from each channel
def channel_finder(ch_list, tables, day):
    """Extract the programme rows of the requested channels from a page.

    Each channel is looked up by the fixed position of its table in
    ``tables``. Within a table, row 0 carries the channel name in a ``<th>``,
    the first three ``<td>`` of row 2 carry the metric column names, and
    every row from index 4 on is one programme (name cell followed by three
    numeric cells written with ``.`` as thousands and ``,`` as decimal
    separator).

    Parameters
    ----------
    ch_list : iterable of str
        Channel names; each must be one of 'Band', 'Globo', 'Record',
        'RedeTV' or 'SBT'.
    tables : sequence
        The ``<table>`` elements of the page (objects exposing the
        BeautifulSoup ``find`` / ``find_all`` / ``contents`` interface).
    day : object
        Value stored in the 'Data' column of every row.

    Returns
    -------
    tuple
        ``(data, cols)``. ``data`` is a list of rows
        ``[channel, day, programme, metric1, metric2, metric3]``; ``cols``
        holds the column names of the last table processed (an empty list
        when no table was processed).

    Raises
    ------
    ValueError
        If a name in ``ch_list`` is not a known channel.
    """
    # Position of each channel's table on the page.
    table_index = {'Band': 0, 'Globo': 1, 'Record': 2, 'RedeTV': 3, 'SBT': 4}

    data = []
    cols = []
    for chn in ch_list:
        if chn not in table_index:
            raise ValueError('ERRO! {} não é um canal válido'.format(chn.upper()))

        try:
            find_chn = tables[table_index[chn]].find_all('tr')
        except IndexError:
            # The page has fewer tables than expected. Skip this channel:
            # falling through would re-read the previous channel's rows and
            # append them a second time.
            continue

        channel_name = str(find_chn[0].find('th').contents[0]).upper()
        cols = ['Canal', 'Data', 'Programa']
        cols.extend(str(cell.contents[0]) for cell in find_chn[2].find_all('td')[0:3])

        for row in find_chn[4:]:
            cells = row.find_all('td')
            try:
                name = [str(part).upper() for part in cells[0]]
                num = [
                    float(str(cell.contents[0]).replace('.', '').replace(',', '.'))
                    for cell in cells[1:4]
                ]
            except IndexError:
                # Missing or empty cell: keep a placeholder row so the gap
                # stays visible in the data.
                name = ['ERROR']
                num = [np.nan, np.nan, np.nan]
            except ValueError:
                # A numeric cell that is not a number: drop the row.
                continue

            # A name cell without any content would otherwise crash below.
            program = name[0] if name else 'ERROR'
            data.append([channel_name, day, program] + num)

    return data, cols


def test_fmt_data(day_raw, soup):
    if 'a' in day_raw:
        raw = day_raw[-5:]
        raw_month = int(raw[3:5])
        
        post = str(soup.time.contents[0])
        post_month = int(post[3:5])
        post_year = int(post[-4:])
        
        if post_month == raw_month:
            raw = raw + '/' + str(post_year)
            return raw
            
        elif post_month < raw_month:
            year = post_year - 1
            raw = raw + '/' + str(year)
            return raw
        
        elif post_month > raw_month:
            raw = raw + '/' + str(post_year)
            return raw
            #raise Exception('ERRO: O mês do post ({}) é maior que o mês de análise({})'.format(post_month, raw_month))
    else:
        return day_raw
    

# Collect the article links of listing pages 12 to 22, one list per page.
PAGE_URL = 'https://www.kantaribopemedia.com/conteudo/dados-rankings/audiencia-tv-15-mercados/page/{}/'
for PAGE in range(12, 23):
    link = PAGE_URL.format(PAGE)
    link_list.append(get_link(link))
    time.sleep(1)  # pause between listing requests


count = 0

# Scrape every collected article, one listing page ("line") at a time.
for count, line_link in enumerate(link_list, start=1):
    for link in line_link:
        part_data, columns = scrapper(link, domain)
        complete_data.extend(part_data)

    print('Line{}'.format(count))
    time.sleep(5)  # longer pause after each listing page's articles

print('DONE!')

In [None]:
# One row per scraped programme, using the column names returned by the scraper.
df = pd.DataFrame(data=complete_data, columns=columns)

# Aggregate the three metrics per channel and date (pivot_table's default
# aggregation is the mean).
metric_columns = ['Audiência Domiciliar', 'Audiência Individual', 'COV % Individual']
means = pd.pivot_table(df, index=['Canal', 'Data'], values=metric_columns)

In [None]:
# Display the aggregated 'Audiência Individual' values whose channel
# ('Canal', first index level) is SBT.
means['Audiência Individual']['SBT']

In [None]:
%matplotlib notebook

# Plot the three aggregated metrics for GLOBO, limited to the first 170
# entries, each under its own legend label.
globo_curves = (
    ('Audiência Individual', 'GLOBO INDV.'),
    ('Audiência Domiciliar', 'GLOBO DOM.'),
    ('COV % Individual', 'GLOBO COV'),
)
for metric, legend_label in globo_curves:
    means[metric]['GLOBO'][:170].plot(label=legend_label)

plt.legend(prop={'size': 10});

In [None]:
# Debugging aid: download and parse two sample articles for comparison.
# 'errado' / 'certo' are Portuguese for wrong / right; presumably the first
# page is one the scraper mishandles and the second one it parses correctly.
errado_url = 'https://www.kantaribopemedia.com/dados-de-audiencia-nas-15-pracas-regulares-com-base-no-ranking-consolidado-1604-a-2204/'
certo_url = 'https://www.kantaribopemedia.com/dados-de-audiencia-nas-15-pracas-regulares-com-base-no-ranking-consolidado-1510-a-2110/'

errado_page = urllib.request.urlopen(errado_url)
certo_page = urllib.request.urlopen(certo_url)

certo_soup = BeautifulSoup(certo_page)
errado_soup = BeautifulSoup(errado_page)

In [None]:
# Debugging aid: replay the row parsing on the first four tables of the
# 'errado' page, printing every intermediate value so the failing row shows.
tables = errado_soup.find_all('table')
for c in range(4):
    find_chn = tables[c].find_all('tr')
    values, programs = [], []
    print(c)
    name = 0
    # Programme rows start at index 4 of each table.
    for j in find_chn[4:]:
        cells = j.find_all('td')
        name = [str(piece).upper() for piece in cells[0]]
        print(name)
        num = [
            float(str(cell.contents[0]).replace('.', '').replace(',', '.'))
            for cell in cells[1:4]
        ]
        print(num)
        values.append(num)
        programs.append(name)


In [None]:
# Display the first cell of the 10th programme row (rows from index 4 on)
# in the fifth table of the 'certo' page.
certo_soup.find_all('table')[4].find_all('tr')[4:][9].find_all('td')[0]

In [None]:
# Display how many <table> elements the 'errado' page contains.
len(errado_soup.find_all('table'))#[4].find_all('tr')[4:][9].find_all('td')[0])

### 2018-10-14 00:00:00