# Introdução à Ciência de Dados - Trabalho Módulo 1

## Criando um dataset de carros usados coletados a partir da OLX

### Sumário

1. [Definição do problema](#Definição-do-problema)
2. [Extraindo as urls de cada página](#Extrair-as-urls-dos-carros-de-cada-página-da-OLX)
3. [Extraindo as informações de cada carro a partir das urls](#Extraindo-as-informações-de-cada-carro-a-partir-das-urls)
    - [Usando um único processo](#Single-Thread-Approach)
    - [Usando múltiplos processos (bem mais rápido)](#Multithread-approach)
4. [Salvando o dataset]()
5. [Análises iniciais do dataset]()
6. [Filtrando amostras]()

## Definição do problema


lorem ipsum etc

In [1]:
import pandas as pd
import numpy as np
import json
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

In [2]:
# OLX listing URL; the page-number query string (`?o=<n>`) is appended to it.
BASE_URL = "https://www.olx.com.br/autos-e-pecas/carros-vans-e-utilitarios"

# Headers passed to `requests.get` on every request made by this notebook.
HEADERS = {
    "authority": "olx.com.br",
    "method": "GET",
    "scheme": "https",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko)"
    ),
}

In [None]:
def collect_links(n_pages=5, links_path='links.txt'):
    """Scrape ad URLs from the OLX listing pages and append them to a file.

    Args:
        n_pages: Number of listing pages to visit, or the string ``'all'``
            to visit every page reported by ``get_number_pages``. A value
            larger than the reported total is clamped to that total.
        links_path: Text file the URLs are appended to, one per line.
    """
    total_pages = get_number_pages()

    # The string check must come first so 'all' is never compared to an int.
    if n_pages == 'all' or n_pages > total_pages:
        n_pages = total_pages

    for page in tqdm(range(n_pages)):
        # `range` is 0-based; the `o` query parameter is sent starting at 1.
        url = f'{BASE_URL}?o={page+1}'
        response = requests.get(url=url, headers=HEADERS)

        soup = BeautifulSoup(response.content, 'lxml')
        items = soup.find_all('li', {'class': 'sc-1fcmfeb-2 juiJqh'})

        hrefs = []
        for item in items:
            # Items with no anchor, or whose first anchor has no href, are
            # skipped instead of aborting the whole crawl.
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                hrefs.append(href)

        # One append per page (not per link) keeps progress on disk while
        # avoiding reopening the file for every single URL. The file is only
        # created once there is something to write.
        if hrefs:
            with open(links_path, 'a') as outfile:
                outfile.writelines(f'{href}\n' for href in hrefs)

In [None]:
def get_number_pages():
    """Return the number of listing pages available at ``BASE_URL``.

    The first listing page is downloaded and the total number of ads is read
    from its results-summary ``span``: the token that follows the last
    ``'de '`` in the text, with ``.`` thousands separators removed. That
    total is divided by 50 ads per page and rounded up, so a final,
    partially filled page is counted too.

    Returns:
        The page count as an ``int``.

    Raises:
        IndexError: If the summary element is not present in the page.
        ValueError: If the extracted token is not an integer.
    """
    ITEMS_PER_PAGE = 50

    response = requests.get(url=BASE_URL, headers=HEADERS)
    soup = BeautifulSoup(response.content, 'lxml')
    results = soup.find_all('span', {'class': 'sc-1mi5vq6-0 eDXljX sc-ifAKCX fhJlIo'})

    summary = results[0].contents[0]
    n_cars_found = int(summary.split('de ')[-1].split(' ')[0].replace('.', ''))

    # Ceiling division: flooring would drop the last, partially filled page.
    return (n_cars_found + ITEMS_PER_PAGE - 1) // ITEMS_PER_PAGE

In [3]:
def _first_text(node, tag_name, css_class):
    """Return, as a plain ``str``, the first child of the first matching tag.

    Raises ``IndexError`` when no tag matches or the tag has no children.
    """
    # str() detaches the value from the parse tree: a NavigableString keeps a
    # reference to the whole document, which wastes memory and makes the
    # result expensive to pickle when it is sent back from a worker process.
    return str(node.find_all(tag_name, {'class': css_class})[0].contents[0])


def extract_car_info(link):
    """Download one OLX ad page and return its details as a dict.

    Args:
        link: URL of the ad. Surrounding whitespace (such as the newline
            left by ``readlines``) is removed before the request is made.

    Returns:
        A dict mapping each characteristic and location label found on the
        page to its value (plain strings; a location value is ``None`` when
        its element is missing), plus a ``'url'`` key holding the stripped
        link.

    Raises:
        IndexError: If a characteristic has no label, or has neither a text
            value nor a link value; or if a location entry has no label.
    """
    link = link.strip()
    response = requests.get(url=link, headers=HEADERS)
    soup = BeautifulSoup(response.content, 'lxml')

    car = {}

    # Characteristics
    items = soup.find_all('div', {'class': 'sc-hmzhuo eNZSNe sc-jTzLTM iwtnNi'})
    for item in items:
        title = _first_text(item, 'span', 'sc-ifAKCX dCObfG')
        try:
            value = _first_text(item, 'span', 'sc-ifAKCX cmFKIN')
        except IndexError:
            # Some values are rendered as a link instead of plain text.
            value = _first_text(item, 'a', 'sc-57pm5w-0 XtcoW')

        car[title] = value

    # Location
    locations = soup.find_all('div', {'class': 'sc-hmzhuo sc-1f2ug0x-3 ONRJp sc-jTzLTM iwtnNi'})
    for location in locations:
        title = _first_text(location, 'dt', 'sc-1f2ug0x-0 cLGFbW sc-ifAKCX cmFKIN')
        try:
            value = _first_text(location, 'dd', 'sc-1f2ug0x-1 ljYeKO sc-ifAKCX kaNiaQ')
        except IndexError:
            value = None

        car[title] = value

    car['url'] = link
    return car

In [None]:
def proccess_links(file):
    """Scrape every ad URL listed in ``file`` and save them to ``olx_cars.csv``.

    Args:
        file: Path of a text file containing one ad URL per line.
    """
    # Read from the path that was passed in (it used to be ignored in favour
    # of a hard-coded name), dropping line terminators and blank lines so
    # only real URLs are requested.
    with open(file, 'r') as infile:
        links = [line.strip() for line in infile if line.strip()]

    cars = [extract_car_info(link) for link in tqdm(links)]

    df = pd.DataFrame(cars)
    df.to_csv('olx_cars.csv')

## Extrair as urls dos carros de cada página da OLX

In [4]:
# File that holds the scraped ad URLs, one per line.
links_file = 'links.txt'
# Only crawl the listing pages when the file does not exist yet; an existing
# file is reused as-is.
if not os.path.exists(links_file):
    collect_links(n_pages='all')

## Extraindo as informações de cada carro a partir das urls

### Single Thread Approach

In [None]:
# Scrape the collected links one at a time; the result is written to
# olx_cars.csv.
proccess_links(links_file)

### Multithread approach

In [5]:
import sys
# NOTE(review): raises the interpreter's recursion limit to 10000; presumably
# needed so the scraped values returned by the worker processes below can be
# pickled without hitting the limit — confirm.
sys.setrecursionlimit(10000)

In [None]:
from multiprocessing import Pool

# Read the collected ad URLs, dropping line terminators and blank lines so
# the workers receive clean URLs.
with open(links_file, 'r') as infile:
    urls = [line.strip() for line in infile if line.strip()]

# Only the first 100000 links are scraped.
urls = urls[:100000]

# Scrape in chunks; each chunk is saved to its own CSV named after the index
# of its first link, so finished chunks are already on disk if a later one
# fails.
chunk_size = 8000
for i in range(0, len(urls), chunk_size):
    print(i)
    url_chunk = urls[i:i+chunk_size]

    # A fresh pool per chunk. The context manager terminates the workers even
    # when scraping raises, so no processes are left behind.
    with Pool(12) as pool:
        results = list(
            tqdm(pool.imap_unordered(extract_car_info, url_chunk), total=len(url_chunk))
        )
    pool.join()

    df = pd.DataFrame(results)
    df.to_csv(f'olx_cars_{i}.csv')

    # Release this chunk's data before the next chunk is scraped.
    del results, df, url_chunk, pool

0


100%|██████████| 8000/8000 [07:48<00:00, 17.07it/s]


8000


  7%|▋         | 594/8000 [00:18<03:19, 37.14it/s]