Skip to content
This repository has been archived by the owner on Oct 1, 2018. It is now read-only.

Scraping Artesania #10

Closed
UlisesGascon opened this issue Jun 19, 2018 · 3 comments
Closed

Scraping Artesania #10

UlisesGascon opened this issue Jun 19, 2018 · 3 comments
Assignees
Labels

Comments

@UlisesGascon
Copy link
Contributor

UlisesGascon commented Jun 19, 2018

Duda sobre Scraping

img

@UlisesGascon
Copy link
Contributor Author

UlisesGascon commented Jun 20, 2018

@Craftmeet Este sería el script en Node.js. Mañana lo comentamos en clase para entender un poco mejor el tema del scraping. En total tenemos 1326 registros; quedaría retocar un poco algunos campos para hacerlo más interesante ;-)

const puppeteer = require('puppeteer'),
    uuidv4 = require('uuid/v4'),
    fs = require("fs");

/**
 * Builds a human-readable summary of the time elapsed since `origin`.
 *
 * The elapsed time is first rounded to the nearest whole minute and only then
 * split into days, hours and minutes. Rounding before splitting lets a
 * rounded-up minute carry into the hour (and day) instead of producing
 * output such as "0 hours and 60 minutes".
 *
 * @param {Date} origin - Instant at which the measured work started.
 * @returns {string} Message of the form
 *   "Execution in total: D days, H hours and M minutes".
 */
function totalTime(origin){
    const MS_PER_MINUTE = 60000;
    const MINUTES_PER_HOUR = 60;
    const MINUTES_PER_DAY = 24 * MINUTES_PER_HOUR;

    const diffMs = (new Date() - origin); // milliseconds between now & origin
    const totalMins = Math.round(diffMs / MS_PER_MINUTE);

    const diffDays = Math.floor(totalMins / MINUTES_PER_DAY);
    const diffHrs = Math.floor((totalMins % MINUTES_PER_DAY) / MINUTES_PER_HOUR);
    const diffMins = totalMins % MINUTES_PER_HOUR;
    return `Execution in total: ${diffDays} days, ${diffHrs} hours and ${diffMins} minutes`;
}

/**
 * Scrapes one page of the artisan registry and returns its records.
 *
 * A headless browser is launched for the page, the `#report` table is read
 * inside the browser context, and every resulting record is tagged with a
 * freshly generated `uuid`. The browser is always closed, even when
 * navigation or extraction fails.
 *
 * @param {number} pageNumber - Value sent as the `pagina` query parameter.
 * @returns {Promise<Object[]>} Records found on the page. Each one has the
 *   keys "Artesano", "Nombre comercial", "Provincia" and "uuid", plus one key
 *   per detail line found in the row that follows it.
 * @throws Rejects with the underlying error after logging the failing page.
 */
async function pageScraper(pageNumber) {
    console.log("[INFO][Artesanos][Scraper][Page] Just started page:", pageNumber);
    const url = `http://www.juntadeandalucia.es/turismoydeporte/opencms/system/modules/com.saga.ctc.application/elements/registro-artesanos/jsonRegistro.jsp?idP=0&idM=0&idO=0&idT=0&pagina=${pageNumber}`;

    try {
        const browser = await puppeteer.launch();
        let artesanos;
        try {
            const page = await browser.newPage();
            await page.goto(url);
            // This callback runs inside the page, so it must be self-contained:
            // it cannot reference any variable from the Node.js scope.
            artesanos = await page.evaluate(() => {
                const artesanosTabla = document.querySelectorAll("#report > tbody > tr");

                let currentArtesano = 0;
                const artesanos = [];
                // Row 0 is skipped (presumably a header row; confirm against
                // the page). From there rows alternate: odd index = summary
                // row, even index = detail row of the same artisan.
                for (let i = 1; i < artesanosTabla.length; i++) {
                    const elementoActual = artesanosTabla[i];
                    if (i % 2 !== 0) {
                        const bloqueTitulos = elementoActual.querySelectorAll("td");
                        artesanos[currentArtesano] = {
                            "Artesano": bloqueTitulos[1].innerText,
                            "Nombre comercial": bloqueTitulos[2].innerText,
                            "Provincia": bloqueTitulos[3].innerText
                        };
                    } else {
                        const bloqueDetalles = elementoActual.querySelectorAll("ul.no-margin > li");
                        // The first <li> is skipped, as in the original
                        // script (presumably a heading; confirm).
                        for (let j = 1; j < bloqueDetalles.length; j++) {
                            const textoRaw = bloqueDetalles[j].innerText;
                            // Split on the FIRST colon only, so values that
                            // contain colons themselves (URLs, times, ...)
                            // are kept whole instead of being truncated.
                            const separador = textoRaw.indexOf(":");
                            const clave = separador === -1 ? textoRaw : textoRaw.slice(0, separador);
                            const valor = separador === -1 ? undefined : textoRaw.slice(separador + 1);
                            artesanos[currentArtesano][clave] = valor;
                        }
                        currentArtesano++;
                    }
                }
                return artesanos;
            });
        } finally {
            // Always release the Chromium process, including on failure.
            await browser.close();
        }

        // Add UUID
        for (const artesano of artesanos) {
            artesano.uuid = uuidv4();
        }

        console.log("[INFO][Artesanos][Scraper][Page] Just finished page:", pageNumber);
        return artesanos;
    } catch (err) {
        console.log("[ERROR][Artesanos][Scraper][Page] Error:", pageNumber);
        throw err;
    }
}


/**
 * Runs the whole scrape: reads how many pages the registry has, scrapes them
 * one after another, and writes every record to `data/artesanos.json` next
 * to this script.
 *
 * Any failure (pagination lookup, a single page, or the file write) is
 * logged with `console.log`; nothing is thrown to the caller.
 *
 * @returns {Promise<void>} Settles once the run has finished or its error
 *   has been logged. Callers may ignore it.
 */
function scraper () {
    console.log("[INFO][Artesanos][Scraper] Starting now....");
    const startTime = new Date();

    // Counts the <li> entries of the pagination widget on the landing page.
    // NOTE(review): this assumes every <li> is a page link; if the widget
    // also renders "previous"/"next" items the count will be too high.
    const paginationScraper = async () => {
        const browser = await puppeteer.launch();
        try {
            const page = await browser.newPage();
            await page.goto('http://www.juntadeandalucia.es/turismoydeporte/opencms/system/modules/com.saga.ctc.application/elements/registro-artesanos/jsonRegistro.jsp');
            return await page.evaluate(() => {
                return document.querySelectorAll(".pagination > li").length;
            });
        } finally {
            // Always release the Chromium process, including on failure.
            await browser.close();
        }
    };

    const run = async () => {
        const totalPages = await paginationScraper();
        console.log("[INFO][Artesanos][Scraper] Total pages to scrap for pagination:", totalPages);

        // Pages are scraped sequentially on purpose: each pageScraper call
        // launches its own browser, so only one is alive at a time.
        const paginationData = [];
        for (let i = 1; i <= totalPages; i++) {
            paginationData.push(await pageScraper(i));
        }
        // Flatten the per-page arrays into a single list of records.
        const cleanData = [].concat(...paginationData);

        fs.writeFileSync(`${__dirname}/data/artesanos.json`, JSON.stringify(cleanData));
        console.log("[INFO][Artesanos][Scraper] Ended sucesfully. All data is now in the database");
        console.log("[INFO][Artesanos][Scraper]", totalTime(startTime));
    };

    // One catch covers the whole chain, so a failing page or file write is
    // logged instead of becoming an unhandled promise rejection.
    return run().catch(console.log);
}

// Entry point: start the scrape as soon as the script is executed.
scraper();

@UlisesGascon
Copy link
Contributor Author

Resultados completos en varios formatos

Formato: JSON

[{
    "Artesano": "Alberto Yepes Gandullo",
    "Nombre comercial": "CERAMICA EL AVION",
    "Provincia": " CÓRDOBA",
    "Dirección": " CALLE/ Santa Ana nº s/n",
    "Teléfono": " 957684554",
    "Fax": " 957684554",
    "Email": " ceramicaelavion@gmail.com",
    "Municipio": " RAMBLA, LA",
    "uuid": "01bdbe62-bc86-49bb-bd85-8cb4f9d56d9e"
}]

Formato: CSV

Artesano,Nombre comercial,Provincia,Dirección,Teléfono,Fax,Email,Municipio,uuid,Otras actividades,Premios
Alberto Yepes Gandullo,CERAMICA EL AVION, CÓRDOBA, CALLE/ Santa Ana nº s/n, 957684554, 957684554, ceramicaelavion@gmail.com," RAMBLA, LA",01bdbe62-bc86-49bb-bd85-8cb4f9d56d9e,,

ZIP con todos los datos

@Craftmeet
Copy link
Collaborator

Cómo mola!! eres un artista. A ver si me "pispo" de algo en la explicación de mañana

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
Projects
None yet
Development

No branches or pull requests

2 participants