Projet de web scraping Chuck Norris

Objectifs : 
_ Scraper le site chucknorrisfacts.net avec lecture automatique des pages
_ Stocker les données dans une BDD
_ Dataviz et requêtage

Bibliothèques python utilisées : 
_ psycopg2
_ pandas
_ pygal
_ re
_ requests
_ bs4 (BeautifulSoup)

In [None]:
import requests
import re
from bs4 import BeautifulSoup
import psycopg2, config

# Open the PostgreSQL connection used by the scraper; the credentials are read
# from the local `config` module.
conn = psycopg2.connect(
    database="bdd_fdelys",
    user=config.user,
    password=config.passw,
    host='127.0.0.1',
)
# Cursor shared by the insert helper and by the final row count.
cur = conn.cursor()

# Handle one scraped fact: display it and record it in the database.
def traiteInfo(id, rate, vote, fact):
    """Print one fact and insert it, with its current rating, into the database.

    Args:
        id: fact identifier, written to both tables.
        rate: rating value (printed with two decimals).
        vote: number of votes.
        fact: text of the fact.

    The statements run on the module-level cursor ``cur``; nothing is
    committed here.
    """
    summary = "%4d : %.2f %5d %s" % (id, rate, vote, fact)
    print(summary)

    # A fact whose id is already stored is left untouched.
    insert_fact = """INSERT INTO public."factChuck" VALUES (%s, %s) ON CONFLICT (id) DO NOTHING;"""
    cur.execute(insert_fact, (id, fact))

    # The rating row is stamped with the database server's current date.
    insert_rating = """INSERT INTO public."rateChuck" VALUES (NOW()::Date, %s, %s, %s) ON CONFLICT DO NOTHING;"""
    cur.execute(insert_rating, (id, rate, vote))

# Fetch one listing page and hand every fact found on it to traiteInfo.
def recupPage(page, timeout=30):
    """Download one page of facts and process each fact it contains.

    Args:
        page: page number, inserted into the ``page`` query parameter.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.

    A page answered with an HTTP error status is reported and skipped, and so
    is any block that lacks the id, rating or vote element.
    """
    url = "https://chucknorrisfacts.net/facts.php?page=%d" % (page)
    print("\nRécupération de %s" %(url))
    # Without a timeout a stalled connection would block the whole run forever.
    r = requests.get(
        url,
        headers={"User-Agent": "Mon navigateur perso d'ici"},
        timeout=timeout,
    )
    if not r.ok:
        # Do not try to parse an error page as if it were a listing.
        print("Page ignorée (HTTP %d) : %s" % (r.status_code, url))
        return

    soup = BeautifulSoup(r.content, 'html.parser')
    blocks = soup.select("#content > div:nth-of-type(n+2)")
    for block in blocks:
        fact = block.select_one("p")
        if fact is None:
            continue

        stars = block.select_one("ul.star-rating")
        rate = block.select_one("span.out5Class")
        vote = block.select_one("span.votesClass")
        if stars is None or rate is None or vote is None:
            # Skip an incomplete block instead of crashing on None.
            print("Bloc ignoré : identifiant, note ou votes introuvables")
            continue

        fact_id = stars.attrs['id']
        # The numeric id follows the first six characters of the element id and
        # the vote count precedes the last six characters of the vote text
        # (presumably a fixed prefix / suffix — verify against the page markup).
        traiteInfo(int(fact_id[6:]), float(rate.text), int(vote.text[:-6]), fact.text)

def lastPage(timeout=30):
    """Return the number of the last listing page advertised by the site.

    The number is the first run of digits found in the ``href`` of the last
    link matched inside ``#content`` on the facts index page.

    Args:
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
        IndexError: if no link is found or its href contains no digits.
    """
    url = "https://chucknorrisfacts.net/facts"
    # Without a timeout a stalled connection would block the run forever.
    r = requests.get(
        url,
        headers={"User-Agent": "Mon navigateur perso d'ici"},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.content, 'html.parser')

    links = soup.select("#content a:link")
    if not links:
        raise IndexError("no link found inside #content on %s" % url)

    href = str(links[-1].get('href'))
    page_numbers = re.findall(r'\d+', href)
    if not page_numbers:
        raise IndexError("no page number found in link %r" % href)
    return int(page_numbers[0])

# Scrape every page. Each page is committed as soon as it has been processed so
# that a failure part-way through does not discard the pages already stored.
try:
    for p in range(1, lastPage()+1):
        recupPage(page = p)
        conn.commit()

    # Report how many facts the table now holds.
    cur.execute("""SELECT COUNT(*) FROM "factChuck";""")
    print(cur.fetchall())
    conn.commit()
finally:
    # Always release the connection, even when a request or an insert fails.
    conn.close()

In [47]:
import config
import pandas as pd
import pygal
from pygal.style import RedBlueStyle

# Never truncate long text columns when printing DataFrames. `None` is the
# supported "no limit" value; the former `-1` is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Open a connection for the analysis cell, using the credentials from `config`.
conn = psycopg2.connect(
    database="bdd_fdelys",
    user=config.user,
    password=config.passw,
    host='127.0.0.1',
)
cur = conn.cursor()

# Load the complete fact table into a DataFrame.
sql = 'SELECT * FROM "factChuck"'
dfAll = pd.read_sql(sql, conn)
#print(len(dfAll))

# Queries for each rating bracket.
# The four queries share this template; only the rating bounds differ. The
# bounds are substituted as text so that every generated statement is exactly
# the SQL that used to be written out by hand.
ratingQuery = '''SELECT DISTINCT * FROM "factChuck" INNER JOIN "rateChuck" ON ("factChuck".id="rateChuck".id) WHERE date = '2020-12-16' AND rating BETWEEN {low} AND {high} ORDER BY rating'''

rateUn = ratingQuery.format(low='0.00', high='2.50')
dfUn = pd.read_sql(rateUn,conn)
lenUn = (len(dfUn))

rateDeux = ratingQuery.format(low='2.51', high='3.50')
dfDeux = pd.read_sql(rateDeux,conn)
lenDeux = (len(dfDeux))

rateTrois = ratingQuery.format(low='3.51', high='4.00')
dfTrois = pd.read_sql(rateTrois,conn)
lenTrois = (len(dfTrois))

rateQuatre = ratingQuery.format(low='4.01', high='5.00')
dfQuatre = pd.read_sql(rateQuatre,conn)
lenQuatre = (len(dfQuatre))

# The total and the percentages shown on the chart are derived from the query
# results, so the labels cannot drift away from the data being plotted.
ratingTotal = lenUn + lenDeux + lenTrois + lenQuatre


def _ratingShare(count):
    """Return `count` as a whole-number percentage of the four brackets."""
    return round(100 * count / ratingTotal) if ratingTotal else 0


# Pie chart of the distribution of facts by average rating.
pie_chart = pygal.Pie(style=RedBlueStyle)
pie_chart.title = 'Repartition des facts par moyenne - %d facts' % ratingTotal
pie_chart.add('0.00-2.50 : %d%%' % _ratingShare(lenUn), lenUn)
pie_chart.add('2.51-3.50 : %d%%' % _ratingShare(lenDeux), lenDeux)
pie_chart.add('3.51-4.00 : %d%%' % _ratingShare(lenTrois), lenTrois)
pie_chart.add('4.01-5.00 : %d%%' % _ratingShare(lenQuatre), lenQuatre)
pie_chart.render_in_browser()



# Vote-count queries. Alternative per-fact listing: SELECT DISTINCT * FROM "factChuck" INNER JOIN "rateChuck" ON ("factChuck".id="rateChuck".id) WHERE date = '2020-12-16' AND nbvotes BETWEEN 0 AND 500 ORDER BY nbvotes
# NOTE(review): unlike the rating queries, these sums are not restricted to a
# single date; if "rateChuck" holds rows for several dates, votes are summed
# across all of them — confirm this is intended.
# The five bracket queries share this template; only the bounds differ.
voteQuery = '''SELECT SUM(nbvotes) FROM "rateChuck" WHERE nbvotes BETWEEN {low} AND {high}'''

voteUno = voteQuery.format(low=0, high=500)
dfUno = pd.read_sql(voteUno,conn)
Uno = dfUno.loc[:, "sum"]

voteDos = voteQuery.format(low=501, high=2000)
dfDos = pd.read_sql(voteDos,conn)
Dos = dfDos.loc[:, "sum"]

voteTres = voteQuery.format(low=2001, high=4000)
dfTres = pd.read_sql(voteTres,conn)
Tres = dfTres.loc[:, "sum"]

voteQuattro = voteQuery.format(low=4001, high=6000)
dfQuattro = pd.read_sql(voteQuattro,conn)
Quattro = dfQuattro.loc[:, "sum"]

voteCinqo = voteQuery.format(low=6001, high=10000)
dfCinqo = pd.read_sql(voteCinqo,conn)
Cinqo = dfCinqo.loc[:, "sum"]

voteTotal = '''SELECT SUM(nbvotes) FROM "rateChuck"'''
dfTotal = pd.read_sql(voteTotal,conn)
Total = dfTotal.loc[:, "sum"]


def _voteSum(series):
    """Return the single SUM value held by `series` as an int (0 when NULL)."""
    value = series.iloc[0]
    return 0 if pd.isna(value) else int(value)


# The total and the percentages shown on the chart are derived from the query
# results, so the labels cannot drift away from the data being plotted.
totalVotes = _voteSum(Total)


def _voteShare(series):
    """Return the bracket held by `series` as a whole-number percentage of all votes."""
    return round(100 * _voteSum(series) / totalVotes) if totalVotes else 0


# Horizontal bar chart of the votes per bracket.
line_chart = pygal.HorizontalBar()
# The total is written with dots as thousands separators (e.g. 2.600.099).
line_chart.title = 'nombre de votes totaux : %s' % '{:,}'.format(totalVotes).replace(',', '.')
line_chart.add('0-500 : %d%%' % _voteShare(Uno), Uno)
line_chart.add('501-2000 : %d%%' % _voteShare(Dos), Dos)
line_chart.add('2001-4000 : %d%%' % _voteShare(Tres), Tres)
line_chart.add('4001-6000 : %d%%' % _voteShare(Quattro), Quattro)
line_chart.add('6001-10000 : %d%%' % _voteShare(Cinqo), Cinqo)
# The overall total appears in the title rather than as a separate bar.
line_chart.render_in_browser()

  pd.set_option('display.max_colwidth', -1)


file:///tmp/tmpclwyh599.html
file:///tmp/tmp3nrlh2hz.html


In [42]:
# Facts whose text contains "fight" (at most five rows).
fight = (
    'SELECT * FROM "factChuck" '
    "WHERE fact LIKE '%fight%' "
    "LIMIT 5"
)
dffight = pd.read_sql(fight, conn)
print(dffight)

    id                                                                                                                                                             fact
0  262  In the early 70's Chuck Norris and Arnold Schwarzenegger got into a fight. With just one round house kick to the face, Arnold hasn't talked the same ever since
1  457  When Chuck Norris get in to fights all he has to do is say, "I win."                                                                                           
2  494  To commit suicide: Some people hang themselves, some people shoot themselves, some people people pick a fight with Chuck Norris.                               
3  507  It took the government 15 years to fight in Vietnam. Chuck Norris could have done it in 1                                                                      
4  911  Chuck Norris and Jack Bauer don't fight each other. They apocalypse each other.                                                                         

In [43]:
# Facts whose text contains "sleep" (at most five rows). The variables are
# named after the keyword actually searched for.
sleep = """SELECT * FROM "factChuck" WHERE fact LIKE '%sleep%' LIMIT 5"""
dfsleep = pd.read_sql(sleep,conn)
print(dfsleep)

    id                                                                                                                            fact
0  2    Chuck Norris doesnt sleep. He waits.                                                                                          
1  122  When the Boogeyman goes to sleep every night he checks his closet for Chuck Norris.                                           
2  129  Chuck Norris sleeps with a night light. Not because Chuck Norris is afraid of the dark, but the dark is afraid of Chuck Norris
3  137  The best part of waking up is not Folgers in your cup, but knowing that Chuck Norris didn't kill you in your sleep.           
4  201  Chuck Norris once ate an entire bottle of sleeping pills. They made him blink.                                                


In [44]:
# Facts whose text contains "hunt" (at most five rows). The variables are
# named after the keyword actually searched for.
hunt = """SELECT * FROM "factChuck" WHERE fact LIKE '%hunt%' LIMIT 5"""
dfhunt = pd.read_sql(hunt,conn)
print(dfhunt)

     id                                                                                                                                                         fact
0  125   Chuck Norris does not hunt because the word hunting implies the possibility of failure. Chuck Norris goes killing.                                         
1  876   Chuck Norris doesn't hunt for monster. Monsters give him free Exp in exchange for their lives                                                              
2  2321  When Chuck Norris goes hunting he doesn't bring his gun. The animals commit suicide.                                                                       
3  2535  Chuck Norris was the hunter who shot Bambi's Mother. He then wore her carcass like it was a coat while he made his rounds at the local children's hospital.
4  3690  Chuck Norris doesn't go hunting. He goes killing.                                                                                                          


In [30]:
# Facts whose text contains "health" (at most fifteen rows).
heal = (
    'SELECT * FROM "factChuck" '
    "WHERE fact LIKE '%health%' "
    "LIMIT 15"
)
dfheal = pd.read_sql(heal, conn)
print(dfheal)

      id                                                                                                    fact
1  735    Chuck Norris IS the healthcare crisis.                                                                
2  2344   Chuck Norris is so healthy, that when he drinks green tea, the tea benefits, not him.                 
3  4471   Why is health care so expensive? Chuck Norris is sending thousands of people to the hospital everyday.
4  10691  If Chuck Norris is seen in a video game, his "health bar" is actually is his patience.                
