In [75]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import numpy as np
from IPython.display import display

In [76]:
# Q1
def getsoup(url):
    return BeautifulSoup(requests.get(url).text,'html.parser')

In [77]:
# Q2 
class NonValide(Exception):
    pass

In [78]:
# Q3
def prix(soup):
    try:
        prix_tag=soup.find("p", class_="product-price")
        prix_text=prix_tag.text.replace("€", "").replace(" ", "").strip()
        prix=int(prix_text)
        if prix<10000:
            raise NonValide("Prix trop bas.")
        return prix
    except:
        raise NonValide("Prix non trouvé")

In [79]:
#Q4
def ville(soup):
    try:
        ville_tag=soup.find("h2", class_="mt-0")
        ville_text=ville_tag.text.strip()
        dernier_index=ville_text.rfind(", ")
        ville=ville_text[dernier_index+2:]
        return ville
    except:
        raise NonValide("Ville non trouvée.")

In [80]:
# Q5
def caracteristiques(soup):
    caracteristiques_balise = soup.find('p', class_='ad-section-title')
    if not caracteristiques_balise:
        raise NonValide("Caractéristiques non trouvées")
    
    return caracteristiques_balise.find_next("div") 

def type(soup):
    caracteristique = caracteristiques(soup)
    try:
        type_tag=caracteristique.find("span",string="Type")
        type=type_tag.find_next("span").text.strip()
        if type not in ["Maison","Appartement"]:
            raise NonValide("Type non valide.")
        return type
    except:
        raise NonValide("Type non trouvé.")

def surface(soup):
    caracteristique = caracteristiques(soup)
    surface_tag=caracteristique.find("span", string="Surface")
    if surface_tag:
        surface=surface_tag.find_next("span").text.replace("m²", "").strip()
        return surface
    return "-"

def nbrpieces(soup):
    caracteristique = caracteristiques(soup)
    nbrpieces_tag=caracteristique.find("span",string="Nb. de pièces")
    if nbrpieces_tag:
        nbrpieces=nbrpieces_tag.find_next("span").text.strip()
        return nbrpieces
    return "-"

def nbrchambres(soup):
    caracteristique = caracteristiques(soup)
    nbrchambres_tag=caracteristique.find("span",string="Nb. de chambres")
    if nbrchambres_tag:
        nbrchambres=nbrchambres_tag.find_next("span").text.strip()
        return nbrchambres
    return "-"

def nbrsdb(soup):
    caracteristique = caracteristiques(soup)
    nbrsdb_tag=caracteristique.find("span",string="Nb. de sales de bains")
    if nbrsdb_tag:
        nbrsdb=nbrsdb_tag.find_next("span").text.strip()
        return nbrsdb
    return "-"

def dpe(soup):
    caracteristique = caracteristiques(soup)
    dpe_tag = caracteristique.find("span", string="Consommation d'énergie (DPE)")
    if dpe_tag:
        dpe = dpe_tag.find_next("span").text.strip()
        if '(' in dpe:
            dpe = dpe.split('(')[0].strip()
        return dpe
    return "-"

In [81]:
# Q6
def informations(soup):
    try:
        ville_str = ville(soup)
        type_str = type(soup)
        surface_str = surface(soup)
        nbrpieces_str = nbrpieces(soup)
        nbrchambres_str = nbrchambres(soup)
        nbrsdb_str = nbrsdb(soup)
        dpe_str = dpe(soup)
        prix_str = prix(soup)

        return f"{ville_str},{type_str},{surface_str},{nbrpieces_str},{nbrchambres_str},{nbrsdb_str},{dpe_str},{prix_str}"
    except NonValide as e:
        raise NonValide(f"{e}")

In [82]:
#Q7
def get_max_page(soup):
    pagination = soup.select("ul.pagination li a")
    if pagination:
        nbDePages = [int(link.text) for link in pagination if link.text.isdigit()]
        return max(nbDePages) if nbDePages else 1
    return 1

def annonces_CSV():
    url = "https://www.immo-entre-particuliers.com/annonces/france-ile-de-france/vente"
    soup = getsoup(url)
    max_page = get_max_page(soup) 
    print(f"Total de pages : {max_page}")

    annonces_data = []
    annonces_urls = set()

    for page in range(1, max_page + 1): 
        page_url = f"https://www.immo-entre-particuliers.com/annonces/france-ile-de-france/vente/{page}"
        print(f"Page : {page}")
        soup = getsoup(page_url)

        annonces = soup.select('a[href^="/annonce-"]')
        
        for annonce in annonces:
            annonce_url = annonce['href']
            if not annonce_url.startswith("http"):
                annonce_url = "https://www.immo-entre-particuliers.com" + annonce_url
            
            if annonce_url in annonces_urls:
                continue  
            annonces_urls.add(annonce_url)

            try:
                annonce_soup = getsoup(annonce_url)  
                info = informations(annonce_soup)
                if info:
                    annonces_data.append(info)
                    print(f"{info}")
            except NonValide as e:
                print(f"Annonce invalide {annonce_url} : {e}")
                continue  

    if annonces_data:
        with open("annonces.csv", "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Ville", "Type", "Surface", "NbrPieces", "NbrChambres", "NbrSdb", "DPE", "Prix"])
            for row in annonces_data:
                writer.writerow(row.split(','))

        print("Fini")

annonces_CSV()

Total de pages : 128
Page : 1
Montreuil,Appartement,31,1,1,-,B,245000
Sainte-Mesme,Maison,136,7,4,1,D,375000
Saint-Cyr-sous-Dourdan,Maison,170,7,3,1,E,420000
Saint-Brice-sous-Forêt,Maison,108,5,4,1,D,409000
Neuilly-sur-Seine,Appartement,14,1,1,-,E,189000
Champs-sur-Marne,Maison,160,7,5,1,D,615000
Morangis,Maison,160,7,4,2,C,380000
Mantes-la-Jolie,Appartement,30,1,1,1,E,134500
La Ville-du-Bois,Maison,122,5,3,1,D,270000
Limeil-Brévannes,Maison,121,5,3,1,D,455000
Morsang-sur-Orge,Maison,118,6,3,2,D,385000
Bussy-Saint-Georges,Appartement,28,2,1,-,D,165300
Paris 10ème,Appartement,19,1,1,-,F,212000
Neuilly-sur-Marne,Appartement,90,4,3,1,C,275000
Annonce invalide https://www.immo-entre-particuliers.com/annonce-paris-paris-2eme/409087-recherche-a-lachat-locaux-pour-une-activite-de-commerce : Type non trouvé.
Page : 2
Annonce invalide https://www.immo-entre-particuliers.com/annonce-paris-paris-1er/409078-recherche-a-lachat-hotels : Type non trouvé.
La Garenne-Colombes,Appartement,77,3,2,1,F,500

In [83]:
#Q8
annonces = pd.read_csv("annonces.csv")
display(annonces)

Unnamed: 0,Ville,Type,Surface,NbrPieces,NbrChambres,NbrSdb,DPE,Prix
0,Montreuil,Appartement,31,1,1,-,B,245000
1,Sainte-Mesme,Maison,136,7,4,1,D,375000
2,Saint-Cyr-sous-Dourdan,Maison,170,7,3,1,E,420000
3,Saint-Brice-sous-Forêt,Maison,108,5,4,1,D,409000
4,Neuilly-sur-Seine,Appartement,14,1,1,-,E,189000
...,...,...,...,...,...,...,...,...
427,Nogent-sur-Marne,Appartement,76,4,3,1,-,280000
428,Chelles,Maison,150,-,4,2,-,200000
429,Sarcelles,Maison,-,5,3,2,-,345000
430,Pontoise,Appartement,131,6,5,2,-,250000


In [84]:
#Q9
annonces["DPE"] = annonces["DPE"].replace("-", "Vierge")
display(annonces)

Unnamed: 0,Ville,Type,Surface,NbrPieces,NbrChambres,NbrSdb,DPE,Prix
0,Montreuil,Appartement,31,1,1,-,B,245000
1,Sainte-Mesme,Maison,136,7,4,1,D,375000
2,Saint-Cyr-sous-Dourdan,Maison,170,7,3,1,E,420000
3,Saint-Brice-sous-Forêt,Maison,108,5,4,1,D,409000
4,Neuilly-sur-Seine,Appartement,14,1,1,-,E,189000
...,...,...,...,...,...,...,...,...
427,Nogent-sur-Marne,Appartement,76,4,3,1,Vierge,280000
428,Chelles,Maison,150,-,4,2,Vierge,200000
429,Sarcelles,Maison,-,5,3,2,Vierge,345000
430,Pontoise,Appartement,131,6,5,2,Vierge,250000


In [85]:
#Q10
cols = ["Surface", "NbrPieces", "NbrChambres", "NbrSdb"]

annonces[cols] = annonces[cols].replace("-", np.nan)
annonces[cols] = annonces[cols].astype(float)
moyenne = annonces[cols].mean()
annonces[cols] = annonces[cols].fillna(moyenne)

annonces.dropna(inplace=True)

print(moyenne)
display(annonces)

Surface        94.518868
NbrPieces       5.936321
NbrChambres     2.558313
NbrSdb          1.323684
dtype: float64


Unnamed: 0,Ville,Type,Surface,NbrPieces,NbrChambres,NbrSdb,DPE,Prix
0,Montreuil,Appartement,31.000000,1.000000,1.0,1.323684,B,245000
1,Sainte-Mesme,Maison,136.000000,7.000000,4.0,1.000000,D,375000
2,Saint-Cyr-sous-Dourdan,Maison,170.000000,7.000000,3.0,1.000000,E,420000
3,Saint-Brice-sous-Forêt,Maison,108.000000,5.000000,4.0,1.000000,D,409000
4,Neuilly-sur-Seine,Appartement,14.000000,1.000000,1.0,1.323684,E,189000
...,...,...,...,...,...,...,...,...
427,Nogent-sur-Marne,Appartement,76.000000,4.000000,3.0,1.000000,Vierge,280000
428,Chelles,Maison,150.000000,5.936321,4.0,2.000000,Vierge,200000
429,Sarcelles,Maison,94.518868,5.000000,3.0,2.000000,Vierge,345000
430,Pontoise,Appartement,131.000000,6.000000,5.0,2.000000,Vierge,250000


In [86]:
#Q11
annonces = pd.get_dummies(annonces, columns=["Type", "DPE"], dtype='int')
display(annonces)

Unnamed: 0,Ville,Surface,NbrPieces,NbrChambres,NbrSdb,Prix,Type_Appartement,Type_Maison,DPE_A,DPE_B,DPE_C,DPE_D,DPE_E,DPE_F,DPE_Vierge
0,Montreuil,31.000000,1.000000,1.0,1.323684,245000,1,0,0,1,0,0,0,0,0
1,Sainte-Mesme,136.000000,7.000000,4.0,1.000000,375000,0,1,0,0,0,1,0,0,0
2,Saint-Cyr-sous-Dourdan,170.000000,7.000000,3.0,1.000000,420000,0,1,0,0,0,0,1,0,0
3,Saint-Brice-sous-Forêt,108.000000,5.000000,4.0,1.000000,409000,0,1,0,0,0,1,0,0,0
4,Neuilly-sur-Seine,14.000000,1.000000,1.0,1.323684,189000,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,Nogent-sur-Marne,76.000000,4.000000,3.0,1.000000,280000,1,0,0,0,0,0,0,0,1
428,Chelles,150.000000,5.936321,4.0,2.000000,200000,0,1,0,0,0,0,0,0,1
429,Sarcelles,94.518868,5.000000,3.0,2.000000,345000,0,1,0,0,0,0,0,0,1
430,Pontoise,131.000000,6.000000,5.0,2.000000,250000,1,0,0,0,0,0,0,0,1


In [87]:
#Q12
villes = pd.read_csv("cities.csv")

In [88]:
#Q13
villes = pd.read_csv("cities.csv")
annonces["Ville"] = (
    annonces["Ville"]
    .str.lower()  
    .str.replace("-", " ", regex=True) 
    .str.replace("'", "", regex=True)  
    .str.replace(r"[éèê]", "e", regex=True) 
    .str.replace(r"[ô]", "o", regex=True)  
    .str.replace(r"[î]", "i", regex=True)  
    .str.replace(r"[û]", "u", regex=True)  
    .str.replace(r"[à]", "a", regex=True)  
)
display(annonces)

Unnamed: 0,Ville,Surface,NbrPieces,NbrChambres,NbrSdb,Prix,Type_Appartement,Type_Maison,DPE_A,DPE_B,DPE_C,DPE_D,DPE_E,DPE_F,DPE_Vierge
0,montreuil,31.000000,1.000000,1.0,1.323684,245000,1,0,0,1,0,0,0,0,0
1,sainte mesme,136.000000,7.000000,4.0,1.000000,375000,0,1,0,0,0,1,0,0,0
2,saint cyr sous dourdan,170.000000,7.000000,3.0,1.000000,420000,0,1,0,0,0,0,1,0,0
3,saint brice sous foret,108.000000,5.000000,4.0,1.000000,409000,0,1,0,0,0,1,0,0,0
4,neuilly sur seine,14.000000,1.000000,1.0,1.323684,189000,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,nogent sur marne,76.000000,4.000000,3.0,1.000000,280000,1,0,0,0,0,0,0,0,1
428,chelles,150.000000,5.936321,4.0,2.000000,200000,0,1,0,0,0,0,0,0,1
429,sarcelles,94.518868,5.000000,3.0,2.000000,345000,0,1,0,0,0,0,0,0,1
430,pontoise,131.000000,6.000000,5.0,2.000000,250000,1,0,0,0,0,0,0,0,1


In [89]:
villes["label"] = (
    villes["label"]
    .str.lower()
    .str.replace("-", " ", regex=True)
    .str.replace("'", "", regex=True)
    .str.replace(r"[éèê]", "e", regex=True)
    .str.replace(r"[ô]", "o", regex=True)
    .str.replace(r"[î]", "i", regex=True)
    .str.replace(r"[û]", "u", regex=True)
    .str.replace(r"[à]", "a", regex=True)
)
villes.to_csv("cities.csv", index=False)
display(villes)

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
0,25620,ville du pont,25650,ville du pont,46.999873,6.498147,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
1,25624,villers grelot,25640,villers grelot,47.361512,6.235167,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
2,25615,villars les blamont,25310,villars les blamont,47.368384,6.871415,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
3,25619,les villedieu,25240,les villedieu,46.713906,6.265831,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
4,25622,villers buzon,25170,villers buzon,47.228558,5.852187,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
...,...,...,...,...,...,...,...,...,...,...
39140,98829,thio,98829,thio,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39141,98831,voh,98833,voh,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39142,98832,yate,98834,yate,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39143,98612,sigave,98620,sigave,-14.270411,-178.155263,wallis-et-futuna,986,wallis-et-futuna,Wallis-et-Futuna


In [90]:
#Q14
annonces = annonces.merge(villes[['label', 'latitude', 'longitude']], left_on="Ville", right_on="label")

annonces.drop(columns=["Ville", "label"], inplace=True)

annonces.to_csv("annonces.csv", index=False)
display(annonces)

Unnamed: 0,Surface,NbrPieces,NbrChambres,NbrSdb,Prix,Type_Appartement,Type_Maison,DPE_A,DPE_B,DPE_C,DPE_D,DPE_E,DPE_F,DPE_Vierge,latitude,longitude
0,31.000000,1.000000,1.0,1.323684,245000,1,0,0,1,0,0,0,0,0,48.772115,1.380661
1,31.000000,1.000000,1.0,1.323684,245000,1,0,0,1,0,0,0,0,0,50.465534,1.771872
2,31.000000,1.000000,1.0,1.323684,245000,1,0,0,1,0,0,0,0,0,46.397028,-0.841074
3,31.000000,1.000000,1.0,1.323684,245000,1,0,0,1,0,0,0,0,0,48.863513,2.448636
4,14.000000,1.000000,1.0,1.323684,189000,1,0,0,0,0,0,1,0,0,48.885662,2.266596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,150.000000,5.936321,4.0,2.000000,200000,0,1,0,0,0,0,0,0,1,48.883978,2.597093
373,94.518868,5.000000,3.0,2.000000,345000,0,1,0,0,0,0,0,0,1,48.990237,2.381117
374,131.000000,6.000000,5.0,2.000000,250000,1,0,0,0,0,0,0,0,1,49.051578,2.094574
375,131.000000,6.000000,5.0,2.000000,250000,1,0,0,0,0,0,0,0,1,49.051578,2.094574
