# Duplicate Real Estate Listings 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from tqdm import tqdm
import os
import uuid
import hashlib
from PIL import Image
import requests
from io import BytesIO
import glob
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme({
    "valid": "bold green encircle",
    "info": "dim cyan",
    "warning": "magenta",
    "danger": "bold red"
})
console = Console(theme=custom_theme)


pd.options.display.max_rows=50
pd.options.display.max_columns=90
df = pd.read_csv("data/Dataset - Ads _ Levallois-Perret - 2019-08 - export-ads-levallois-perret-2019-08-27.csv")

## 1. EDA Phase.

In [2]:
for type in df.dtypes.unique():
    same_dtype_columns = df.select_dtypes(include=type).columns
    console.print(f"\n[bold]Data type[/bold] : {type} \n[bold]Columns[/bold] : {list(same_dtype_columns)} \n")

In [3]:
def style_max(v, props=''):
    return props if v==100 else None

def show_na(df):
    na_cols = (pd.Series(df.isna().sum()/len(df.index)*100).reset_index().
        rename(columns={'index':'Feature',0:'Missing_Values'}).
        sort_values(by=['Missing_Values'], ascending=[False]))
    na_cols["Missing_Values"] = na_cols.apply(lambda row: round(row["Missing_Values"],2), axis=1)
    na_cols=na_cols[na_cols["Missing_Values"]>0]
    na_cols = na_cols.set_index('Feature')

    return na_cols

cm = sns.light_palette("red", as_cmap=True)

nas_stat = show_na(df)
nas_stat.style.set_caption("Pourcentage of missing values for each Feauture in the dataset")\
              .background_gradient(cmap=cm).applymap(style_max, props='color:black;')


Unnamed: 0_level_0,Missing_Values
Feature,Unnamed: 1_level_1
TOILET_COUNT,100.0
GREENHOUSE_GAS_CONSUMPTION,100.0
PUBLICATION_END_DATE,100.0
AIR_CONDITIONING,100.0
FIREPLACE,100.0
CELLAR,100.0
LUNCHROOM_COUNT,100.0
BATHROOM_COUNT,100.0
BALCONY_SURFACE,100.0
BALCONY,100.0


In [4]:
empty_columns = nas_stat[nas_stat["Missing_Values"]==100].index
df.drop(columns=empty_columns, inplace=True)

In [5]:

def style_unique(v, props='', n=2164):
    return props if v==n else None

def style_constant(v, props=''):
    return props if v==1 else None

def show_card(df):
    df = (pd.Series({feature: len(df[feature].unique()) for feature in df.columns})
                        .reset_index().rename(columns={'index':'Feature',0:'Cardinality'}))
    df =df.set_index('Feature')
    return df.sort_values(by ='Cardinality', ascending=False )

card_stat = show_card(df)
card_stat.style.set_caption("Cardinality of each feature in the dataset").applymap(style_constant, props='background-color:red;color:white')\
    .applymap(style_unique, props='background-color:green;color:white', n=len(df))

Unnamed: 0_level_0,Cardinality
Feature,Unnamed: 1_level_1
ID,2164
URL,2164
LAST_CRAWL_DATE,2164
PUBLICATION_START_DATE,2103
IMAGES,1947
DESCRIPTION,1915
PRICE_EVENTS,1733
PRICE_M2,998
PRICE,765
DEALER_NAME,554


In [7]:
def download_image(url):
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    return img

In [8]:
def generate_unique_identifier(input_string):
    # Using SHA256 as the hashing algorithm
    hash_object = hashlib.sha256(input_string.encode())
    unique_identifier = hash_object.hexdigest()
    return unique_identifier

def download_df_images_locally(df, images_column_name = "IMAGES", id_column_name = "ID", export_path ="data/images/" ):

    for i, row in tqdm(df.iterrows(), total=df.shape[0]):

        # parsing the array
        array_urls = json.loads(row[images_column_name])


        directory = f"{export_path}{row[id_column_name]}"

        if not os.path.exists(directory):
            os.makedirs(directory)

        # downloading the images
        for j, url in enumerate(array_urls):
            try : 
                img_name = generate_unique_identifier(url)
                save_image_path = f"{directory}/{img_name}.png"
                if os.path.exists(save_image_path):
                    continue
                else:
                    img = download_image(url)
                    img.save(save_image_path)                
            except Exception as e :
                continue
    



In [9]:
# download_df_images_locally(df)

### ii. Dataset cleaning

In [10]:
## Droping constant columns : 

In [11]:
df.drop(columns = ["CITY_ID", "CITY", "DEPT_CODE"], axis=1, inplace=True)

In [12]:
## Parsing Heating Types

In [13]:
heating_types = []
for value in df["HEATING_TYPES"].unique():
    heating_array = json.loads(value)
    heating_types.extend(heating_array)
print("Unique Heating Types are : ", heating_types)
def parse_heating_type(row, heating_types):
    for heating_type in heating_types:
        if heating_type in row["HEATING_TYPES"]:
            row[heating_type]=1
        else:
            row[heating_type]=0
    return row
df = df.apply(lambda row : parse_heating_type(row, heating_types=heating_types), axis=1)

Unique Heating Types are :  ['ELECTRIC', 'GAS', 'ELECTRIC', 'UNDERFLOOR', 'FUEL']


In [14]:
## Removing duplicate URLS from IMAGEs rows

In [15]:
df["IMAGES"] = df.apply(lambda row:list(set(json.loads(row["IMAGES"]))), axis=1)

In [16]:
## Adding the number of listed images and number of downloaded images

In [17]:
def count_urls(urls):
    return len(urls)

def count_images(id, data_path = "data/images/"):
    list_images = glob.glob( f"{data_path}{id}/*png")
    return len(list_images)

df["NUMBER_URL_IMAGES"]=df.apply(lambda row : count_urls(row["IMAGES"]), axis = 1)
df["NUMBER_VALID_IMAGES"]=df.apply(lambda row : count_images(row["ID"]), axis = 1)

In [18]:
df["NUMBER_URL_IMAGES"]

0        5
1        6
2       16
3        6
4       26
        ..
2159     5
2160     2
2161     0
2162     2
2163     4
Name: NUMBER_URL_IMAGES, Length: 2164, dtype: int64

In [19]:
df["NUMBER_URL_IMAGES_s"]=df.apply(lambda row : count_urls(set(row["IMAGES"])), axis=1)
df["NUMBER_URL_IMAGES_s"]

0        5
1        6
2       16
3        6
4       26
        ..
2159     5
2160     2
2161     0
2162     2
2163     4
Name: NUMBER_URL_IMAGES_s, Length: 2164, dtype: int64

### iii. Dataset validation

In [26]:
!pip install pytest



In [None]:
import pytest 
import ipytest 
ipytest.config.rewrite_asserts=True 
__file___='notebook.ipynb'

ModuleNotFoundError: No module named 'pytest'

In [None]:
df

In [None]:
df_test=df.copy()

@pytest.mark.parametrize("df", [df_test])
def test_df_uniqueess(df):
    df["IMAGES"]=df["IMAGES"].apply(lambda val : str(val))
    console.print(f"There are : {df.duplicated().sum()} duplicate rows" )
    assert df.duplicated().sum()==0

@pytest.mark.parametrize("df", [df_test])
def test_url_uniqueness(df):
    console.print(f"There are : {len(df)-len(df['URL'].unique())} duplicate offer URLS" )
    assert len(df["URL"].unique())/len(df)==1

@pytest.mark.parametrize("df", [df_test])
def test_id_uniqueness(df):
    console.print(f"There are : {len(df)-len(df['ID'].unique())} duplicate IDS" )
    assert len(df["ID"].unique())/len(df)==1

@pytest.mark.parametrize("df", [df_test])
def test_duplicate_images_url_same_rows(df):
    # Create an empty list to store duplicated URLs
    duplicated_urls = []

    # Iterate over each row in the dataframe
    for index, row in df.iterrows():
        # Extract the list of URLs from the row
        url_list = [1] #row['IMAGES']
        
        # Check for duplicate URLs within the list
        if len(url_list) != len(set(url_list)):
            # Append the row index to the duplicated_urls list
            duplicated_urls.append(index)
            
    console.print(f"There are : {len(duplicated_urls)} with duplicate instances of images url")
    assert len(duplicated_urls)==0

@pytest.mark.parametrize("df", [df_test])
def test_duplicate_descriptions(df):
    console.print(f"There are : {df['DESCRIPTION'].duplicated().sum()} duplicate Descriptions" )
    assert df["DESCRIPTION"].duplicated().sum()==0

@pytest.mark.parametrize("df", [df_test])
def test_duplicate_price_events(df):
    console.print(f"There are : {df['PRICE_EVENTS'].duplicated().sum()} duplicate PRICE_EVENTS" )
    assert df["PRICE_EVENTS"].duplicated().sum()==0
    
@pytest.mark.parametrize("df", [df_test])
def test_duplicate_publication_start_date(df):
    console.print(f"There are : {df['PUBLICATION_START_DATE'].duplicated().sum()} duplicate PUBLICATION_START_DATE" )
    assert df["PUBLICATION_START_DATE"].duplicated().sum()==0
    

@pytest.mark.parametrize("df", [df_test])
def test_url_validity(df):
    invalid = df["NUMBER_URL_IMAGES"].sum()-df["NUMBER_VALID_IMAGES"].sum()
    percent=100*invalid/df["NUMBER_URL_IMAGES"].sum()
    console.print(f"There are : {invalid} invalid Image URLS ({percent:.2f}%)" )
    assert invalid==0





In [None]:
ipytest.run("-s")

We can see that we have several duplicates in `DESCRIPTION`, `PRICE_EVENTS`, `PUBLICATION_START_DATE` fields. This might be due to actual duplicate offers, NaNs or default values by the site

In [None]:
# Droping Unused columns
drop_columns = ["URL", "IMAGES", "HEATING_TYPES"]
df.drop(columns = drop_columns, inplace=True)

In [None]:
df

## 2. Finding Duplicates:

A perfect solution to this problem would be to embed each offer into a an abstract space where each point or instance represents offer. Then we can define a distance over such space. Duplicate offers or ads are the instances that have a `distance = 0` between each other. The main issue here is that two duplicate offers can have a `distance > 0`. For example, a user can post the same offer on two sites `bienici` and `leboncoin`. But post different number of pictures, or slightly change the description, add more info, etc between each site.

To solve this issue we'll try to solve this problem by two methods :

- Textual representation
- Image embeddings


####  Edge cases to look out for :

- Duplicate images of logos / Empty images / Floor plans
- Offer for rent and offer for sale at the same time can be counted as duplicates
- Information mismatch between duplicate offers in two different sites or posts

In [None]:
import fiftyone as fo
from tqdm import tqdm
import pandas as pd
import glob
import fiftyone.brain as fob
import fiftyone.core.utils as fou
from collections import Counter
from fiftyone import ViewField as F

In [None]:
def prepare_fiftyone_dataset(df):

    # Create samples for our data
    samples = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # price = y_train.iloc[i]["price"]
        # pred = y_pred.iloc[i]["price"]
        id_annonce= row["ID"]
        filepaths = glob.glob(f"data/images/{id_annonce}/*.png")
        # caption = caption_df.iloc[i]["features"]
        # caption=caption.replace('"', "'")
        # caption_split = caption.split("',")


        # Instances that have images 
        if(len(filepaths)>0):
            for j,filepath in enumerate(filepaths) :
                sample = fo.Sample(filepath=filepath)

                # computing the hash for individual images
                sample["file_hash"] = fou.compute_filehash(sample.filepath)

                # for key in row.keys():
                #     sample[key]=row[key]
                # sample["price"]=fo.Regression(value=price) 
                # sample["y_pred"]= fo.Regression(value=pred) 
                # sample["caption"]=caption_split[j]
                # sample["description"]=row["DESCRIPTION"]

                samples.append(sample)
    return samples


In [None]:
samples = prepare_fiftyone_dataset(df)
dataset = fo.Dataset("Real-estate-dataset-3")
dataset.add_samples(samples)
fob.compute_uniqueness(dataset)


filehash_counts = Counter(sample.file_hash for sample in dataset)
dup_filehashes = [k for k, v in filehash_counts.items() if v > 1]

print("Number of duplicate file hashes: %d" % len(dup_filehashes))

dup_view = (dataset
    # Extract samples with duplicate file hashes
    .match(F("file_hash").is_in(dup_filehashes))
    # Sort by file hash so duplicates will be adjacent
    .sort_by("file_hash")
)




In [None]:
print("Number of images that have a duplicate: %d" % len(dup_view))
print("Number of duplicates: %d" % (len(dup_view) - len(dup_filehashes)))

In [None]:
session = fo.launch_app(dataset, address="0.0.0.0", port="5151",remote=True)
session.open_tab()


In [None]:
session