In [1]:
import pandas as pd
import numpy as np
import pickle
from IPython.display import display
import nltk
# Make sure the required NLTK resources are available locally.
for nltk_resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Display every column of a frame, but at most 100 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Finding Repeated Listings

This notebook contains the code for duplicate detection and removal. The pairwise similarity of the description texts is calculated within each postal code region. Listings with similar descriptions are then clustered by the product of the number of rooms and the living area. The number of clusters that maximizes the silhouette score is considered optimal. A unique index is assigned to each cluster of similar listing descriptions; listings sharing a unique index are treated as repeated listings. The weeks on the market of repeated listings are not aggregated; only the first occurrence is kept. See 1_DataCleaning.ipynb for details on data cleaning and the removal of repeated listings. For further information, please refer to Appendix A.







In [2]:
# Load the listings table (presumably produced by 1_DataCleaning.ipynb -- TODO confirm).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
all_cities_buy = pd.read_pickle("data_final/all_cities_buy.pkl")

In [3]:
# (rows, columns) of the table as loaded, before any filtering.
all_cities_buy.shape

(1555425, 88)

In [9]:
# Preview the first rows to see which columns are available.
all_cities_buy.head()

Unnamed: 0,dw_created_at,dw_modified_at,dw_is_active,source_record_id,source_type,source_name,address_geocoded_accuracy,address_geocoded_city,address_geocoded_country,address_geocoded_full,address_geocoded_geometry,address_geocoded_housenr,address_geocoded_postalcode,address_geocoded_street,address_lat,address_lng,apartment_floor_detail,apartment_floor_type,apartmentbuilding_residentials_count,balcony_available,balcony_count_total,balcony_south,basement_available,bath_daylight,bath_quality,construction_accessibility,construction_elevator,construction_quality_object,construction_stage,construction_yearfinished,energy_heating_fireplace,energy_heating_floor,energy_heating_type,energy_ownsolar,energy_type_list,interior_flooring,interior_furnished,interior_quality,investment_rent_active,investment_rent_available,kitchen_available,object_category,object_type,outsideinstallations_conservatory_available,outsideinstallations_garden_available,outsideinstallations_terrace_available,parking_available,parking_total,plot_area_total,plot_lease_available,plot_lease_eternal,state_firstoccupancy,state_modernisation_required,state_object,state_preservation,state_restoration_firstoccupancy,state_restoration_required,state_restoration_structural,structure_area_living,structure_floors,structure_height_building,structure_rooms_baths,structure_rooms_guesttoilets,structure_rooms_living,expose_price_sqm,expose_price_total,expose_offline_since,expose_online_since,expose_online_until,expose_type,expose_urls,expose_content_images,meta_source_id,meta_source_type,address_original_postalcode,energy_certificate_type,energy_certificate_value,state_restoration_year,expose_broker_company,expose_broker_name,expose_content_text,offline_corrected,online_corrected,weeks_on_market,month_advertised,year_advertised,expose_content_text_tokens,token_impute
0,2019-09-10 12:08:38.849021,2019-09-10 12:08:38.849021,True,SrYWXg06tFIw0RYaM59YHrorK0Q=,1,historic,DISTRICT_LARGE,München,,,,,81829,,48.133853,11.688066,,,,True,,,True,False,,,False,,ACTIVE_CONSTRUCTION,,,True,OTHER,,[HEAT_PUMP],,False,,False,,False,HOUSE,ONE_TWO_FAMILY_HOUSE,,True,False,True,,287.0,,,False,,,False,,False,,140.0,4.0,,,1,6.0,6900.0,966000.0,2018-03-27 22:00:00,2018-01-18 23:00:00,2018-03-26 22:00:00,BUY,,,,,,NONE,,,Stadthaus München Wohnbau & Immobilien GmbH,,Objekt\n======\n\nAuf einem sehr schönen und g...,2018-03-26,2018-01-15,11.0,1,2018,objekt schon gut geleg 778 gross grundstuck im...,
1,2019-09-10 12:08:38.849021,2019-09-10 12:08:38.849021,True,TZW7jHPBUupJ6NJ0WKtFJkdqy50=,1,historic,POSTAL_CODE,Bad Berka,,,,,99438,,50.88831,11.254873,,,,True,,,False,False,,,False,,ACTIVE_CONSTRUCTION,2019.0,,False,OTHER,,[ELECTRICITY],,False,,False,,False,HOUSE,,,False,False,True,,,,,False,,,False,,False,,249.0,,,1.0,0,6.0,3015.03012,,2018-04-20 22:00:00,2018-04-14 22:00:00,2018-04-19 22:00:00,BUY,,,,,,NONE,14.7,,"Verkaufsniederlassung Erfurt, KH Massivhaus Dr...",,Objektbeschreibung\nDas Bauhaus Ixeo von Kern-...,2018-04-16,2018-04-09,2.0,4,2018,objektbeschreib bauhaus ixeo uberzeugt stilvol...,
2,2019-09-10 12:08:38.849021,2019-09-10 12:08:38.849021,True,TyEV5wOVoBktRskajCBtaFtirpM=,1,historic,POSTAL_CODE,Dinslaken,,,,,46539,,51.575197,6.797602,1.0,UPPER_FLOOR,,True,,,True,False,,,False,,FINISHED,1973.0,,False,CENTRAL,,[DISTRICT_BASED],,False,,False,,False,APARTMENT,REGULAR_APARTMENT,,False,False,False,,,,,False,,,False,,,,86.46,,,,1,3.5,1781.170483,154000.0,NaT,2018-04-10 22:00:00,2018-05-16 22:00:00,BUY,,[https://valuation-scraper.s3.amazonaws.com/38...,,,,REQUIREMENT_BASED,259.2,,LBS Immobilien GmbH Wesel,,Objekt\n======\n\nZum Verkauf steht eine von d...,2018-05-14,2018-04-09,6.0,4,2018,objekt verkauf steht drei verkauf eigentumswoh...,
3,2019-09-10 12:08:38.849021,2019-09-10 12:08:38.849021,True,TzmrIPkWgcg44XyRRcDEjMamwYE=,1,historic,POSTAL_CODE,Friedrichshafen,,,,,88045,,47.666065,9.460216,,,,True,,,True,False,,,True,,FINISHED_RECENTLY,2018.0,,True,CENTRAL,,,,False,,False,,False,APARTMENT,,,False,False,True,,,,,False,,FINISHED_RECENTLY,False,,False,,79.27,,,,1,3.5,3949.791851,313100.0,NaT,2018-03-14 23:00:00,2018-05-16 22:00:00,BUY,,[https://valuation-scraper.s3.amazonaws.com/38...,,,,NONE,,,Manfred Löffler Wohn-u.Gewerbebau Bauunternehm...,,Objekt\n======\n\nWERTHALTIG AUSGESTATTET. SO ...,2018-05-14,2018-03-12,10.0,3,2018,objekt werthalt ausgestattet fuhl wohl 1 bauab...,objekt werthalt ausgestattet fuhl wohl 1 bauab...
4,2019-09-10 12:08:38.849021,2019-09-10 12:08:38.849021,True,UAOatwUE6IIs08HJmQZiC8490Rs=,1,historic,POSTAL_CODE,Mönchengladbach,,,,,41069,,51.181932,6.397436,,,,False,,,False,False,,,False,,FINISHED,1982.0,,False,CENTRAL,,[GAS],,False,,False,,False,APARTMENT,,,False,False,True,,,,,False,,,False,,,,87.69,,,1.0,0,2.0,1698.027141,148900.0,NaT,2018-04-08 22:00:00,2018-05-16 22:00:00,BUY,,[https://valuation-scraper.s3.amazonaws.com/41...,,,,NONE,,,DIS - Deutsche Immobilien Service GmbH,,Objektbeschreibung\nGegenstand dieses Angebots...,2018-05-14,2018-04-02,7.0,4,2018,objektbeschreib gegenstand angebot ist gepfleg...,


In [16]:
# Number of listings whose `online_corrected` date lies before 2019-03-01.
online_dates = all_cities_buy["online_corrected"]
online_dates[online_dates < "2019-03-01"].shape

(1072693,)

In [12]:
# Shape of the subset whose `online_corrected` date falls in calendar year 2018.
online_dates_2018 = all_cities_buy["online_corrected"]
is_in_2018 = (online_dates_2018 >= "2018-01-01") & (online_dates_2018 < "2019-01-01")
all_cities_buy[is_in_2018].shape

(940847, 88)

In [5]:
# How many listings have no value for the number of living rooms?
all_cities_buy["structure_rooms_living"].isnull().sum()

60373

In [6]:
# Keep only listings with a geocoded postal code (the grouping key of the duplicate search).
all_cities_buy = all_cities_buy.loc[all_cities_buy["address_geocoded_postalcode"].notna()]

In [7]:
# Keep only listings with a known room count (needed for the rooms x area clustering feature).
all_cities_buy = all_cities_buy.loc[all_cities_buy["structure_rooms_living"].notna()]

In [8]:
# Keep only listings with a known living area (needed for the rooms x area clustering feature).
all_cities_buy = all_cities_buy.loc[all_cities_buy["structure_area_living"].notna()]

In [9]:
# (rows, columns) of the table at this point in the notebook.
all_cities_buy.shape

(1494464, 88)

In [10]:
# Placeholder IDs; duplicate_finder fills both columns in for each postal-code group.
for id_column in ("obj_unique_id", "obj_unique_id_clus"):
    all_cities_buy[id_column] = "unknown"





In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer


def pairwise_similarity(text):
    """Return the dense matrix of pairwise TF-IDF similarities between documents.

    Parameters
    ----------
    text : iterable of str
        One document per element.

    Returns
    -------
    numpy.ndarray
        Square array of shape (n_documents, n_documents). Entry ``[i, j]`` is
        the dot product of the TF-IDF vectors of documents ``i`` and ``j``
        (a cosine similarity, given that TfidfVectorizer L2-normalises each
        row by default).
    """
    doc_term = TfidfVectorizer().fit_transform(text)
    # Sparse matrix product of the document-term matrix with its transpose.
    similarities = doc_term @ doc_term.T
    return similarities.toarray()







In [12]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def clusterer(x, max_clusters=100, random_state=0):
    """Cluster one-dimensional values with k-means, choosing k by silhouette score.

    Every candidate ``k`` from 2 up to ``min(number of distinct values,
    max_clusters)`` is fitted, and the labelling with the highest silhouette
    score is returned (the smallest such ``k`` wins ties). If no candidate can
    be scored, all samples are put into a single cluster.

    Parameters
    ----------
    x : array-like of numbers
        The values to cluster; flattened to a single feature column.
    max_clusters : int, default 100
        Upper bound on the number of clusters that is tried.
    random_state : int or None, default 0
        Seed passed to ``KMeans`` so that repeated runs yield the same labels.
        Pass ``None`` for unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Integer cluster label for each element of ``x``, in input order.
    """
    values = np.asarray(x, dtype=float).reshape(-1, 1)
    n_samples = values.shape[0]
    single_cluster = np.zeros(n_samples, dtype=np.int32)

    # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1,
    # so fewer than three samples or a single distinct value cannot be split.
    n_distinct = np.unique(values).size
    if n_samples < 3 or n_distinct < 2:
        return single_cluster

    best_score = -np.inf
    best_labels = single_cluster
    # More clusters than distinct values cannot yield a new partition.
    upper = min(n_distinct, max_clusters)
    for n_clusters in range(2, upper + 1):
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(values)
        try:
            score = silhouette_score(values, labels, metric='euclidean')
        except ValueError:
            # Raised when the number of distinct labels is outside the valid
            # range (e.g. every sample ended up in its own cluster).
            continue
        # Strict comparison keeps the smallest k among equally good candidates.
        if score > best_score:
            best_score = score
            best_labels = labels

    # Return the labelling that was actually scored; no second fit is needed.
    return best_labels




In [13]:


def duplicate_finder(x, threshold=0.9):
    """Assign duplicate-group IDs to the listings of one group (e.g. a postal code).

    Rows are visited in order. For each row, all not-yet-assigned rows whose
    description similarity to it is strictly greater than ``threshold`` form a
    new group. The group's ``obj_unique_id`` is the ``source_record_id`` of its
    first member. Within a group, rows are further split by clustering the
    product ``structure_area_living * structure_rooms_living``; the result is
    written to ``obj_unique_id_clus`` as ``"<obj_unique_id>clu<label>"``. Groups
    with one member, or whose members all share the same product, keep the
    plain ``obj_unique_id`` there.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``expose_content_text_tokens``,
        ``source_record_id``, ``structure_area_living``,
        ``structure_rooms_living``, ``obj_unique_id`` and
        ``obj_unique_id_clus``.
    threshold : float, default 0.9
        Similarity above which two descriptions count as the same text.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with ``obj_unique_id`` and ``obj_unique_id_clus``
        filled in for every row.
    """
    temp = x.copy()
    n_rows = temp.shape[0]

    # Resolve the positional column indices once, outside the loop.
    col_source = temp.columns.get_loc("source_record_id")
    col_id = temp.columns.get_loc("obj_unique_id")
    col_clus = temp.columns.get_loc("obj_unique_id_clus")
    col_area = temp.columns.get_loc("structure_area_living")
    col_rooms = temp.columns.get_loc("structure_rooms_living")

    mat = pairwise_similarity(temp["expose_content_text_tokens"])
    # Boolean mask of rows that already belong to a group.
    assigned = np.zeros(n_rows, dtype=bool)

    for i in range(n_rows):
        row = np.where(mat[i] > threshold)[0]
        row = row[~assigned[row]]
        if row.shape[0] == 0:
            if assigned[i]:
                continue
            # The row is not even similar to itself, i.e. its description
            # produced an all-zero TF-IDF vector. Treat it as its own group so
            # it does not keep the shared placeholder ID.
            row = np.array([i])

        new_id = temp.iloc[row[0], col_source]
        temp.iloc[row, col_id] = new_id
        assigned[row] = True

        # Clustering
        if row.shape[0] == 1:
            temp.iloc[row, col_clus] = new_id
            continue

        living_area = temp.iloc[row, col_area]
        rooms = temp.iloc[row, col_rooms]
        mult = (living_area * rooms).values

        # Compare against the first element, not the mean: the mean of
        # identical floats can differ from them by rounding error.
        if np.all(mult == mult[0]):
            temp.iloc[row, col_clus] = new_id
            continue

        labels = clusterer(mult)
        temp.iloc[row, col_clus] = [str(new_id) + "clu" + str(label) for label in labels]

    return temp





In [None]:
from tqdm.auto import tqdm

# Register `progress_apply` on pandas objects so the long search shows a progress bar.
tqdm.pandas()

# Run the duplicate search independently within each postal-code group.
listings_by_postalcode = all_cities_buy.groupby("address_geocoded_postalcode")
all_cities_buy_new = listings_by_postalcode.progress_apply(duplicate_finder)






HBox(children=(IntProgress(value=0, max=8249), HTML(value='')))

In [None]:
# Discard index level 1 of the grouped result (presumably the pre-grouping row
# index nested under the group key -- TODO confirm); level 0 is kept.
all_cities_buy_new = all_cities_buy_new.reset_index(level=1, drop=True)

In [5]:
# Discard the remaining index and number the rows 0..n-1.
all_cities_buy_new = all_cities_buy_new.reset_index(drop=True)

In [14]:
# Cache the de-duplicated listings so a later session can reload them without
# repeating the long-running duplicate search. An existing cache file is left
# untouched; delete it manually to force a refresh.
import os

if not os.path.exists("data_final/all_cities_buy_new.pkl"):
    all_cities_buy_new.to_pickle("data_final/all_cities_buy_new.pkl")

In [2]:
# Reload the stored result; this replaces any in-memory `all_cities_buy_new`.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
all_cities_buy_new = pd.read_pickle("data_final/all_cities_buy_new.pkl")

In [12]:
# Listings whose group ID differs from their own record ID.
differs_from_own_id = all_cities_buy_new["source_record_id"] != all_cities_buy_new["obj_unique_id"]
all_cities_buy_new[differs_from_own_id]

Unnamed: 0,dw_created_at,dw_modified_at,dw_is_active,source_record_id,source_type,source_name,address_geocoded_accuracy,address_geocoded_city,address_geocoded_country,address_geocoded_full,address_geocoded_geometry,address_geocoded_housenr,address_geocoded_postalcode,address_geocoded_street,address_lat,address_lng,apartment_floor_detail,apartment_floor_type,apartmentbuilding_residentials_count,balcony_available,balcony_count_total,balcony_south,basement_available,bath_daylight,bath_quality,construction_accessibility,construction_elevator,construction_quality_object,construction_stage,construction_yearfinished,energy_heating_fireplace,energy_heating_floor,energy_heating_type,energy_ownsolar,energy_type_list,interior_flooring,interior_furnished,interior_quality,investment_rent_active,investment_rent_available,kitchen_available,object_category,object_type,outsideinstallations_conservatory_available,outsideinstallations_garden_available,outsideinstallations_terrace_available,parking_available,parking_total,plot_area_total,plot_lease_available,plot_lease_eternal,state_firstoccupancy,state_modernisation_required,state_object,state_preservation,state_restoration_firstoccupancy,state_restoration_required,state_restoration_structural,structure_area_living,structure_floors,structure_height_building,structure_rooms_baths,structure_rooms_guesttoilets,structure_rooms_living,expose_price_sqm,expose_price_total,expose_offline_since,expose_online_since,expose_online_until,expose_type,expose_urls,expose_content_images,meta_source_id,meta_source_type,address_original_postalcode,energy_certificate_type,energy_certificate_value,state_restoration_year,expose_broker_company,expose_broker_name,expose_content_text,offline_corrected,online_corrected,weeks_on_market,month_advertised,year_advertised,expose_content_text_tokens,token_impute,obj_unique_id,obj_unique_id_clus
3,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,1789908,2,ImmobilienScout24.de,,Leisnig,Germany,"Chemnitzer Str. (00102) 29, 04703 Leisnig, Ger...",0101000020E6100000978BF84ECCDA2940CEDF84420494...,00102),00102,Chemnitzer Str. (,51.156380,12.927340,,,,,,,,,,,,,,,,,,,,,,,,,,HOUSE,MULTI_FAMILY_HOUSE,,,,False,,300.0,False,False,,,,,,,,200.00,,,,0,8.0,190.000000,38000.0,2018-06-25 00:00:00,2018-06-28 00:00:00,2018-06-25 00:00:00,BUY,https://www.immobilienscout24.de/105770537,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,NONE,,,Real Estate Virtual Solutions UG (haftungsbesc...,Ot,"Energieausweis: BAUDENKMAL, ENERGIEAUSWEIS NIC...",2018-06-25,2018-06-25,1.0,6,2018,energieausweis baudenkmal energieausweis vorge...,,2074824,2074824
10,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,3633260,2,ImmobilienScout24.de,,Dresden,Germany,"Seminarstraße 15, 01067 Dresden, Germany",0101000020E6100000E2E47E87A2702B40A5315A475587...,15,01067,Seminarstraße,51.057290,13.719990,3.0,,,True,,,,,,,True,,FINISHED_RECENTLY,2019.0,,False,OTHER,,[DISTRICT_BASED],,,,False,FREE_NEW,,APARTMENT,REGULAR_APARTMENT,,,,True,,,False,False,True,FEW,FINISHED_RECENTLY,,False,False,,43.00,,,1.0,0,2.0,4511.629883,194000.0,2019-01-22 00:00:00,2018-12-28 00:00:00,2019-01-22 00:00:00,BUY,https://www.immobilienscout24.de/109071883,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,REQUIREMENT_BASED,44.0,,Immolounge GenoScout GmbH,Tanja Gallinger,"Absolute Stadtnähe, alles in der Umgebung fußl...",2019-01-28,2018-12-24,6.0,12,2018,absolut stadtnah umgeb fusslauf erreichbar idy...,absolut stadtnah umgeb fusslauf erreichbar idy...,4361933,4361933
24,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,2136886,2,ImmobilienScout24.de,,Dresden,Germany,"Seminarstraße 28, 01067 Dresden, Germany",0101000020E6100000E2E47E87A2702B40A5315A475587...,28,01067,Seminarstraße,51.057290,13.719990,0.0,GROUND_FLOOR,,True,,,,,,,True,,FINISHED_RECENTLY,2019.0,,False,OTHER,,[DISTRICT_BASED],,,,False,FREE_NEW,True,APARTMENT,REGULAR_APARTMENT,,True,,True,,,False,False,True,FEW,FINISHED_RECENTLY,,False,False,,60.00,,,1.0,0,2.0,4491.666504,269500.0,2018-07-23 00:00:00,2018-07-27 00:00:00,2018-07-23 00:00:00,BUY,https://www.immobilienscout24.de/106382719,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,,,,ImmoLounge powered by GenoScout GmbH,Katja Ebenau,Moderne Wohneinheiten mit Terrasse und hochwer...,2018-07-23,2018-07-23,1.0,7,2018,modern wohnein terrass hochwert einbaukuch erw...,modern wohnein terrass hochwert einbaukuch erw...,3633259,3633259clu0
26,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,2136887,2,ImmobilienScout24.de,,Dresden,Germany,"Seminarstraße 28, 01067 Dresden, Germany",0101000020E6100000E2E47E87A2702B40A5315A475587...,28,01067,Seminarstraße,51.057290,13.719990,3.0,,,True,,,,,,,True,,FINISHED_RECENTLY,2019.0,,False,OTHER,,[DISTRICT_BASED],,,,False,FREE_NEW,,APARTMENT,REGULAR_APARTMENT,,,,True,,,False,False,True,FEW,FINISHED_RECENTLY,,False,False,,43.00,,,1.0,0,2.0,4511.627930,194000.0,2018-07-23 00:00:00,2018-07-26 00:00:00,2018-07-23 00:00:00,BUY,https://www.immobilienscout24.de/106364624,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,,,,ImmoLounge powered by GenoScout GmbH,Katja Ebenau,"Absolute stadtnähe, alles in der Umgebung fußl...",2018-07-23,2018-07-23,1.0,7,2018,absolut stadtnah umgeb fusslauf erreichbar idy...,absolut stadtnah umgeb fusslauf erreichbar idy...,4361933,4361933
27,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,6656718,2,ImmobilienScout24.de,,Dresden,Germany,"Ostra-Allee 21, 01067 Dresden, Germany",0101000020E610000068226C787A752B407B6B60AB0487...,21,01067,Ostra-Allee,51.054830,13.729450,1.0,,,,,,,,,,True,,FINISHED,2019.0,,,,,,,,LUXURY,False,FREE_OLD,,APARTMENT,REGULAR_APARTMENT,,,,True,1.0,,False,False,True,NONE,VERY_GOOD,,True,False,,212.24,,,1.0,0,4.0,7020.354492,1490000.0,2019-09-03 00:00:00,2019-06-17 00:00:00,2019-09-03 00:00:00,BUY,https://www.immobilienscout24.de/112027286,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,,,2019.0,CITYMAKLER DRESDEN GmbH + Co. KG,Carsten Hamm,KURFÜRSTLICHE ORANGERIE - HIER erleben Sie hoc...,2019-09-09,2019-06-24,12.0,6,2019,kurfurst orangeri erleb hochmodern wohnraum um...,kurfurst orangeri erleb hochmodern wohnraum um...,6020408,6020408clu2
36,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,6209105,2,ImmobilienScout24.de,,Dresden,Germany,"Ostra-Allee 21, 01067 Dresden, Germany",0101000020E610000068226C787A752B407B6B60AB0487...,21,01067,Ostra-Allee,51.054830,13.729450,1.0,,,,,,,,,,True,,FINISHED,2019.0,,,,,,,,LUXURY,False,FREE_OLD,,APARTMENT,REGULAR_APARTMENT,,,,True,1.0,,False,False,True,NONE,VERY_GOOD,,True,False,,69.00,,,1.0,0,2.0,7109.004883,495000.0,2019-08-06 00:00:00,2019-05-22 00:00:00,2019-08-06 00:00:00,BUY,https://www.immobilienscout24.de/111637386,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,,,2019.0,CITYMAKLER DRESDEN GmbH + Co. KG,Carsten Hamm,KURFÜRSTLICHE ORANGERIE - HIER erleben Sie hoc...,2019-08-12,2019-05-27,12.0,5,2019,kurfurst orangeri erleb hochmodern wohnraum um...,kurfurst orangeri erleb hochmodern wohnraum um...,6020408,6020408clu1
38,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,2574751,2,ImmobilienScout24.de,,Dresden,Germany,"Grüne Str. 28, 01067 Dresden, Germany",0101000020E6100000DD0720B589732B40895E46B1DC86...,28,01067,Grüne Str.,51.053610,13.725660,2.0,,,True,,,,,,,,,FINISHED,1900.0,,,,,[DISTRICT_BASED],,,NORMAL,,,True,APARTMENT,,,,,False,,,False,False,False,SOME,VERY_GOOD,,False,False,,56.75,,,,0,2.0,2132.158691,121000.0,2018-09-24 00:00:00,2018-09-19 00:00:00,2018-09-24 00:00:00,BUY,https://www.immobilienscout24.de/107326952,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,CONSUMPTION_BASED,,1999.0,Seifert Immobilien GmbH & Co. KG,Jutta Bürger,BODENBELAG:\nParkett (teilweise)\nBAD:\nBad mi...,2018-09-24,2018-09-17,2.0,9,2018,bodenbelag parkett teilweis bad bad wann kuch ...,bodenbelag parkett teilweis bad bad wann kuch ...,3141235,3141235
39,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,6020409,2,ImmobilienScout24.de,,Dresden,Germany,"An der Herzogin Garten 10, 01067 Dresden, Germany",0101000020E6100000DA38622D3E752B401E335019FF86...,10,01067,An der Herzogin Garten,51.054660,13.728990,1.0,,,True,,,,,,True,True,,FINISHED_RECENTLY,2019.0,,,,,,,,LUXURY,False,FREE_NEW,,APARTMENT,REGULAR_APARTMENT,,,,True,2.0,,False,False,True,FEW,FINISHED_RECENTLY,,False,False,,162.19,,,2.0,1,4.0,7090.449707,1150000.0,2019-09-03 00:00:00,2019-05-09 00:00:00,2019-09-03 00:00:00,BUY,https://www.immobilienscout24.de/111421961,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,,,2019.0,CITYMAKLER DRESDEN GmbH + Co. KG,Carsten Hamm,Kurfürstliche Orangerie - HIER erleben Sie hoc...,2019-09-09,2019-05-13,18.0,5,2019,kurfurst orangeri erleb hochmodern wohnraum um...,kurfurst orangeri erleb hochmodern wohnraum um...,6020408,6020408clu0
42,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,775050,2,ImmobilienScout24.de,,Dresden,Germany,"Salzgasse 4, 01067 Dresden, Germany",0101000020E6100000B16D5166837C2B4051888043A886...,4,01067,Salzgasse,51.052010,13.743190,,,,,,,,,,,,,,2011.0,,,,,,,,,,,,APARTMENT,,,,,False,,,False,False,,,,,,,,50.00,,,,0,2.0,3727.939697,188000.0,2018-03-12 00:00:00,2018-02-08 00:00:00,2018-03-12 00:00:00,BUY,https://www.immobilienscout24.de/103571931,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,,,,AZ Agentur für Zwangsversteigerungsinformation...,,"Wohnung Nr. 7 im 2. OG, 50,43 m² Wfl. (vertrag...",2018-03-12,2018-02-05,6.0,2,2018,wohnung nr 7 im 2 og wfl vertrag vereinbart 44...,,738216,738216
47,2019-08-28 21:28:36.469330,2019-09-19 14:49:02.233626,True,4184820,2,ImmobilienScout24.de,,Dresden,Germany,"Am Schießhaus 5, 01067 Dresden, Germany",0101000020E61000006ADE718A8E742B40026553AEF086...,5,01067,Am Schießhaus,51.054220,13.727650,0.0,GROUND_FLOOR,,True,,,True,,,,True,,FINISHED,1997.0,,False,CENTRAL,,"[DISTRICT_BASED, OIL]",,,NORMAL,True,CURRENTLY_RENTED,,APARTMENT,REGULAR_APARTMENT,,,,True,1.0,,False,False,False,SOME,VERY_GOOD,,False,False,,53.00,,,1.0,0,2.0,2250.000000,119745.0,2019-03-19 00:00:00,2019-02-01 00:00:00,2019-03-19 00:00:00,BUY,https://www.immobilienscout24.de/109681495,[https://d13xqcsmyavurl.cloudfront.net/immobil...,,,,CONSUMPTION_BASED,,,Wiesendorf & Geblonsky Immobilien,,Zum Verkauf steht eine modern gestaltete und a...,2019-03-25,2019-02-04,8.0,2,2019,verkauf steht mod gestaltet attraktiv ausgesta...,,4828361,4828361clu0


# Some Examples

In [6]:
# Example group 1617753: show the columns relevant for judging the grouping.
example_columns = [
    "obj_unique_id_clus",
    "structure_area_living",
    "structure_rooms_living",
    "expose_price_total",
    "expose_content_text_tokens",
    "obj_unique_id",
    "offline_corrected",
    "online_corrected",
    "weeks_on_market",
]
all_cities_buy_new.loc[all_cities_buy_new["obj_unique_id"] == "1617753", example_columns]

Unnamed: 0,obj_unique_id_clus,structure_area_living,structure_rooms_living,expose_price_total,expose_content_text_tokens,obj_unique_id,offline_corrected,online_corrected,weeks_on_market
1489900,1617753clu3,52.0,3.0,36000.0,wohnung nr 63 im 1 og recht 52 besteh 3 zimm k...,1617753,2018-06-11,2018-06-11,1.0
1489906,1617753clu9,40.0,2.0,26000.0,wohnung nr 64 im 1 og link 40 besteh 2 zimm ku...,1617753,2018-06-11,2018-06-11,1.0
1489921,1617753clu9,40.0,2.0,26000.0,wohnung nr 64 im 1 og link 40 besteh 2 zimm ku...,1617753,2018-06-18,2018-06-18,1.0
1489931,1617753clu3,52.0,3.0,36000.0,wohnung nr 63 im 1 og recht 52 besteh 3 zimm k...,1617753,2018-06-18,2018-06-18,1.0
1489949,1617753clu2,38.0,2.0,24000.0,wohnung nr 62 im eg link 38 besteh 2 zimm kuch...,1617753,2018-06-11,2018-06-11,1.0
1489952,1617753clu7,40.0,1.0,26000.0,wohnung nr 57 im 1 og recht 40 besteh zimm kuc...,1617753,2018-06-18,2018-06-18,1.0
1489980,1617753clu4,28.0,1.0,22000.0,wohnung nr 66 im 2 og link 28 besteh zimm kuch...,1617753,2018-06-18,2018-06-18,1.0
1489985,1617753clu0,68.0,3.0,39000.0,wohnung nr 1 im eg recht besteh 3 zimm flur ku...,1617753,2019-01-14,2019-01-14,1.0
1490013,1617753clu0,68.0,3.0,39000.0,wohnung nr 1 im eg recht besteh 3 zimm flur ku...,1617753,2019-01-07,2019-01-07,1.0
1490019,1617753clu5,50.0,2.0,23000.0,wohnung nr 61 im eg recht 50 besteh 2 zimm kuc...,1617753,2018-06-11,2018-06-11,1.0


In [7]:
# Example group 6651815: show the columns relevant for judging the grouping.
example_columns = [
    "obj_unique_id_clus",
    "structure_area_living",
    "structure_rooms_living",
    "expose_price_total",
    "expose_content_text_tokens",
    "obj_unique_id",
    "offline_corrected",
    "online_corrected",
    "weeks_on_market",
]
all_cities_buy_new.loc[all_cities_buy_new["obj_unique_id"] == "6651815", example_columns]

Unnamed: 0,obj_unique_id_clus,structure_area_living,structure_rooms_living,expose_price_total,expose_content_text_tokens,obj_unique_id,offline_corrected,online_corrected,weeks_on_market
1493491,6651815clu1,47.0,2.0,23400.0,wohnung nr 5 im 1 og mitt 47 besteh 2 zimm kuc...,6651815,2019-06-17,2019-06-17,1.0
1493495,6651815clu1,47.0,2.0,23300.0,wohnung nr 1 im eg link 47 besteh 2 zimm kuch ...,6651815,2019-05-13,2019-05-13,1.0
1493500,6651815clu0,63.0,3.0,31500.0,wohnung nr 3 im eg recht 63 besteh 3 zimm kuch...,6651815,2019-06-10,2019-06-03,2.0
1493502,6651815clu0,63.0,3.0,31500.0,wohnung nr 6 im 1 og recht 63 besteh 3 zimm ku...,6651815,2019-04-15,2019-04-15,1.0
1493504,6651815clu1,47.0,2.0,23400.0,wohnung nr 2 im eg mitt 47 wfl besteh 2 zimm k...,6651815,2019-05-06,2019-05-06,1.0
1493505,6651815clu1,47.0,2.0,23300.0,wohnung nr 1 im eg link 47 besteh 2 zimm kuch ...,6651815,2019-05-06,2019-05-06,1.0
1493506,6651815clu0,63.0,3.0,31500.0,wohnung nr 6 im 1 og recht 63 besteh 3 zimm ku...,6651815,2018-10-29,2018-10-29,1.0
1493509,6651815clu0,63.0,3.0,31500.0,wohnung nr 3 im eg recht 63 besteh 3 zimm kuch...,6651815,2018-10-29,2018-10-29,1.0
1493516,6651815clu1,47.0,2.0,23400.0,wohnung nr 2 im eg mitt 47 wfl besteh 2 zimm k...,6651815,2019-04-15,2019-04-15,1.0
1493518,6651815clu1,47.0,2.0,23400.0,wohnung nr 2 im eg mitt 47 wfl besteh 2 zimm k...,6651815,2019-04-29,2019-04-29,1.0


In [13]:
# Example group 5708217: show the columns relevant for judging the grouping.
example_columns = [
    "obj_unique_id_clus",
    "structure_area_living",
    "structure_rooms_living",
    "expose_price_total",
    "expose_content_text_tokens",
    "obj_unique_id",
    "offline_corrected",
    "online_corrected",
    "weeks_on_market",
]
all_cities_buy_new.loc[all_cities_buy_new["obj_unique_id"] == "5708217", example_columns]

Unnamed: 0,obj_unique_id_clus,structure_area_living,structure_rooms_living,expose_price_total,expose_content_text_tokens,obj_unique_id,offline_corrected,online_corrected,weeks_on_market
1494254,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-04-29,2019-04-29,1.0
1494258,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-05-27,2019-05-27,1.0
1494260,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-08-05,2019-07-29,2.0
1494264,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-06-17,2019-06-17,1.0
1494269,5708217clu1,49.1,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-07-01,2019-07-01,1.0
1494271,5708217clu1,49.1,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-07-15,2019-07-08,2.0
1494275,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og besteh 2 zimm kuch bad so...,5708217,2019-02-25,2019-02-25,1.0
1494282,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og besteh 2 zimm kuch bad so...,5708217,2019-03-25,2019-03-25,1.0
1494286,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-05-06,2019-05-06,1.0
1494288,5708217clu0,49.0,2.0,14000.0,wohnung nr 9 im 1 og link besteh 2 zimm kuch b...,5708217,2019-08-05,2019-08-05,1.0
