# 1. Right Move Data Cleaning

In [0]:
import pandas as pd
import numpy as np

In [0]:
right_move_brent = spark.table("silver.edgar_sarto_revenue.right_move_brent")
right_move_camden = spark.table("silver.edgar_sarto_revenue.right_move_camden")
right_move_city_of_london = spark.table("silver.edgar_sarto_revenue.right_move_city_of_london")
right_move_city_of_westminster = spark.table("silver.edgar_sarto_revenue.right_move_city_of_westminster")
right_move_ealing = spark.table("silver.edgar_sarto_revenue.right_move_ealing")
right_move_hammersmith_fulham = spark.table("silver.edgar_sarto_revenue.right_move_hammersmith_fulham")
right_move_hownslow = spark.table("silver.edgar_sarto_revenue.right_move_hownslow")
right_move_islington = spark.table("silver.edgar_sarto_revenue.right_move_islington")
right_move_lambeth = spark.table("silver.edgar_sarto_revenue.right_move_lambeth")
right_move_lewisham = spark.table("silver.edgar_sarto_revenue.right_move_lewisham")
right_move_richmond_upon_thames = spark.table("silver.edgar_sarto_revenue.right_move_richmond_upon_thames")
right_move_southwark = spark.table("silver.edgar_sarto_revenue.right_move_southwark")
right_move_tower_hamlets = spark.table("silver.edgar_sarto_revenue.right_move_tower_hamlets")
right_move_wandsworth = spark.table("silver.edgar_sarto_revenue.right_move_wandsworth")

RMBR = right_move_brent.toPandas()
RMCA = right_move_camden.toPandas()
RMCI = right_move_city_of_london.toPandas()
RMWE = right_move_city_of_westminster.toPandas()
RMEA = right_move_ealing.toPandas()
RMHA = right_move_hammersmith_fulham.toPandas()
RMHO = right_move_hownslow.toPandas()
RMIL = right_move_islington.toPandas()
RMLA = right_move_lambeth.toPandas()
RMLE = right_move_lewisham.toPandas()
RMRO = right_move_richmond_upon_thames.toPandas()
RMSO = right_move_southwark.toPandas()
RMTH = right_move_tower_hamlets.toPandas()
RMWA = right_move_wandsworth.toPandas()

## 1.1. City / Brough / Zone (Postal Code) / Agency

In [0]:
##Before unifying everything for each dataset based on its Boroughs

RMBR['Borough'] = 'BRENT'
RMCA['Borough'] = 'CAMDEN'
RMCI['Borough'] = 'CITY OF LONDON'
RMWE['Borough'] = 'CITY OF WESTMINSTER'
RMEA['Borough'] = 'EALING'
RMHA['Borough'] = 'HAMMERSMITH FULLHAM'
RMHO['Borough'] = 'HOWNSLOW'
RMIL['Borough'] = 'ISLINGTON'
RMLA['Borough'] = 'LAMBETH'
RMLE['Borough'] = 'LEWISHAM'
RMRO['Borough'] = 'RICHMOND UPON THAMES'
RMSO['Borough'] = 'SOUTHWARK'
RMTH['Borough'] = 'TOWER HAMLETS'
RMWA['Borough'] = 'WANDSWORTH'


dataframes = [RMBR, RMCA, RMCI, RMWE, RMEA, RMHA, RMHO, RMIL, RMLA, RMLE, RMRO, RMSO, RMTH, RMWA]

RMT = pd.concat(dataframes, ignore_index=True)

RMT['City'] = 'LONDON'

RMT['City'].value_counts()


LONDON    18252
Name: City, dtype: int64

In [0]:
## DROP ROWS WITHOUT A PRICE
RMT = RMT.dropna(subset=['Text'])

In [0]:
RMT = RMT.rename(columns={'Text10':'Street'})

In [0]:
# ZONE (Postal Code)

RMT['Postal_Code'] = RMT['Street'].str.extract(r'(\b[A-Z]{1,2}\d{1,2}\s)')

RMT['Postal_Code'].value_counts()

E14      57
SW11     24
SW17     24
NW10     23
E1       22
         ..
NW8       1
TW12      1
SE26      1
SE14      1
B14       1
Name: Postal_Code, Length: 68, dtype: int64

In [0]:
## AGENCY

RMT['Text12'] = RMT['Text12'].str.upper()
RMT.rename(columns={'Text12': 'Agency'}, inplace=True)
RMT['Agency'].value_counts()

OPENRENT, LONDON                           1172
FOXTONS, FULHAM BROADWAY                    182
FOXTONS, LONDON BRIDGE                      155
FOXTONS, ELEPHANT & CASTLE                  132
INTERLET SALES AND LETTINGS, KENSINGTON     121
                                           ... 
PURPLEBRICKS, COVERING NORTH LONDON           1
ARLINGTON ESTATES, LONDON                     1
VAROSI LETTINGS & ESTATES, LONDON             1
NEILSON & BAUER LTD, ISLINGTON                1
MBL ESTATES LTD, WIMBLEDON                    1
Name: Agency, Length: 1774, dtype: int64

## 1.2. Prices

In [0]:
# MONTHLY PRICE

price = {'£': '', ' pcm': '', ',': ''}

RMT.rename(columns={'Text': 'Monthly_Price'}, inplace=True)

RMT['Monthly_Price'].replace(price, regex=True, inplace=True)

RMT['Monthly_Price'].replace('', np.nan, inplace=True)

RMT.dropna(subset=['Monthly_Price'], inplace=True)

RMT['Monthly_Price'] = RMT['Monthly_Price'].astype(float)

In [0]:
## SECURITY DEPOSIT PRICE

RMT['Security_Deposit'] = RMT['Text2'].str.extract(r'£([\d,]+)')

sec_dep = {',': ''}

RMT['Security_Deposit'].replace(sec_dep, regex=True, inplace=True)

RMT['Security_Deposit'].replace('', np.nan, inplace=True)

RMT.dropna(subset=['Security_Deposit'], inplace=True)

RMT['Security_Deposit'] = RMT['Security_Deposit'].astype(float)

## 1.3. Bedrooms / Bathrooms / Size / Floor / Elevator

In [0]:
## BEDROOMS

def shift_values(row):
    if pd.notna(row['Text8']) and ('agent' in row['Text8']):
        row['Text8'] = None
    if pd.notna(row['Text8']) and ('ft' in row['Text8']):
        row['Text8'] = None
    if row['Text8'] is None:
        row['Text8'] = '0'
    return row

RMT = RMT.apply(shift_values, axis=1)

RMT.rename(columns={'Text8': 'Bedrooms'}, inplace=True)

In [0]:
## BATHROOMS

RMT['Text91'] = RMT['Text9']

def shift_values(row):
    if pd.notna(row['Text9']) and ('agent' in row['Text9']):
        row['Text9'] = None
    if pd.notna(row['Text9']) and ('ft' in row['Text9']):
        row['Text9'] = None
    if row['Text9'] is None:
        row['Text9'] = '0'
    return row

RMT = RMT.apply(shift_values, axis=1)

RMT.rename(columns={'Text9': 'Bathrooms'}, inplace=True)

In [0]:
## SIZE

def shift_values(row):
    if pd.notna(row['Text91']) and ('ft' not in row['Text91']):
        row['Text91'] = None
    return row

RMT = RMT.apply(shift_values, axis=1)

RMT.rename(columns={'Text91': 'Size'}, inplace=True)

ft = {' sq ft': ''}

RMT['Size'].replace(ft, regex=True, inplace=True)

## 1.4. Extras
###### (Av. Date / Lend Type / Parking / Min Tenacy / Furnished / Type)

In [0]:
# AV. DATE

def process_dates(row):
    if pd.notna(row['Text1']):
        if 'Now' in row['Text1']:
            return pd.Timestamp.now().strftime('%Y-%m-%d')
        else:
            date_str = row['Text1'].split(': ')[-1] 
            return pd.to_datetime(date_str, format='%d/%m/%Y', errors='coerce') 
    return None 

RMT['Text1'] = RMT.apply(process_dates, axis=1)

RMT.rename(columns={'Text1': 'Av_Date'})

  RMT['Text1'] = RMT.apply(process_dates, axis=1)


Unnamed: 0,Field,Monthly_Price,Av_Date,Street,Text11,Agency,Text13,Text14,Text15,Text16,Text17,Text18,Text19,Text2,Text20,Text21,Text22,Text3,Text4,Text5,Text6,Text7,Bedrooms,Bathrooms,_Link,_Text,Borough,City,Postal_Code,Security_Deposit,Size
1,,7950.0,2025-03-04,"Dobree Avenue, London, NW10",MARKETED BY,"CAMERONS STIFF & CO, WILLESDEN GREEN, LONDON, ...",4 double bedrooms,Off street parking for several cars,AVAILABLE NOW is this double-fronted 1930s-bui...,Band: G,PARKINGDetails of how and where vehicles can b...,Yes,GARDENA property has access to an outdoor spac...,"Deposit: £11,007A deposit provides security fo...",Ask agent,ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Long term,Furnish type: Unfurnished,Council Tax: Ask agent,Detached,4,3,https://www.rightmove.co.uk/properties/1317574...,,BRENT,LONDON,,11007.0,
3,,1815.0,2025-03-04,"Viscount House, Regency Heights, NW10",MARKETED BY,"UNA LIVING, MANCHESTER",1 DOUBLE BEDROOMS,FREE ACCESS TO BODY CORP WELLNESS & FITTNESS,This modern BTR apartment block is managed ful...,Ask agent,PARKINGDetails of how and where vehicles can b...,Ask agent,GARDENA property has access to an outdoor spac...,"Deposit: £2,095A deposit provides security for...",Ask agent,ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Long term,Furnish type: Part furnished,Council Tax: Ask agent,Flat,1,1,https://www.rightmove.co.uk/properties/1349664...,,BRENT,LONDON,,2095.0,
5,,5200.0,NaT,"Ridley Road, Kensal Green, London, NW10",MARKETED BY,"FOXTONS, WILLESDEN GREEN",Beautiful 3 bedroom duplex flat,Bay fronted reception room with study area,SHORT LET. Situated on a quiet street in Kensa...,Ask agent,PARKINGDetails of how and where vehicles can b...,Ask agent,GARDENA property has access to an outdoor spac...,"Deposit: £1,200A deposit provides security for...","Rear garden,",ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Short term,Furnish type: Ask agent,Council Tax: Ask agent,Flat,3,2,https://www.rightmove.co.uk/properties/1529208...,,BRENT,LONDON,,1200.0,
6,,12502.0,NaT,"Sherrick Green Road, Gladstone Park, London, NW10",MARKETED BY,"FOXTONS, WILLESDEN GREEN",5 bedroom house,Open plan reception room,SHORT LET. Situated on a quiet residential roa...,Ask agent,PARKINGDetails of how and where vehicles can b...,Yes,GARDENA property has access to an outdoor spac...,"Deposit: £2,885A deposit provides security for...","Rear garden,",ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Short term,Furnish type: Ask agent,Council Tax: Ask agent,Semi-Detached,5,6,https://www.rightmove.co.uk/properties/1529212...,,BRENT,LONDON,,2885.0,
7,,7003.0,NaT,"Sherrick Green Road, Gladstone Park, London, NW10",MARKETED BY,"FOXTONS, WILLESDEN GREEN",5 bedroom house,Open plan reception room,LONG LET. Situated on a quiet residential road...,Ask agent,PARKINGDetails of how and where vehicles can b...,Yes,GARDENA property has access to an outdoor spac...,"Deposit: £8,077A deposit provides security for...","Rear garden,",ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Long term,Furnish type: Unfurnished,Council Tax: Ask agent,Semi-Detached,5,6,https://www.rightmove.co.uk/properties/1529209...,,BRENT,LONDON,,8077.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18247,,3497.0,2025-03-04,"York Road, Battersea, London, SW11",MARKETED BY,"KNIGHT FRANK - LETTINGS, BATTERSEA",2 bedrooms,1 reception room,This stylish and contemporary two-bedroom apar...,Ask agent,PARKINGDetails of how and where vehicles can b...,Ask agent,GARDENA property has access to an outdoor spac...,"Deposit: £4,038A deposit provides security for...",Ask agent,ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Long term,Furnish type: Furnished,Council Tax: Ask agent,Flat,2,2,https://www.rightmove.co.uk/properties/1560697...,,WANDSWORTH,LONDON,,4038.0,
18248,,3748.0,2025-03-04,"6 York Road, Battersea, London, SW11",MARKETED BY,"KNIGHT FRANK - LETTINGS, BATTERSEA",2 bedrooms,1 reception room,This stylish and contemporary two-bedroom apar...,Band: F,PARKINGDetails of how and where vehicles can b...,Ask agent,GARDENA property has access to an outdoor spac...,"Deposit: £4,326A deposit provides security for...",Ask agent,ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: Ask agentHow long the landlord o...,Let type: Long term,Furnish type: Furnished,Council Tax: Ask agent,Flat,2,2,https://www.rightmove.co.uk/properties/8707769...,,WANDSWORTH,LONDON,,4326.0,
18249,,1800.0,2025-03-04,"St. John's Hill, London, SW11",MARKETED BY,"STIRLING ACKROYD LETTINGS, NINE ELMS AND WESTM...",One bedroom,Open-plan living room,No Deposit Option Available For Tenants.\n\nLo...,Band: C,PARKINGDetails of how and where vehicles can b...,Ask agent,GARDENA property has access to an outdoor spac...,"Deposit: £2,076A deposit provides security for...",Ask agent,ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: 12 monthsHow long the landlord o...,Let type: Long term,Furnish type: Furnished,Council Tax: Ask agent,Apartment,1,1,https://www.rightmove.co.uk/properties/1580560...,,WANDSWORTH,LONDON,,2076.0,
18250,,2500.0,2025-04-01,"Francis House, \n25 Eltringham Street, SW18",MARKETED BY,"CHESTERTONS, WANDSWORTH LETTINGS",A beautifully presented two double bedroom apa...,Former show flat,James Pendleton has the pleasure of introducin...,Band: E,PARKINGDetails of how and where vehicles can b...,Yes,GARDENA property has access to an outdoor spac...,"Deposit: £2,885A deposit provides security for...",Ask agent,ACCESSIBILITYHow a property has been adapted t...,Ask agent,Min. Tenancy: 12 monthsHow long the landlord o...,Let type: Long term,Furnish type: Unfurnished,Council Tax: Ask agent,Flat,2,2,https://www.rightmove.co.uk/properties/1580587...,,WANDSWORTH,LONDON,,2885.0,


In [0]:
# LET TYPE

RMT['Let_Type'] = RMT['Text4']

def shift_values(row):
    if pd.notna(row['Let_Type']) and 'Let type' not in row['Let_Type']:
        row['Let_Type'] = None 
    return row

RMT = RMT.apply(shift_values, axis=1)

n = {'\n': ' '}

RMT['Let_Type'].replace(n, regex=True, inplace=True)

n1 = {'Let type: Long term': '0', 'Let type: Short term': '1'}

RMT['Let_Type'].replace(n1, regex=True, inplace=True)

RMT['Let_Type'].value_counts()

0    12138
1     1517
Name: Let_Type, dtype: int64

In [0]:
# FURNISHED

RMT['Furnished'] = RMT['Text5']

def shift_values(row):
    if pd.notna(row['Furnished']) and 'Unfurnished' in row['Furnished']:
        row['Furnished'] = '0' 
    if pd.notna(row['Furnished']) and 'Ask agent' in row['Furnished']:
        row['Furnished'] = '0'
    if pd.notna(row['Furnished']) and 'Council' in row['Furnished']:
        row['Furnished'] = '0'
    if pd.notna(row['Furnished']) and 'type: Furnished' in row['Furnished']:
        row['Furnished'] = '1'
    if pd.notna(row['Furnished']) and 'type:\nFurnished' in row['Furnished']:
        row['Furnished'] = '1'
    if pd.notna(row['Furnished']) and 'Part furnished' in row['Furnished']:
        row['Furnished'] = '1'
    return row

RMT = RMT.apply(shift_values, axis=1)

RMT['Furnished'].value_counts()

1    9289
0    4900
Name: Furnished, dtype: int64

In [0]:
# PARKING

RMT['Parking'] = None

def shift_values(row):
    if pd.notna(row['Text17']) and 'PARKING' in row['Text17']:
        row['Parking'] = row['Text18']
    if pd.notna(row['Text19']) and 'PARKING' in row['Text19']:
        row['Parking'] = row['Text20']
    return row

RMT = RMT.apply(shift_values, axis=1)


def shift_values(row):
    if pd.notna(row['Parking']):
        if 'Ask agent' in row['Parking']:
            row['Parking'] = '0'
        elif 'Off street' in row['Parking']:
            row['Parking'] = '0'
        elif 'On street' in row['Parking']:
            row['Parking'] = '0'
        elif 'No parking' in row['Parking']:
            row['Parking'] = '0'
        else:
            row['Parking'] = '1'
    else:
        row['Parking'] = '0'
    return row

RMT = RMT.apply(shift_values, axis=1)

In [0]:
# TYPE

RMT['Apt_Type'] = RMT['Text7']

def shift_values(row):
    if pd.notna(row['Apt_Type']) and 'sq ft' in row['Apt_Type']:
        row['Apt_Type'] = None
    if pd.notna(row['Apt_Type']) and 'Ask agent' in row['Apt_Type']:
        row['Apt_Type'] = None
    if pd.notna(row['Text19']) and len(row['Text7']) < 3:
        row['Apt_Type'] = None
    return row

RMT = RMT.apply(shift_values, axis=1)

In [0]:
# MINIMUM TENACY

RMT['Min_Tenacy'] = RMT['Text3'].str.extract(r'(\d+\s+month)')

month = {' month':''}

RMT['Min_Tenacy'].replace(month, regex=True, inplace=True)

RMT['Min_Tenacy'].value_counts()

12     2371
6       416
1       158
3        85
2        48
18       23
24       22
5         8
11        6
30        6
9         5
4         4
8         3
983       1
10        1
36        1
14        1
Name: Min_Tenacy, dtype: int64

In [0]:
RMT['Description'] = RMT['Text15']

## 1.5. Final Dataset

In [0]:
RMT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14189 entries, 1 to 18251
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Field             14189 non-null  object        
 1   Monthly_Price     14189 non-null  float64       
 2   Text1             10352 non-null  datetime64[ns]
 3   Street            14189 non-null  object        
 4   Text11            14189 non-null  object        
 5   Agency            14189 non-null  object        
 6   Text13            14189 non-null  object        
 7   Text14            14189 non-null  object        
 8   Text15            14189 non-null  object        
 9   Text16            14189 non-null  object        
 10  Text17            14189 non-null  object        
 11  Text18            14189 non-null  object        
 12  Text19            14189 non-null  object        
 13  Text2             14189 non-null  object        
 14  Text20            1418

In [0]:
RMT.drop(columns=['Text1','Text11','Text13','Text14','Text15','Text16','Text17',
                 'Text18','Text19','Text2','Text20','Text21','Text22','Text3','Text4','Text5','Text6','Text7','_Link','_Text'],inplace=True)

In [0]:
RMT.tail

<bound method NDFrame.tail of       Field  ...                                        Description
1            ...  AVAILABLE NOW is this double-fronted 1930s-bui...
3            ...  This modern BTR apartment block is managed ful...
5            ...  SHORT LET. Situated on a quiet street in Kensa...
6            ...  SHORT LET. Situated on a quiet residential roa...
7            ...  LONG LET. Situated on a quiet residential road...
...     ...  ...                                                ...
18247        ...  This stylish and contemporary two-bedroom apar...
18248        ...  This stylish and contemporary two-bedroom apar...
18249        ...  No Deposit Option Available For Tenants.\n\nLo...
18250        ...  James Pendleton has the pleasure of introducin...
18251        ...  A stunning 2 bedroom, 2 bathroom apartment wit...

[14189 rows x 17 columns]>

#

### Try to get the postal code

In [0]:
!pip install geopy

In [0]:
from geopy.geocoders import Nominatim
import time
geolocator = Nominatim(user_agent="my_custom_application", timeout=30000)

In [0]:
def limpiar_direccion(direccion):
    partes = [p.strip() for p in direccion.split(",")]
    if len(partes) == 3:
        return partes[0]
    elif len(partes) >= 4:
        return ", ".join(partes[:2])
    else:
        return direccion.strip()

RMT['Street'] = RMT['Street'].apply(limpiar_direccion)

RMT['Street'].sample(10)

5663     Stunning 3 Bed with Amazing City Views
602                       Lambert Walk, Wembley
5488                             Lancaster Gate
10605                        Moray Road, N4 3LG
6082                      Green Street, Mayfair
4979                                York Street
3808           Moore House, Grosvenor Waterside
2771                                Antrim Road
16628                             Carnation Way
17520                     York Place, Battersea
Name: Street, dtype: object

In [0]:
data = [] 

Test_Geo = pd.DataFrame(data)

Test_Geo['Street'] = RMT['Street'].iloc[0:19000]

place = [Test_Geo['Street']]

In [0]:
from functools import lru_cache
import concurrent.futures

@lru_cache(maxsize=1000)
def geocode_address(address):
    try:
        location = geolocator.geocode(address, timeout=10)
        if location:
            return location.address
        else:
            return None
    except Exception as e:
        return None

def geocode_with_delay(address):
    result = geocode_address(address)
    time.sleep(1)
    return result

addresses = Test_Geo['Street'].tolist()

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(geocode_with_delay, addresses))

Test_Geo['GeoAddress'] = results


In [0]:
Test_Geo_12 = Test_Geo

In [0]:
Test_Geo = [Test_Geo_1, Test_Geo_2, Test_Geo_3, Test_Geo_4, 
              Test_Geo_5, Test_Geo_6, Test_Geo_7, Test_Geo_8, 
              Test_Geo_9, Test_Geo_10, Test_Geo_11, Test_Geo_12]

Test_Geo = pd.concat(Test_Geo, ignore_index=True)

In [0]:
RMT_Geo = pd.merge(RMT, Test_Geo, left_index=True, right_index=True)

In [0]:
RMT_Geo['Postal_Code_2'] = RMT_Geo['GeoAddress'].str.extract(r'England,\s*([^,]+)')[0]

In [0]:
RMT_Geo_Spark = spark.createDataFrame(RMT_Geo)

RMT_Geo_Spark.write.mode("overwrite").saveAsTable("silver.edgar_sarto_revenue.Right_Move_LONDON_CLEAN")