In [1]:
import pandas as pd

In [2]:
# Read the raw Lisbon Airbnb listings from CSV into a DataFrame
raw_csv_path = '../data/airbnb_lisbon.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Data overview
# A notebook cell only renders its LAST expression. The original packed
# head() and info() into a tuple on a non-final line, so the head() preview
# was silently discarded. Each view is now emitted explicitly.
df.info()         # prints the column / non-null / dtype summary as a side effect
print(df.dtypes)  # printed explicitly because it is no longer the last expression
df.head()         # last expression -> rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13578 entries, 0 to 13577
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   room_id               13578 non-null  int64  
 1   survey_id             13578 non-null  int64  
 2   host_id               13578 non-null  int64  
 3   room_type             13578 non-null  object 
 4   country               0 non-null      float64
 5   city                  13578 non-null  object 
 6   borough               0 non-null      float64
 7   neighborhood          13578 non-null  object 
 8   reviews               13578 non-null  int64  
 9   overall_satisfaction  13578 non-null  float64
 10  accommodates          13578 non-null  int64  
 11  bedrooms              13578 non-null  float64
 12  bathrooms             0 non-null      float64
 13  price                 13578 non-null  float64
 14  minstay               0 non-null      float64
 15  name               

room_id                   int64
survey_id                 int64
host_id                   int64
room_type                object
country                 float64
city                     object
borough                 float64
neighborhood             object
reviews                   int64
overall_satisfaction    float64
accommodates              int64
bedrooms                float64
bathrooms               float64
price                   float64
minstay                 float64
name                     object
last_modified            object
latitude                float64
longitude               float64
location                 object
dtype: object

In [4]:
# Count nulls per column and show only the columns that have at least one
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

country      13578
borough      13578
bathrooms    13578
minstay      13578
name            28
dtype: int64

In [5]:
# Remove every column that contains nothing but missing values
df = df.dropna(axis='columns', how='all')

In [6]:
# Show the rows whose 'name' is still missing
name_is_missing = df['name'].isnull()
df.loc[name_is_missing]

Unnamed: 0,room_id,survey_id,host_id,room_type,city,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price,name,last_modified,latitude,longitude,location
7,3049237,1480,15020890,Shared room,Lisbon,Areeiro,0,0.0,1,1.0,174.0,,2017-07-28 01:19:48.101673,38.738405,-9.134452,0101000020E61000007AC37DE4D64422C0F819170E845E...
166,16269727,1480,50575236,Entire home/apt,Lisbon,Santo António,0,0.0,2,1.0,347.0,,2017-07-28 00:30:26.083432,38.718818,-9.150706,0101000020E6100000389F3A56294D22C058E36C3A025C...
241,11961196,1480,43258704,Entire home/apt,Lisbon,Santa Clara,0,0.0,3,1.0,113.0,,2017-07-28 00:06:06.889347,38.782385,-9.158523,0101000020E6100000DF5339ED295122C0C3F011312564...
359,3129185,1480,15891628,Entire home/apt,Lisbon,Penha de França,0,0.0,6,2.0,1201.0,,2017-07-27 23:12:13.730932,38.734562,-9.130074,0101000020E6100000AC1E300F994222C03271AB20065E...
367,3129127,1480,15891325,Entire home/apt,Lisbon,Benfica,0,0.0,4,1.0,1201.0,,2017-07-27 23:12:13.705307,38.755116,-9.197867,0101000020E61000004FEACBD24E6522C0DA571EA4A760...
372,3104053,1480,4967811,Entire home/apt,Lisbon,Campo de Ourique,0,0.0,5,3.0,840.0,,2017-07-27 23:12:12.338974,38.719404,-9.169104,0101000020E6100000DE3EABCC945622C0484E266E155C...
376,3079140,1480,15011587,Entire home/apt,Lisbon,São Domingos de Benfica,0,0.0,2,1.0,781.0,,2017-07-27 23:12:12.326194,38.744064,-9.174446,0101000020E610000006D671FC505922C0C310397D3D5F...
378,3118028,1480,15843103,Entire home/apt,Lisbon,Benfica,0,0.0,2,1.0,840.0,,2017-07-27 23:12:12.321135,38.7528,-9.195614,0101000020E61000008D47A984276422C0E3361AC05B60...
383,3118839,1480,15846016,Entire home/apt,Lisbon,Lumiar,0,0.0,6,3.0,600.0,,2017-07-27 23:12:11.247552,38.767741,-9.155973,0101000020E6100000361E6CB1DB4F22C030664B564562...
580,2988197,1480,15237172,Entire home/apt,Lisbon,Avenidas Novas,0,0.0,2,2.0,240.0,,2017-07-27 23:11:50.249008,38.729434,-9.157139,0101000020E610000030DAE385745022C0944BE3175E5D...


In [7]:
# Replace missing listing names with a placeholder, then recount nulls per column
name_filled = df['name'].fillna(value='Unknown name')
df['name'] = name_filled
df.isna().sum()

room_id                 0
survey_id               0
host_id                 0
room_type               0
city                    0
neighborhood            0
reviews                 0
overall_satisfaction    0
accommodates            0
bedrooms                0
price                   0
name                    0
last_modified           0
latitude                0
longitude               0
location                0
dtype: int64

In [8]:
# Check and remove duplicates
# The original cell only counted duplicated rows and never removed them,
# despite its stated purpose. Count first so the number is still shown as
# the cell output, then drop fully identical rows (the first occurrence of
# each is kept, which is the drop_duplicates default).
n_duplicates = df.duplicated().sum()
df = df.drop_duplicates()
n_duplicates

0

In [9]:
# Parse the 'last_modified' column into pandas datetime values
parsed_last_modified = pd.to_datetime(df['last_modified'])
df['last_modified'] = parsed_last_modified

In [10]:
# Derive calendar year and month columns from the parsed 'last_modified' timestamps
modified_dt = df['last_modified'].dt
df['year_modified'] = modified_dt.year
df['month_modified'] = modified_dt.month

In [11]:
# Normalize text data
# Take the sub-frame made of the object-dtype (text) columns
object_columns = df.select_dtypes(include=['object'])

In [12]:
# Flag values that are empty once surrounding whitespace is stripped,
# then count the flagged values per text column
empty_strings = object_columns.apply(lambda col: col.str.strip().eq(''))
empty_strings.sum()

room_type       0
city            0
neighborhood    0
name            0
location        0
dtype: int64

In [13]:
# Lowercase every text column and write the result back into df
lowercased = object_columns.apply(lambda col: col.str.lower())
df[object_columns.columns] = lowercased
df.head(15)  # Inspect the result

Unnamed: 0,room_id,survey_id,host_id,room_type,city,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price,name,last_modified,latitude,longitude,location,year_modified,month_modified
0,14708916,1480,91501272,shared room,lisbon,santo antónio,19,4.5,4,1.0,30.0,low cost hostel marques gardens 4 bed dorm,2017-07-28 01:19:53.215272,38.723987,-9.146613,0101000020e61000003f56f0db104b22c055dd239bab5c...,2017,7
1,7465447,1480,704061,shared room,lisbon,avenidas novas,4,3.5,6,1.0,39.0,room in lisbon center,2017-07-28 01:19:53.206052,38.735061,-9.15451,0101000020e610000057cf49ef1b4f22c054c8957a165e...,2017,7
2,11058290,1480,1379661,shared room,lisbon,santa maria maior,38,4.5,8,1.0,39.0,bed in a 8-bed dorm @ this is lisbon hostel,2017-07-28 01:19:52.034547,38.715726,-9.132671,0101000020e610000057410c74ed4322c0cd72d9e89c5b...,2017,7
3,9019111,1480,46762399,shared room,lisbon,avenidas novas,9,4.0,6,1.0,37.0,shared mixed bunkbed room 1,2017-07-28 01:19:52.020396,38.729017,-9.149932,0101000020e61000006c3f19e3c34c22c0309dd66d505d...,2017,7
4,3123304,1480,8488955,shared room,lisbon,arroios,0,0.0,2,1.0,480.0,quarto no centro de lisboa,2017-07-28 01:19:50.401364,38.721617,-9.136208,0101000020e61000003ae8120ebd4522c06b9e23f25d5c...,2017,7
5,3110303,1480,15806678,shared room,lisbon,arroios,0,0.0,2,1.0,480.0,uefa lisboa,2017-07-28 01:19:50.398554,38.732262,-9.134285,0101000020e6100000cdafe600c14422c0410ddfc2ba5d...,2017,7
6,13514728,1480,77597089,shared room,lisbon,lumiar,0,0.0,2,1.0,180.0,autocaravana com condutor,2017-07-28 01:19:48.104348,38.761582,-9.170725,0101000020e61000001d386744695722c0739cdb847b61...,2017,7
7,3049237,1480,15020890,shared room,lisbon,areeiro,0,0.0,1,1.0,174.0,unknown name,2017-07-28 01:19:48.101673,38.738405,-9.134452,0101000020e61000007ac37de4d64422c0f819170e845e...,2017,7
8,19037719,1480,8477034,shared room,lisbon,são vicente,0,0.0,4,1.0,144.0,orange placito,2017-07-28 01:19:48.098744,38.718112,-9.128895,0101000020e6100000603c8386fe4122c0bd6e1118eb5b...,2017,7
9,17286309,1480,95779617,shared room,lisbon,arroios,0,0.0,1,1.0,144.0,"apartamento, quarto sala",2017-07-28 01:19:47.156007,38.732699,-9.135252,0101000020e6100000eb6f09c03f4522c0ec67b114c95d...,2017,7


In [14]:
# Clear names from invalid symbols
# The previous ASCII-only class [^a-zA-Z0-9\s] also replaced accented letters
# (such as the "ã"/"ó" visible in the neighborhood column) with spaces, which
# breaks non-English listing names apart. For str patterns \w is Unicode-aware,
# so letters and digits of any script are kept; punctuation, symbols and
# underscores (which \w would otherwise keep) are replaced with a space.
df['name'] = df['name'].str.replace(r'[^\w\s]|_', ' ', regex=True)
df['name'].head(15) #Inspect the result

0      low cost hostel marques gardens 4 bed dorm
1                           room in lisbon center
2     bed in a 8 bed dorm   this is lisbon hostel
3                     shared mixed bunkbed room 1
4                      quarto no centro de lisboa
5                                     uefa lisboa
6                       autocaravana com condutor
7                                    unknown name
8                                  orange placito
9                        apartamento  quarto sala
10                            shared dorms lisbon
11       large artistic room   lisbon city center
12                                nice room anjos
13                            room almirante reis
14             lisboa arte hostel   shared room 2
Name: name, dtype: object

In [15]:
# Collapse each run of whitespace into a single space, then trim both ends
df['name'] = df['name'].str.replace(r'\s+', ' ', regex=True).str.strip()
df['name'].head(15)  # Inspect the result

0     low cost hostel marques gardens 4 bed dorm
1                          room in lisbon center
2      bed in a 8 bed dorm this is lisbon hostel
3                    shared mixed bunkbed room 1
4                     quarto no centro de lisboa
5                                    uefa lisboa
6                      autocaravana com condutor
7                                   unknown name
8                                 orange placito
9                        apartamento quarto sala
10                           shared dorms lisbon
11        large artistic room lisbon city center
12                               nice room anjos
13                           room almirante reis
14              lisboa arte hostel shared room 2
Name: name, dtype: object

In [16]:
# Show rows where any column that should never be negative holds a value below zero
non_negative_cols = ['price', 'bedrooms', 'reviews', 'overall_satisfaction', 'accommodates']
has_negative = (df[non_negative_cols] < 0).any(axis=1)
df[has_negative]

Unnamed: 0,room_id,survey_id,host_id,room_type,city,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price,name,last_modified,latitude,longitude,location,year_modified,month_modified


In [17]:
# Show rows whose 'last_modified' timestamp lies in the future relative to run time
current_time = pd.to_datetime('today')
is_future = df['last_modified'] > current_time
df[is_future]

Unnamed: 0,room_id,survey_id,host_id,room_type,city,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price,name,last_modified,latitude,longitude,location,year_modified,month_modified


In [18]:
# Final check for missing values and duplicates
# Only the last expression of a cell is rendered, so the per-column
# missing-value counts were previously computed and thrown away. Print them
# explicitly; the duplicate-row count remains the cell output.
print(df.isnull().sum())
df.duplicated().sum()

0

In [19]:
# Write the cleaned frame to CSV, leaving out the index column
cleaned_csv_path = '../data/cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)