In [None]:
# %pip install dataprep # <- geht nicht, irgendwas mit lievenshtein


In [6]:
import pandas as pd


data = "Airbnb_Open_Data.csv"
df = pd.read_csv(data, delimiter=',', low_memory=False)

# remove duplicates
shape_before_remove_duplicates = df.shape[0]
df = df.drop_duplicates("id", keep='first')
shape_after_remove_duplicates = df.shape[0]
print("Dropped: {}".format(shape_before_remove_duplicates - shape_after_remove_duplicates))

# check for duplicates
ids = df["id"]
df[ids.isin(ids[ids.duplicated()])].sort_values("id")

Dropped: 541


Unnamed: 0,id,NAME,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,country,...,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules,license


In [7]:
# Drop licences because there are very few entries in it

df = df.drop(['license'], axis=1)

In [8]:
# make service_fee and price numeric and remove '$'

df['service_fee'] = df['service_fee'].replace('[^\d.]', '', regex=True).astype(float)
df['price'] = df['price'].replace('[^\d.]', '', regex=True).astype(float) 
pd.to_numeric(df['service_fee'], errors='raise')
pd.to_numeric(df['price'], errors='raise')

# df.dtypes
# df

0          966.0
1          142.0
2          620.0
3          368.0
4          204.0
           ...  
102053     696.0
102054     909.0
102055     387.0
102056     848.0
102057    1128.0
Name: price, Length: 102058, dtype: float64

In [9]:
# check last review column and update date time format and Drop all rows where year > 2024

before = df.shape[0]

print("Starting date time check")

def check_date_format(date_str):
    if pd.isna(date_str):
        return True 
    try:
        pd.to_datetime(date_str, format='%m/%d/%Y', errors='raise')
        return True
    except ValueError:
        return False
    
date_format_check = df['last_review'].apply(check_date_format)

all_dates_correct = date_format_check.all()

if all_dates_correct:
    print("All dates are in the correct format: MM/DD/YYYY")
else:
    print("There are dates not in the correct format:")
    print(df[~date_format_check])
    
print("Dates in correct format:", date_format_check)

# df.to_csv('datacleanv02.csv', index=True)


df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

cutoff_date = pd.Timestamp('2023-12-31')

# Drop rows where 'last_review' is later than the cutoff date
df_clean = df[df['last_review'] <= cutoff_date]

after = df_clean.shape[0]

print("Rows removed due to date constraints: {}".format(before - after))

df_clean[["last_review"]].describe()

Starting date time check
All dates are in the correct format: MM/DD/YYYY
Dates in correct format: 0         True
1         True
2         True
3         True
4         True
          ... 
102053    True
102054    True
102055    True
102056    True
102057    True
Name: last_review, Length: 102058, dtype: bool
Rows removed due to date constraints: 15837


Unnamed: 0,last_review
count,86221
mean,2019-06-10 18:16:17.274678016
min,2012-07-11 00:00:00
25%,2018-10-27 00:00:00
50%,2019-06-13 00:00:00
75%,2019-07-05 00:00:00
max,2022-05-21 00:00:00


In [10]:
# replace line breaks

df_clean = df_clean.replace('\n',' ', regex=True)

In [11]:
# make sure everything is unique and that there are no typos in the text fields

def get_uniques_in_data(df):
    print(df['room_type'].unique())
    print(df['cancellation_policy'].unique())
    print(df['instant_bookable'].unique())
    print(df['country_code'].unique())
    print(df['country'].unique())
    print(df['neighbourhood_group'].unique())
    print(df['host_identity_verified'].unique())

    neighbourhoods = df_clean['neighbourhood'].unique()

    arr = []

    for element in neighbourhoods:
        arr.append(str(element))


    arr.sort()

    # for element in arr:
        # print(element)


get_uniques_in_data(df_clean)


['Private room' 'Entire home/apt' 'Shared room' 'Hotel room']
['strict' 'moderate' 'flexible' nan]
[False True nan]
['US' nan]
['United States' nan]
['Brooklyn' 'Manhattan' 'brookln' 'manhatan' 'Queens' nan 'Staten Island'
 'Bronx']
['unconfirmed' 'verified' nan]


In [12]:
# clean min and max values for availability_365, minimum nights and price columns

def cleanData(df):
    initialSize = df.shape[0]
    filtered_df = df[(df['availability_365'] <= 365) & (df['minimum_nights'] <= 2645) & (df['price'] >= 0)]
    filtered_df = filtered_df[(df['availability_365'] >= 0) & (df['minimum_nights'] >= 0)]
    filtered_df['service_fee'] = df['service_fee'].astype(float).fillna(0.0)
    filtered_df['price'] = df['price'].astype(float).fillna(0.0)
    filteredSize = filtered_df.shape[0]
    if (initialSize != filteredSize):
        print("cleanData() has rows removed: " + str(initialSize - filteredSize) + "\n")
    return filtered_df

df_clean = cleanData(df_clean)

# df_clean

cleanData() has rows removed: 3293



  filtered_df = filtered_df[(df['availability_365'] >= 0) & (df['minimum_nights'] >= 0)]


In [13]:
df_clean["availability_365"].describe()

df_clean.to_csv('Airbnb_Open_Data.cleaned.csv', index=True)