In [1]:
# Import necessary libraries
from Config.config import read_config
from pymongo import MongoClient
import pandas as pd
import numpy as np
from bson.decimal128 import Decimal128

In [2]:
# Module 1: Read MongoDB Airbnb Data
def read_airbnb_sample_data():
    '''Function to read the Airbnb dataset from MongoDB'''
    config = read_config()
    user = config['mongodb']['user']
    password = config['mongodb']['password']
    cluster = config['mongodb']['cluster']
    db_name = config['mongodb']['db']
    collection_name = config['mongodb']['collection']
    
    uri = f'mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority'
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    
    documents = list(collection.find())
    return pd.DataFrame(documents)

In [3]:
# Module 2: Copy DataFrame
def copy_dataframe(df):
    '''Function to copy raw DataFrame'''
    return df.copy()

In [4]:
# Module 3: Identify JSON Columns
def find_json_columns(df):
    '''Function to find dictionary-type columns (JSON-like)'''
    json_columns = [col for col in df.columns if df[col].apply(lambda x: isinstance(x, dict)).any()]
    return json_columns

In [5]:
# Module 4: Normalize JSON Columns
def normalize_json_columns(df, json_columns):
    '''Function to normalize JSON columns and merge with the original DataFrame'''
    for column in json_columns:
        normalized_df = pd.json_normalize(df[column])
        df = pd.concat([df.drop(columns=[column]), normalized_df], axis=1)
    return df


In [6]:
# Module 5: Handle Review Scores
def handle_review_scores(df):
    '''Function to handle review scores based on number of reviews'''
    zero_review_scores = {
        'review_scores_accuracy': 0, 'review_scores_cleanliness': 0, 'review_scores_checkin': 0,
        'review_scores_communication': 0, 'review_scores_location': 0, 'review_scores_value': 0, 'review_scores_rating': 0
    }
    nan_review_scores = {
        'review_scores_accuracy': np.nan, 'review_scores_cleanliness': np.nan, 'review_scores_checkin': np.nan,
        'review_scores_communication': np.nan, 'review_scores_location': np.nan, 'review_scores_value': np.nan,
        'review_scores_rating': np.nan
    }

    df['review_scores'] = df.apply(
        lambda row: zero_review_scores if row['number_of_reviews'] == 0 and row['review_scores'] == {} else row['review_scores'], axis=1
    )
    df['review_scores'] = df.apply(
        lambda row: nan_review_scores if row['review_scores'] == {} else row['review_scores'], axis=1
    )
    return df

In [7]:
# Module 6: Extract Location Coordinates
def extract_coordinates(df):
    '''Function to create longitude and latitude columns from location coordinates'''
    df['longitude'] = df['location.coordinates'].apply(lambda x: x[0])
    df['latitude'] = df['location.coordinates'].apply(lambda x: x[1])
    return df.drop(columns=['location.type', 'location.is_location_exact', 'location.coordinates'])

In [8]:
# Module 7: Handle Columns with High Null Values
def drop_high_null_columns(df, threshold=0.75):
    '''Function to drop columns with more than threshold percentage of null values'''
    null_percentage = df.isnull().mean()
    cols_to_drop = null_percentage[null_percentage > threshold].index
    df.drop(columns=cols_to_drop, inplace=True)
    return df


In [9]:
# Module 8: Convert Decimal128 to Float
def convert_decimal128_to_float(df):
    '''Function to convert Decimal128 type to float in DataFrame'''
    def convert(value):
        if isinstance(value, Decimal128):
            return float(value.to_decimal())
        return value
    
    for col in df.columns:
        df[col] = df[col].apply(convert)
    return df

In [10]:
# Module 9: Handle Missing Values
def handle_missing_values(df, null_numeric_columns, null_categorical_columns, null_datetime_columns):
    '''Function to handle missing values by filling or dropping based on column type'''
    for col in null_numeric_columns:
        df[col] = df[col].fillna(df[col].median())
    for col in null_categorical_columns:
        df[col] = df[col].fillna(df[col].mode().iloc[0])
    df.drop(columns=list(null_datetime_columns), inplace=True)
    return df

In [11]:
# Module 10: Separate Reviews Column
def separate_reviews_column(df):
    '''Function to separate the 'review' column into a new DataFrame and remove it from the original DataFrame'''
    if 'reviews' not in df.columns:
        raise ValueError("'review' column is not present in the DataFrame")
    
    review_df = df[['reviews']].copy()
    df = df.drop(columns=['reviews'])
    return df, review_df

In [12]:
# Module 11: Normalize Nested Review Data
def normalize_nested_reviews(df, column_name):
    '''Function to normalize and flatten a DataFrame column containing lists of dictionaries'''
    nested_data = df[column_name]
    normalized_data = nested_data.apply(lambda x: pd.json_normalize(x) if isinstance(x, list) and x else pd.DataFrame())
    flattened_df = pd.concat(normalized_data.tolist(), ignore_index=True)
    return flattened_df

In [13]:
# Module 12: Process Review Data
def process_review_data(review_df):
    '''Function to process review DataFrame by normalizing nested reviews data'''
    normalized_reviews = normalize_nested_reviews(review_df, 'reviews')
    review_df_final = review_df.drop(columns=['reviews']).join(normalized_reviews)
    review_df_final['day'] = review_df_final['date'].apply(lambda x :x).dt.day
    review_df_final['month'] = review_df_final['date'].apply(lambda x :x).dt.month
    review_df_final['year'] = review_df_final['date'].apply(lambda x :x).dt.year
    return review_df_final


In [14]:
# Module 13: Main Function to Process Airbnb Data and Reviews
def process_airbnb_data_with_reviews():
    '''Main function to process Airbnb data and separate reviews column'''
    # Step 1: Read data
    raw_df = read_airbnb_sample_data()
    df = copy_dataframe(raw_df)
    
    # Step 2: Drop columns with more than 75% null values
    df = drop_high_null_columns(df)
    
    # Step 3: Handle review scores
    df = handle_review_scores(df)
    
    # Step 4: Normalize JSON columns
    json_columns = find_json_columns(df)
    df = normalize_json_columns(df, json_columns)
    
    # Step 5: Extract coordinates
    df = extract_coordinates(df)
    
    # Step 6: Convert Decimal128 to float
    df = convert_decimal128_to_float(df)
    
    # Step 7: Handle missing values
    null_columns = [col for col in df.columns if df[col].isnull().sum() > 0]
    null_numeric_columns = df[null_columns].select_dtypes(include='number').columns
    null_categorical_columns = df[null_columns].select_dtypes(include='object').columns
    null_datetime_columns = df[null_columns].select_dtypes(include='datetime64[ns]').columns
    df = handle_missing_values(df, null_numeric_columns, null_categorical_columns, null_datetime_columns)
    
    # Step 8: Separate the 'review' column
    df, review_df = separate_reviews_column(df)
    
    # Step 9: Process review DataFrame
    review_df_final = process_review_data(review_df)
    
    return df, review_df_final


In [15]:
#  Execute the  processing function
final_df, review_df_final = process_airbnb_data_with_reviews()

In [16]:
# Display the final DataFrames
print("Processed Airbnb DataFrame:")
final_df


Processed Airbnb DataFrame:


Unnamed: 0,_id,listing_url,name,summary,space,description,neighborhood_overview,notes,transit,access,...,availability_365,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,review_scores_rating,longitude,latitude
0,10057447,https://www.airbnb.com/rooms/10057447,Modern Spacious 1 Bedroom Loft,"Prime location, amazing lighting and no annoyi...",Lot's of plants and lights. Really great mode...,"Prime location, amazing lighting and no annoyi...",,,,,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-73.591110,45.518890
1,10009999,https://www.airbnb.com/rooms/10009999,Horto flat with small garden,One bedroom + sofa-bed in quiet and bucolic ne...,Lovely one bedroom + sofa-bed in the living ro...,One bedroom + sofa-bed in quiet and bucolic ne...,This charming ground floor flat is located in ...,"There´s a table in the living room now, that d...","Easy access to transport (bus, taxi, car) and ...",,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-43.230750,-22.966254
2,10047964,https://www.airbnb.com/rooms/10047964,Charming Flat in Downtown Moda,Fully furnished 3+1 flat decorated with vintag...,The apartment is composed of 1 big bedroom wit...,Fully furnished 3+1 flat decorated with vintag...,With its diversity Moda- Kadikoy is one of the...,,,,...,362,10.0,10.0,10.0,10.0,10.0,10.0,100.0,29.031330,40.985850
3,10006546,https://www.airbnb.com/rooms/10006546,Ribeira Charming Duplex,Fantastic duplex apartment with three bedrooms...,Privileged views of the Douro River and Ribeir...,Fantastic duplex apartment with three bedrooms...,"In the neighborhood of the river, you can find...",Lose yourself in the narrow streets and stairc...,Transport: • Metro station and S. Bento railwa...,We are always available to help guests. The ho...,...,239,9.0,9.0,10.0,10.0,10.0,9.0,89.0,-8.613080,41.141300
4,1001265,https://www.airbnb.com/rooms/1001265,Ocean View Waikiki Marina w/prkg,A short distance from Honolulu's billion dolla...,Great studio located on Ala Moana across the s...,A short distance from Honolulu's billion dolla...,You can breath ocean as well as aloha.,,Honolulu does have a very good air conditioned...,"Pool, hot tub and tennis",...,343,9.0,8.0,9.0,9.0,10.0,9.0,84.0,-157.839190,21.286340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5550,9807529,https://www.airbnb.com/rooms/9807529,Upscale Room in The Plateau by LeQube,"Stylish, modern and chic, leQube apartments ar...",All of our homes are instantly bookable and in...,"Stylish, modern and chic, leQube apartments ar...","The apartment is located in a safe, vibrant, a...",This is a 3 story walk-up and this apartment i...,There is easy access to regular transportation...,"48 hours prior to arrival, you will receive se...",...,100,10.0,10.0,10.0,10.0,10.0,10.0,97.0,-73.574530,45.523240
5551,9908871,https://www.airbnb.com/rooms/9908871,Family friendly beach house,"Only 400 metres to the beach,2 x carparks,clos...",,"Only 400 metres to the beach,2 x carparks,clos...",,,,,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.282410,-33.888180
5552,9917859,https://www.airbnb.com/rooms/9917859,StayIN Oporto Música - Apartment,Acabamos de decorar-lo para receber os primeir...,,Acabamos de decorar-lo para receber os primeir...,,,,,...,351,9.0,9.0,9.0,9.0,9.0,8.0,89.0,-8.606220,41.151270
5553,9983221,https://www.airbnb.com/rooms/9983221,Cozy apartment downtown Porto,"Renovated, fully equipped, within walking dist...",Very confortable apartment. Ideal for a long w...,"Renovated, fully equipped, within walking dist...",Just in the street 50 meters from the apartmen...,The apartment is in the 2nd floor without elev...,You have the metro nearby (Lapa Station) the w...,The whole apartment.,...,300,10.0,10.0,10.0,10.0,10.0,10.0,98.0,-8.616930,41.151370


In [17]:
# Display the reviews DataFrames
print("\nProcessed Review DataFrame:")
review_df_final


Processed Review DataFrame:


Unnamed: 0,_id,date,listing_id,reviewer_id,reviewer_name,comments,day,month,year
0,68162172,2016-04-02 04:00:00,10047964,33536670,Mihra,"Zeynep was a most welcoming and generous host,...",2,4,2016
1,58663741,2016-01-03 05:00:00,10006546,51483096,Cátia,A casa da Ana e do Gonçalo foram o local escol...,3,1,2016
2,62413197,2016-02-14 05:00:00,10006546,40031996,Théo,"We are french's students, we traveled some day...",14,2,2016
3,68310569,2016-04-04 04:00:00,10006546,53859850,Bart,We had a spledid time in the old centre of Por...,4,4,2016
4,69693942,2016-04-12 04:00:00,10006546,3135623,Alex,Ana and Goncalo were very helpful and accommod...,12,4,2016
...,...,...,...,...,...,...,...,...,...
5550,147306199,2017-04-26 04:00:00,10840938,27734359,Jeff,Stairs Stairs Stairs Building is under constru...,26,4,2017
5551,151760247,2017-05-14 04:00:00,10840938,17504335,Raphaëlle,Super localisation et appartement conformes au...,14,5,2017
5552,155824974,2017-05-28 04:00:00,10840938,108206373,Sun Ah,위치가 매우 좋습니다. 소호는 걸어서 5분 정도 걸려요. 페리 터미널은 택시로 5분...,28,5,2017
5553,159465396,2017-06-11 04:00:00,10840938,14990706,Helene,"Very central, clean and cute place and a nice ...",11,6,2017


In [18]:
review_df_final['date'].apply(lambda x :x).dt.date

0       2016-04-02
1       2016-01-03
2       2016-02-14
3       2016-04-04
4       2016-04-12
           ...    
5550    2017-04-26
5551    2017-05-14
5552    2017-05-28
5553    2017-06-11
5554    2017-07-02
Name: date, Length: 5555, dtype: object