In [11]:
import pandas as pd
import numpy as np

# Assumes the cleaned calendar dataset is already loaded into a DataFrame named 'data'
# with (at least) the columns: 'listing_id', 'date' (datetime), 'price', 'available'


# Map a month number (1-12) to the name of its season
def get_season(month):
    """Return the season name for a month number.

    Mapping: 3-5 -> 'Spring', 6-8 -> 'Summer', 9-11 -> 'Autumn'.
    Every other value (including 12, 1 and 2) falls through to 'Winter'.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Autumn', (9, 10, 11)),
    )
    for label, months in season_months:
        if month in months:
            return label
    # Anything not matched above is treated as winter.
    return 'Winter'

# `data` is the cleaned calendar dataset; its 'date' column must be
# datetime-typed for the `.dt` accessor below to work.
# Add a 'season' column derived from the month of each calendar date.
data['season'] = data['date'].dt.month.map(get_season)

# Fixed seed so the day sampled for each (listing, season) pair -- and every
# result derived from it -- is the same on each Restart Kernel -> Run All.
RANDOM_SEED = 42

# Keep one randomly chosen day per listing per season.
# GroupBy.sample is the built-in for this: it returns the sampled rows with all
# original columns (including 'listing_id' and 'season'), so it does not rely
# on groupby.apply passing the grouping columns into a Python lambda.
simplified_calendar_data = (
    data.groupby(['listing_id', 'season'])
    .sample(n=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# 'simplified_calendar_data' now holds one random day per season for each
# listing_id and is what the rest of the notebook works with.

# Finally, drop the calendar columns that are not carried forward: the
# minimum/maximum night limits, the availability flag and the adjusted price.
columns_to_drop = ["minimum_nights", "maximum_nights", "available", "adjusted_price"]
simplified_calendar_data = simplified_calendar_data.drop(columns=columns_to_drop)

In [12]:
# Preview the first rows of the one-sampled-day-per-season calendar table.
simplified_calendar_data.head()

Unnamed: 0,listing_id,date,price,weekday,season
0,5396,2023-09-06,530.0,2,Autumn
1,5396,2023-05-14,126.0,6,Spring
2,5396,2023-07-16,148.0,6,Summer
3,5396,2023-12-26,150.0,1,Winter
4,7397,2023-11-27,130.0,0,Autumn


In [13]:
# Rename 'listing_id' to 'id' so the calendar table shares its key column name
# with the listings table it is merged with next.
# Reassigning (instead of inplace=True) avoids mutating a frame that an
# earlier cell has already displayed.
simplified_calendar_data = simplified_calendar_data.rename(columns={'listing_id': 'id'})

In [14]:
# `dataframe1` is the cleaned listings dataset and `simplified_calendar_data`
# holds the sampled calendar rows; both carry the listing key in column 'id'.
# Inner-join them on 'id' so that only listings present in both tables remain.
# Columns that exist in both tables (e.g. 'price') receive pandas' default
# '_x' (listings) / '_y' (calendar) suffixes.
merged_data = dataframe1.merge(simplified_calendar_data, on='id', how='inner')

# 'merged_data' now combines the listing features with the sampled calendar rows.

# Now 'merged_data' contains the combined information from both datasets
# You can proceed with your analysis using this merged dataset

In [15]:
# Summarize the merged frame: number of rows and columns, dtypes and memory usage.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162321 entries, 0 to 162320
Columns: 150 entries, id to season
dtypes: bool(99), datetime64[ns](4), float64(14), int32(1), int64(23), object(9)
memory usage: 77.9+ MB


In [16]:
# Fraction of missing values per column (mean of the boolean null mask).
# NOTE(review): with 150 columns the printed Series is truncated, so only the
# first and last few columns are actually visible in the output below.
print(merged_data.isnull().mean())

id                    0.0
host_id               0.0
host_since            0.0
host_response_time    0.0
host_response_rate    0.0
                     ... 
total_amenities       0.0
date                  0.0
price_y               0.0
weekday               0.0
season                0.0
Length: 150, dtype: float64


In [17]:
# Replace the listings-level price with the calendar price sampled per season:
# drop 'price_x' (from the listings table) and rename 'price_y' (from the
# calendar table) to plain 'price'.
merged_data = (
    merged_data
    .drop(columns='price_x')
    .rename(columns={'price_y': 'price'})
)

In [18]:
# Write the cleaned, merged dataset to an Excel workbook (path is relative to
# the current working directory); index=False leaves the DataFrame index out.
merged_data.to_excel('merged_data.xlsx', index=False)