In [3]:
# Data cleaning and exploring libraries
import pandas as pd 
import numpy as np

# Data visualization libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn
import plotly.express as px
from plotly.subplots import make_subplots 

import warnings
warnings.filterwarnings("ignore")



# Task 1: Data Loading

In [6]:
# 1
dataset = pd.read_csv ("‪C:\\Users\\danny\\OneDrive\\Desktop\\PYTHON DATA\\AIRBNB DATA ANALYTICS.ipynb-checkpoint-checkpoint.ipynb.csv")

OSError: [Errno 22] Invalid argument: '\u202aC:\\Users\\danny\\OneDrive\\Desktop\\PYTHON DATA\\AIRBNB DATA ANALYTICS.ipynb-checkpoint-checkpoint.ipynb.csv'

In [None]:
# 2
dataset.head(5)

In [None]:
# 3
dataset.dtypes

In [None]:
# 1

## Create list of columns to drop
columns_to_drop = ['host id', 'id', 'country', 'country code']

## Drop columns from dataframe and save in new dataframe
airbnb_dataset =dataset.drop(columns_to_drop, axis=1)

airbnb_dataset.head(5)

# Task 2b: Data Cleaning

# 1. Check for missing values in the dataframe and display the count in ascending order. 
# If the values are missing impute the values as per the data type of the column
# 2. Check whether there are any duplicate values in the dataframe, and if present, remove them
# 3. Display the total number of records in the dataframe after removing the duplicates.

In [None]:
# 1.  Check for missing values in the dataframe and display the count in ascending order.
missing_values = airbnb_dataset.isnull().sum().sort_values(ascending=True)
missing_values

In [None]:
# 1b Impute missing values based on column data types
for column in airbnb_dataset.columns:
    if airbnb_dataset[column].dtype == 'object': 
        
        # Impute missing values with an empty string for object/string columns
        airbnb_dataset[column].fillna('', inplace=True)
    else: 
        # Impute missing values with the mean for numeric columns
        airbnb_dataset[column].fillna(airbnb_dataset[column].mean(), inplace=True)

airbnb_dataset.head(2)

In [None]:
airbnb_dataset=airbnb_dataset.drop(['house_rules', 'license', 'last review', 'calculated host listings count', 'availability 365', 'review rate number', 'reviews per month',], axis=1)

In [None]:
airbnb_dataset.head(2)

In [None]:
# 2 Check whether there are any duplicate values in the dataframe and if present remove them.

initial_records = len(airbnb_dataset)
initial_records

In [None]:
airbnb_dataset.drop_duplicates(keep=False, inplace=True)

In [None]:
airbnb_dataset.duplicated().sum()

In [None]:
# Display the total number of records in the dataframe after removing the duplicates.

airbnb_dataset.drop_duplicates(inplace=True)
final_records = len(airbnb_dataset)

final_records

# Task 3: Data Transformation 

In [None]:
# 1
airbnb_dataset.rename(columns={'availability 365': 'days_booked'}, inplace=True)
airbnb_dataset.head(5)

In [None]:
# Convert all column names to lowercase and replace the spaces with an underscore "_"
airbnb_dataset.columns = airbnb_dataset.columns.str.lower().str.replace(' ', '_')
airbnb_dataset.head(3)

In [None]:
## Remove the dollar sign and comma from the columns. If necessary, convert these two columns to the appropriate data type.
airbnb_dataset['price'] = airbnb_dataset['price'].replace({'\$': '', ',': ''}, regex=True)


airbnb_dataset['service_fee'] = airbnb_dataset['service_fee'].replace({'\$': '', ',': ''}, regex=True)


airbnb_dataset.head(2)

# Task 4: Exploratory Data Analysis

In [None]:
# List the count of various room types available in the dataset.
# Which room type has the most strict cancellation policy?
# List the average price per neighborhood group, and highlight the most expensive neighborhood to rent from

In [None]:
# List the count of various room types available with Airbnb
room_type_counts = airbnb_dataset['room_type'].value_counts()
room_type_counts

In [None]:
# # Which room type adheres to more strict cancellation policy

# Count by strict cancellation policy = strict by room type and find the mean occurrence across room types
average_cancellation = airbnb_dataset.groupby('room_type')['cancellation_policy'].apply(lambda x: (x == 'strict').mean())

room_type_most_strict = average_cancellation.idxmax()

room_type_most_strict

In [None]:
## List the average prices by neighborhood, sort most expensive to least
airbnb_dataset['price'] = pd.to_numeric(airbnb_dataset['price'], errors='coerce')
average_price_neighbourhood = airbnb_dataset.groupby('neighbourhood')['price'].mean().sort_values(ascending=False)

average_price_neighbourhood

In [None]:
# Remove any empty values of neighbourhood group or neighbourhood which might dispute the analysis later on
airbnb_clean = airbnb_dataset[(airbnb_dataset['neighbourhood_group'] != '') & (airbnb_dataset['neighbourhood'] != '')]

## List the average prices by neighborhood group, sort most expensive to least
average_price_neighbourhood_group = airbnb_clean.groupby('neighbourhood_group')['price'].mean().sort_values(ascending=False)
average_price_neighbourhood_group


In [None]:
# Get the most expensive neighborhood
most_expensive_neighbourhood = average_price_neighbourhood.idxmax()
most_expensive_neighbourhood

In [None]:
# Get the most expensive neighborhood group
most_expensive_neighbourhood_group = average_price_neighbourhood_group.idxmax()
most_expensive_neighbourhood_group

# Task 5a: Data Visualization 

In [None]:
# 1. Create a horizontal bar chart to display the top 10 most expensive neighborhoods in the dataset.Create another chart with the 10 cheapest neighborhoods in the dataset.

# Top 10 most expensive neighbourhoods
top_10 = average_price_neighbourhood.head(10)
top_10

In [None]:
# Display horizontal bar chart using seaborn
plt.figure(figsize=(10, 8))
plt.figure(figsize=(10, 6))
plt.barh(top_10.index, top_10.values, color='blue')

# Add Labels
plt.xlabel('Average Price ($)')
plt.ylabel('Neighborhoods')
plt.title('Top 10 Most Expensive Neighborhoods')
plt.gca().invert_yaxis()  # Invert the y-axis to display highest price at the top

# Add details
plt.xticks(rotation=45, ha='right')
plt.yticks(fontsize=10)
plt.grid(axis='x', linestyle='--')

# Add data labels to the bars
for index, value in enumerate(top_10.values):
    plt.text(value, index, f'${value:.2f}', va='center')

# Display Chart
plt.tight_layout()
plt.show()

In [None]:
# Display the 10 cheapest neighborhoods
bottom_10 = average_price_neighbourhood.tail(10)
bottom_10

In [None]:
# Create the horizontal bar chart, # Invert the y-axis to display lowest price at the bottom
plt.figure(figsize=(10, 6))
plt.barh(bottom_10.index, bottom_10.values, color='orange')
plt.xlabel('Average Price ($)')
plt.ylabel('Neighborhoods')
plt.title('Top 10 Cheapest Neighborhoods')
plt.gca().invert_yaxis()  
# Add details
plt.xticks(rotation=45, ha='right')
plt.yticks(fontsize=10)
plt.grid(axis='x', linestyle='--')

# Add data labels to the bars
for index, value in enumerate(bottom_10.values):
    plt.text(value, index, f'${value:.2f}', va='center')

# Display Chart
plt.tight_layout()
plt.show()



In [None]:
# Create a box and whisker chart
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='room_type', y='price', data=airbnb_dataset)
plt.xlabel('Room Type')
plt.ylabel('Price')
plt.title('Price Distribution of Listings by Room Type')
plt.xticks(rotation=45)

# Add label for median
medians = airbnb_dataset.groupby('room_type')['price'].median()
room_types = airbnb_dataset['room_type'].unique()

for xtick, label in enumerate(ax.get_xticklabels()):
    ax.text(xtick, medians[xtick] - 100, f"Median: ${medians[xtick]:.2f}", 
            ha='center', va='top', fontsize=10)

plt.xticks(range(len(room_types)), room_types)  # Set custom x-tick labels
plt.tight_layout()
plt.show()

# Task 5b: Data Visualization

In [None]:
plt.scatter( x = 'service_fee', y = 'price', data = airbnb_dataset) 
# Add details
plt.xlabel('service Fee')
plt.ylabel('price')
plt.title('Relationship between Service Fee and Room Price')

# NOETEabs
# There is a positive relationship between the room price to the service fee shown in the scatter plot after converting the service fee to numeric values and removing the non-numeric alues to NaN. It seems the higher the room price, the higher the service fee is.

# Task 5c: Data Visualization

In [None]:
# Are verified host generally more expensive?
# Create a box plot or violin plot to compare prices for verified and unverified users
plt.figure(figsize=(10, 6))
ax = sns.violinplot(x='host_identity_verified', y='price', data=verified_airbnb)
plt.xlabel('Host Identity Verified')
plt.ylabel('Price')
plt.title('Price Distribution for Verified and Unverified Users')
plt.xticks([0, 1], ['Unverified', 'Verified'])

# Add labels for median and interquartile ranges
medians = verified_airbnb.groupby('host_identity_verified')['price'].median()
q1 = verified_airbnb.groupby('host_identity_verified')['price'].quantile(0.25)
q3 = verified_airbnb.groupby('host_identity_verified')['price'].quantile(0.75)

for xtick, label in enumerate(ax.get_xticklabels()):
    ax.annotate(f"Median: ${medians[xtick]:.2f}", (xtick, medians[xtick]), 
                xytext=(5, 5), textcoords='offset points', ha='center', va='bottom', fontsize=10)
    ax.annotate(f"IQR: ${q1[xtick]:.2f} - ${q3[xtick]:.2f}", (xtick, q3[xtick]), 
                xytext=(5, -110), textcoords='offset points', ha='center', va='bottom', fontsize=10)


plt.tight_layout()
plt.show()

In [None]:
# Group the data by 'neighbourhood_group' and 'host_identity_verified', and count the occurrences
grouped_hosts = verified_airbnb.groupby(['neighbourhood_group', 'host_identity_verified']).size().unstack()

# Create a heatmap to show the spread of verified and unverified hosts across neighborhood groups
plt.figure(figsize=(10, 6))
sns.heatmap(grouped_hosts, annot=True, cmap='YlGnBu', linewidths=0.5)

plt.xlabel('Host Identity')
plt.ylabel('Neighborhood Group')
plt.title('Spread of Verified and Unverified Hosts Across Neighborhood Groups')
plt.xticks(ticks=[0.5, 1.5], labels=['Unverified', 'Verified'])
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

In [2]:
pip install plotly.express

Collecting plotlyNote: you may need to restart the kernel to use updated packages.

  Downloading plotly-5.24.1-py3-none-any.whl.metadata (7.3 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
   ---------------------------------------- 0.0/19.1 MB ? eta -:--:--
   ----------------- ---------------------- 8.1/19.1 MB 50.4 MB/s eta 0:00:01
   ---------------------------------------  18.9/19.1 MB 56.7 MB/s eta 0:00:01
   ---------------------------------------- 19.1/19.1 MB 50.1 MB/s eta 0:00:00
Downloading tenacity-9.0.0-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.24.1 tenacity-9.0.0
