# Explore here

### 1. Problem Statement and Data Collection

#### 1.1 The problem
We'll be analysing a dataset containing Airbnb's listings for the city of New York during 2019 in order to understand if we can get any useful information from it.


In [None]:
# 1.2 Data Collection

import pandas as pd

# Location of the listings CSV, kept in one place so it is easy to change.
# NOTE(review): "localhost" looks like a redacted placeholder rather than a real
# path or URL - confirm the actual dataset location before running.
DATA_SOURCE = "localhost"

# Load the raw listings and preview the first rows
total_data = pd.read_csv(DATA_SOURCE)

total_data.head()

### 2. Exploration and data cleaning

We'll try to understand the data and its main features.

In [None]:
# Obtain the dimensions of the dataset as a (rows, columns) tuple
total_data.shape

In [None]:
# Obtain information about data types and non-null values
# (columns whose non-null count is below the row count contain missing entries)
total_data.info()

First deductions

Let's do a quick search for duplicates.

In [None]:
# Count the rows that repeat an earlier row in every column
total_data.duplicated().sum()

Deductions

In [None]:
# Drop the 'id' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once 'id' is gone.
total_data = total_data.drop(columns=['id'], errors='ignore')
total_data.info()
total_data.head()

Now that our dataset seems to be neat and clean, let's move on to the next part:

### 3. Analysis of univariate variables
#### 3.1 Analysis on categorical variables
Let's plot the categorical variables in our dataset and see what they can show us.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Number of listings per neighbourhood, busiest first.
# size() counts rows, so listings whose 'name' is missing are still counted
# (counting the 'name' column silently skipped them).
abnb_data_neighborhoods = (
    total_data.groupby('neighbourhood')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
# Keep the top quarter as an independent frame (no view of the full table)
abnb_data_neighborhoods = abnb_data_neighborhoods.head(len(abnb_data_neighborhoods) // 4).copy()

fig, axs = plt.subplots(3, figsize = (10, 7))

# Listings per neighbourhood group
sns.histplot(ax = axs[0], data = total_data, x = "neighbourhood_group").set(xlabel = 'Neighborhood Group')

# Listings per neighbourhood. The bars are keyed on the full neighbourhood name
# and only the tick labels are shortened to 8 characters: truncating the data
# itself can make two different neighbourhoods share a label, and barplot would
# then merge them into a single aggregated bar.
sns.barplot(ax = axs[1], data = abnb_data_neighborhoods, x = "neighbourhood", y = "count").set(xlabel = 'Neighborhood')
axs[1].set_xticks(range(len(abnb_data_neighborhoods)))
axs[1].set_xticklabels(abnb_data_neighborhoods['neighbourhood'].str[:8])
axs[1].tick_params(labelrotation=45, labelsize=8)

# Listings per room type
sns.histplot(ax = axs[2], data = total_data, x = "room_type").set(xlabel = 'Room Type')

# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

Deductions

#### 3.2 Analysis on numerical variables
Now let's plot the numerical variables. For this part, let's focus on the columns that have a numeric data type in the dataframe.

In [None]:
# Each numeric column gets a histogram with its box plot directly underneath.
# Entries are (column, histogram row, grid column); the box plot goes one row lower.
numeric_panels = [
    ("latitude", 0, 0),
    ("longitude", 0, 1),
    ("price", 0, 2),
    ("minimum_nights", 0, 3),
    ("number_of_reviews", 2, 0),
    ("calculated_host_listings_count", 2, 1),
    ("availability_365", 2, 2),
]

fig, axs = plt.subplots(4, 4, figsize = (15, 10))

# The last grid column of the lower band has no variable assigned to it
for unused_ax in (axs[2, 3], axs[3, 3]):
    fig.delaxes(unused_ax)

for column, hist_row, grid_col in numeric_panels:
    # Only the first histogram keeps its y-axis label; the rest clear both labels
    if column == "latitude":
        cleared_labels = {"xlabel": None}
    else:
        cleared_labels = {"xlabel": None, "ylabel": None}

    sns.histplot(ax = axs[hist_row, grid_col], data = total_data, x = column).set(**cleared_labels)
    sns.boxplot(ax = axs[hist_row + 1, grid_col], data = total_data, x = column).set_box_aspect(0.2)

# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

Deductions

### 4. Analysis of multivariate variables
Now let's see how the variables relate to each other. 

#### 4.1 Numerical-Numerical
Let's start with the numerical variables.

In [None]:
# Variable pairs to compare: a regression scatter on the top row and the pair's
# correlation matrix as a heatmap directly beneath it.
numeric_pairs = [("latitude", "longitude"), ("availability_365", "price")]

fig, axis = plt.subplots(2, 2, figsize = (10, 7))

for col_idx, (x_var, y_var) in enumerate(numeric_pairs):
    sns.regplot(ax = axis[0, col_idx], data = total_data, x = x_var, y = y_var)
    pair_corr = total_data[[x_var, y_var]].corr()
    sns.heatmap(pair_corr, annot = True, fmt = ".2f", ax = axis[1, col_idx], cbar = False)

# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

Deductions

In [None]:
# Predictors to set against price: scatter with regression line on top,
# 2x2 correlation heatmap below, one grid column per predictor.
price_predictors = ["number_of_reviews", "calculated_host_listings_count"]

fig, axis = plt.subplots(2, 2, figsize = (10, 7))

for position, predictor in enumerate(price_predictors):
    scatter_ax = axis[0, position]
    heatmap_ax = axis[1, position]
    sns.regplot(ax = scatter_ax, data = total_data, x = predictor, y = "price")
    sns.heatmap(total_data[[predictor, "price"]].corr(), annot = True, fmt = ".2f", ax = heatmap_ax, cbar = False)

# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()

Deductions

#### 4.2 Categorical-categorical
Let's have a look at the categorical variables and see what we can find out.

In [None]:
# Number of listings per room type, split by neighbourhood group.
# (An unused groupby that stored row counts under the name 'avg_price' was
# removed from this cell; section 4.3 builds abnb_data_avg_price1 from scratch.)
sns.countplot(data = total_data, x = "room_type", hue = "neighbourhood_group")

plt.tight_layout()

plt.show()

Deductions

#### 4.3 Categorical-numerical
Let's see how the categorical variables relate to the numerical ones.

In [None]:
# Mean price per neighbourhood group
abnb_data_avg_price1 = (
    total_data.groupby('neighbourhood_group')['price']
    .mean()
    .reset_index(name='avg_price')
)
# Mean price per room type
abnb_data_avg_price2 = (
    total_data.groupby('room_type')['price']
    .mean()
    .reset_index(name='avg_price')
)

fig, axis = plt.subplots(2, figsize = (15, 7))
group_ax, room_ax = axis

sns.barplot(ax = group_ax, data = abnb_data_avg_price1, x = "neighbourhood_group", y = "avg_price")
# The lower panel hides its y label
room_bars = sns.barplot(ax = room_ax, data = abnb_data_avg_price2, x = "room_type", y = "avg_price")
room_bars.set(ylabel = None)

plt.tight_layout()

plt.show()

Deductions

#### 4.4 Correlation analysis

Let's take a look at how the variables correlate with each other wherever it makes sense:

In [None]:
# Work on an independent copy: a plain assignment would only alias total_data,
# so writing the integer codes below would overwrite its text columns as well.
abnb_data_factorised = total_data.copy()

# Encode each categorical column as integer codes so it can enter the correlation matrix
for categorical_col in ("neighbourhood", "neighbourhood_group", "room_type"):
    abnb_data_factorised[categorical_col] = pd.factorize(total_data[categorical_col])[0]

fig, axis = plt.subplots(figsize = (10, 6))

# Correlation between the location-related variables, room type and price
sns.heatmap(abnb_data_factorised[[
    "neighbourhood_group", 
    "neighbourhood", 
    "latitude", 
    "longitude", 
    "room_type",
    "price"
    ]].corr(), annot = True, fmt = ".2f")

plt.tight_layout()


plt.show()

Deductions

Now, let's do the same with the other variables:

In [None]:
# Encode the remaining text/date columns as integer codes.
# pd.factorize maps missing values to -1, so these columns hold no NaN afterwards.
# room_type is not re-encoded here: the previous cell already converted it.
for text_col in ("name", "host_name", "last_review"):
    abnb_data_factorised[text_col] = pd.factorize(total_data[text_col])[0]

fig, axis = plt.subplots(figsize = (10, 6))

# "price" used to be listed twice, which duplicated its row and column in the
# matrix; the second slot now holds minimum_nights, the one numeric column from
# section 3.2 that appeared in neither correlation heatmap.
sns.heatmap(abnb_data_factorised[[
    "name", 
    "host_name",  
    "room_type",
    "price",
    "minimum_nights",
    "number_of_reviews",
    "last_review",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365"
    ]].corr(), annot = True, fmt = ".2f")

plt.tight_layout()

# Render the figure explicitly, as the other plotting cells do
plt.show()

Deductions

### 5. Feature engineering
#### 5.1 Missing value analysis.

Let's analyse the null values and see if we can reduce them:

In [None]:
# Flag every row of total_data that has at least one missing value,
# then pull the matching rows out of the factorised frame.
rows_with_nulls = total_data.isna().any(axis=1)
null_values = abnb_data_factorised[rows_with_nulls]
print(null_values.shape)
null_values.head()

Deductions

Find sparse nulls

In [None]:
# Leave the two review columns aside and look for rows that are missing any of
# the remaining fields.
data_without_reviews = abnb_data_factorised.drop(columns=['last_review', 'reviews_per_month'])

# The text columns were factorised in section 4.4, and pd.factorize encodes a
# missing value as -1 instead of NaN, so isna() alone can no longer see those
# gaps. Only the factorised columns are checked for the sentinel: genuinely
# numeric columns may legitimately contain negative values.
factorised_cols = ['name', 'host_name', 'neighbourhood', 'neighbourhood_group', 'room_type']
missing_mask = data_without_reviews.isna().any(axis=1)
missing_mask |= data_without_reviews[factorised_cols].eq(-1).any(axis=1)

null_values = data_without_reviews[missing_mask]
print(null_values.shape)
null_values.head()

Deductions

Drop sparse nulls

In [None]:
# Remove the rows flagged in the previous cell.
# Reassigning with errors='ignore' (rather than inplace=True) keeps the cell
# re-runnable: a second run no longer fails on labels that are already gone.
abnb_data_factorised = abnb_data_factorised.drop(index=null_values.index, errors='ignore')
abnb_data_factorised.shape

Deductions

Drop null columns

In [None]:
# Remove the two review columns.
# Reassigning with errors='ignore' (rather than inplace=True) lets the cell be
# re-run without a KeyError once the columns are gone.
abnb_data_factorised = abnb_data_factorised.drop(columns=['last_review', 'reviews_per_month'], errors='ignore')
abnb_data_factorised.info()

#### 5.2 Feature engineering
Rather than adding new features first, let's start by getting rid of those that don't seem to add any value as we saw in step 4.

In [None]:
# Remove the columns judged uninformative in step 4.
# Reassigning with errors='ignore' (rather than inplace=True) lets the cell be
# re-run without a KeyError once the columns are gone.
abnb_data_factorised = abnb_data_factorised.drop(columns=['name', 'host_name', 'latitude', 'longitude'], errors='ignore')
abnb_data_factorised.info()

#### 5.3 Outlier analysis
Now that we have a tidier dataset, we can proceed to study the outliers.

In [None]:
# Only the last expression of a cell is displayed, so the preview has to be
# printed explicitly; otherwise its result is silently discarded.
print(abnb_data_factorised.head())
abnb_data_factorised.describe()

Deductions

In [None]:
# Number of listings whose price is exactly 0
abnb_data_factorised['price'].eq(0).sum()

Since there are very few of them, we can remove them and measure again.

In [None]:
# Keep only the listings with a non-zero price, then recompute the summary
has_nonzero_price = abnb_data_factorised['price'].ne(0)
abnb_data_factorised = abnb_data_factorised.loc[has_nonzero_price]
abnb_data_factorised.describe()

Deductions

Let's represent it in a graph:

In [None]:
# One box plot per numeric column, filling the 2x3 grid row by row
boxplot_columns = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "calculated_host_listings_count",
    "availability_365",
]

fig, axis = plt.subplots(2, 3, figsize = (15, 10))

for panel_ax, column in zip(axis.flat, boxplot_columns):
    sns.boxplot(ax = panel_ax, data = abnb_data_factorised, y = column)

# Five variables on six panels: remove the unused bottom-right one
fig.delaxes(ax = axis[1, 2])

plt.tight_layout()

plt.show()

Deductions

Get limits for outliers:

In [None]:
# For price
price_stats = abnb_data_factorised["price"].describe()

price_iqr = price_stats["75%"] - price_stats["25%"]
upper_limit = price_stats["75%"] + 1.5 * price_iqr
lower_limit = price_stats["25%"] - 1.5 * price_iqr

print(f"The upper and lower limits for finding outliers are {round(upper_limit, 2)} and {round(lower_limit, 2)}, with an interquartile range of {round(price_iqr, 2)}")

Let's see how many values are above the upper limit.

In [None]:
# Listings priced above the IQR upper limit computed in the previous cell
# (used directly instead of a hard-coded copy of that number).
above_upper_limit = abnb_data_factorised["price"] > upper_limit
print(abnb_data_factorised[above_upper_limit].shape)

# ...and how many of those still cost less than 1000. Both conditions are
# combined into one mask: chaining two boolean selections applies a full-length
# mask to the already-filtered frame, which makes pandas warn and reindex it.
below_price_cap = abnb_data_factorised["price"] < 1000
abnb_data_factorised[above_upper_limit & below_price_cap].shape

Deductions

Remove outliers

In [None]:
# Keep the listings priced below 1000 and summarise what is left
under_price_cap = abnb_data_factorised["price"].lt(1000)
abnb_data_factorised = abnb_data_factorised.loc[under_price_cap]
abnb_data_factorised.describe()

#### 5.4 Inference of new features

Let's look again at our dataset to see if we can infer any new features.

In [None]:
# info() prints its report itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
abnb_data_factorised.info()
abnb_data_factorised.head()

Deductions

#### 5.5 Feature scaling
Let's scale our features now

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Every column except the target ('price') is rescaled
num_variables = [column for column in abnb_data_factorised.columns if column != 'price']

# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in section 6, so the test rows influence the scaling -
# consider fitting on the training split only.
scaler = MinMaxScaler()
norm_features = scaler.fit_transform(abnb_data_factorised[num_variables])

# Rebuild a frame on the original row index and put the unscaled target back
total_data_norm = pd.DataFrame(
    norm_features,
    index = abnb_data_factorised.index,
    columns = num_variables,
).assign(price = abnb_data_factorised["price"])
total_data_norm.head()

### 6. Feature Selection
Now that we've scaled the dataset, we can select the most relevant features.

In [None]:
from sklearn.feature_selection import chi2, f_regression, SelectKBest
from sklearn.model_selection import train_test_split

# We divide the dataset into training and test samples.
X = total_data_norm.drop("price", axis = 1)
y = total_data_norm["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# 'price' is a continuous target, so the features are ranked with the univariate
# regression F-test. chi2 is a classification score: it would treat every
# distinct price as a separate class label.
selection_model = SelectKBest(f_regression, k = 4)
selection_model.fit(X_train, y_train)
ix = selection_model.get_support()

# Keep the original row index (as the scaling step does) so the selected
# features stay aligned with y_train / y_test.
X_train_sel = pd.DataFrame(selection_model.transform(X_train), index = X_train.index, columns = X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), index = X_test.index, columns = X_test.columns.values[ix])

X_train_sel.head()