In [None]:
#Necessary imports for the project

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Step 1: Load the rentals dataset, keep the variables of interest, then drop
# incomplete rows and outliers.

RENT_CSV_URL = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv'

Rentals_DF = pd.read_csv(RENT_CSV_URL)
# Names are assigned by position, so this list must stay in the CSV's column order.
Rentals_DF.columns = ['post_id', 'date', 'year', 'nhood', 'city', 'county', 'price', 'beds', 'baths', 'sqft', 'room_in_apt', 'address', 'lat', 'lon', 'title', 'description', 'details']

# Columns that are not used in the analysis.
unused_columns = ['post_id', 'date', 'room_in_apt', 'address', 'lat', 'lon', 'title', 'description', 'details']
rentals_selected = Rentals_DF.drop(columns=unused_columns)

# Remove rows containing any NaN value in the retained columns.
# This must cover 'price' and 'sqft' in particular: a single NaN makes
# stats.zscore return NaN for every row, and NaN > threshold is always False,
# which would silently disable the outlier filter below.
rentals_complete = rentals_selected.dropna()

# Remove outliers -- IMPORTANT: the sqft vs price scatterplot drawn before this
# step showed the data was skewed.

# Absolute Z-scores for 'price' and 'sqft'
z_scores_price = np.abs(stats.zscore(rentals_complete['price']))
z_scores_sqft = np.abs(stats.zscore(rentals_complete['sqft']))

# Z-score cut-off: rows more than 3 standard deviations from the mean are outliers
threshold = 3

# A row is an outlier if either its price or its sqft exceeds the cut-off
outliers_mask = (z_scores_price > threshold) | (z_scores_sqft > threshold)

# Keep only the non-outlier rows. Each stage above has its own name, so this
# cell can be re-run on its own and always produces the same result.
Rentals_DF_clean = rentals_complete[~outliers_mask]

Rentals_DF_clean.head()

In [None]:
# Step 2: Initial data analysis

# Summary statistics. A notebook cell only renders its last expression
# automatically, so the table is shown explicitly with display(), which the
# Jupyter/IPython kernel provides as a builtin.
display(Rentals_DF_clean.describe())

# Univariate analysis: histogram of the 'price' variable
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(Rentals_DF_clean['price'], kde=True, ax=ax)
ax.set_title('Price Distribution')
ax.set_xlabel('Price')
plt.show()

# Price by number of bedrooms: box plot of 'price' for each 'beds' value
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='beds', y='price', data=Rentals_DF_clean, ax=ax)
ax.set_title('Price vs Number of Bedrooms')
ax.set_xlabel('Bedrooms')
ax.set_ylabel('Price')
plt.show()

# Bar plot of listing counts per neighborhood.
# Only the most frequent neighborhoods are shown; plotting all of them is unreadable.
TOP_N_NEIGHBORHOODS = 85
top_neighborhoods = Rentals_DF_clean['nhood'].value_counts().nlargest(TOP_N_NEIGHBORHOODS)
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='nhood', data=Rentals_DF_clean, order=top_neighborhoods.index, ax=ax)
ax.set_title('Neighborhood Counts')
ax.tick_params(axis='x', rotation=90)
plt.show()


In [None]:
# Bivariate analysis

# Price against square footage, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=Rentals_DF_clean, x='sqft', y='price', ax=ax)
ax.set(title='Price vs Square Footage', xlabel='Square Footage', ylabel='Price')
plt.show()

# Numeric variables compared against each other in the pair plot
variables_of_interest = ['price', 'beds', 'baths', 'year', 'sqft']

# Pair plot restricted to those columns; the title is nudged above the grid
# (y=1.02).
pairplot_data = Rentals_DF_clean.loc[:, variables_of_interest]
sns.pairplot(pairplot_data)
plt.suptitle("Pair Plot of Price, Bedrooms, Bathrooms, Year, and Sqft", y=1.02)
plt.show()
