# Uncovering the Drivers of Housing Prices in Beijing: The Influence of Location and Time

In [1]:
# Import packages
library(tidyverse)
library(lubridate) 
library(repr)
library(corrplot)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
corrplot 0.92 loaded



## Read and Load Data Set

In [2]:
# Load the dataset
housing_data <- read_csv("dsci310-group02-project/data/new.csv", locale = locale(encoding = "UTF-8")) %>% mutate(floor = str_trim(str_extract(floor,"( .*)"), side = "both"))

# Explore the structure and summary statistics of the dataset
head(housing_data)
tail(housing_data)
summary(housing_data)

ERROR: Error: 'dsci310-group02-project/data/new.csv' does not exist in current working directory ('/Users/prabhjotsingh/dsci310-group02-project').


## Cleaning and Wrangling Data

In [None]:
# Remove some not useful columns
housing_data <- select(housing_data, -url, -id, -Cid)

# Convert character variables to appropriate types
housing_data <- housing_data %>%
  mutate(tradeTime = as.Date(tradeTime),
         livingRoom = as.integer(livingRoom),
         drawingRoom = as.integer(drawingRoom),
         bathroom = as.integer(bathRoom),
         constructionTime = as.integer(constructionTime))

# Also remove variables with Chinese characters
housing_data <- select(housing_data, -floor)

# Check for missing values
summary(housing_data)

In [None]:
# Visualize distribution of numeric variables using histograms
ggplot(housing_data, aes(x = totalPrice)) +
  geom_histogram(binwidth = 1000, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Total Price",
       x = "Total Price (Tens of Thousands RMB)",
       y = "Frequency")

# Explore relationships between variables using scatter plots
ggplot(housing_data, aes(x = square, y = totalPrice)) +
  geom_point() +
  labs(title = "Total Price vs. Square",
       x = "Square",
       y = "Total Price (Tens of Thousands RMB)")

In [None]:
# Computing correlation matrix for numeric variables
numeric_vars <- select(housing_data, where(is.numeric))
corr_matrix <- cor(numeric_vars, use = "complete.obs") # use="complete.obs" handles missing values by using available data

# Visualize the correlation matrix
corrplot(corr_matrix, method = "color", type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45, 
         col = colorRampPalette(c("#6D9EC1", "white", "#E46726"))(200))

In [None]:
# Box plot for the relationship between the number of living rooms and total price
ggplot(housing_data, aes(x = as.factor(livingRoom), y = totalPrice)) +
  geom_boxplot() +
  labs(title = "Total Price by Number of Living Rooms",
       x = "Number of Living Rooms",
       y = "Total Price") +
  theme_minimal()

# Box plot for the relationship between the number of drawing rooms and total price
ggplot(housing_data, aes(x = as.factor(drawingRoom), y = totalPrice)) +
  geom_boxplot() +
  labs(title = "Total Price by Number of Drawing Rooms",
       x = "Number of Drawing Rooms",
       y = "Total Price") +
  theme_minimal()

# Box plot for the relationship between the number of bathrooms and total price
ggplot(housing_data, aes(x = as.factor(bathRoom), y = totalPrice)) +
  geom_boxplot() +
  labs(title = "Total Price by Number of Bathrooms",
       x = "Number of Bathrooms",
       y = "Total Price") +
  theme_minimal()

In [None]:
# Linear regression model: totalPrice ~ .
model <- lm(totalPrice ~ ., data = housing_data)

# Summary of the regression model
summary(model)

In [None]:
# Predicting totalPrice using the model
housing_data$predictedPrice <- predict(model, housing_data)

# Visualization of Actual vs. Predicted Prices
ggplot(housing_data, aes(x = totalPrice, y = predictedPrice)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = "red") +
  labs(title = "Actual vs. Predicted Total Price",
       x = "Actual Total Price",
       y = "Predicted Total Price") +
  theme_minimal()