# Tunisia Real Estate Analysis

## Data exploration:

### Loading the dataset:

In [1]:
# Load the Tunisia real-estate CSV into a data frame; text columns are kept
# as character vectors instead of being converted to factors.
data <- read.csv(
  file = "dataset/tunisia-real-estate.csv",
  stringsAsFactors = FALSE
)

### Viewing the structure of the dataset:

In [2]:
# Preview the leading rows (6 is head()'s default row count)
head(data, n = 6L)

# Compact overview: column names, storage types, and a few sample values
str(data)

Unnamed: 0_level_0,Governorate,Delegation,Locality,Nature,Type.of.Real.Estate,Surface,Price,Inserted.On
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
1,Tunis,Sidi El Bechir,Sidi El Bechir,Sale,2-room apartment,70,120000,06/10/2023
2,Sousse,Akouda,Akouda,Sale,2-room apartment,74,230000,05/10/2023
3,Tunis,La Marsa,La Marsa,Rental,Houses,400,6500,30/10/2023
4,Tunis,Carthage,Carthage,Rental,Surfaces,180,1400,02/11/2023
5,Sousse,Hammam Sousse,Hammam Sousse,Sale,3-room apartment,145,160000,26/06/2022
6,Tunis,La Marsa,La Marsa,Rental,2-room apartment,120,1500,18/09/2023


'data.frame':	25317 obs. of  8 variables:
 $ Governorate        : chr  "Tunis" "Sousse" "Tunis" "Tunis" ...
 $ Delegation         : chr  "Sidi El Bechir" "Akouda" "La Marsa" "Carthage" ...
 $ Locality           : chr  "Sidi El Bechir" "Akouda" "La Marsa" "Carthage" ...
 $ Nature             : chr  "Sale" "Sale" "Rental" "Rental" ...
 $ Type.of.Real.Estate: chr  "2-room apartment" "2-room apartment" "Houses" "Surfaces" ...
 $ Surface            : num  70 74 400 180 145 ...
 $ Price              : num  120000 230000 6500 1400 160000 1500 300 570000 150 650 ...
 $ Inserted.On        : chr  "06/10/2023" "05/10/2023" "30/10/2023" "02/11/2023" ...


### Summary statistics:

In [3]:
# Quartiles, mean, and range of the numeric Price column
summary(data[["Price"]])

# Number of rows recorded for each governorate
table(data[["Governorate"]])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       2     1500   100000   422246   428000 16000000 


     Ariana        Beja   Ben arous     Bizerte       Gabes       Gafsa 
       4605          86        2458         500          54          34 
   Jendouba    Kairouan   Kasserine      Kebili      Le Kef      Mahdia 
         35          18          13           3          69          95 
    Manouba    Medenine    Monastir      Nabeul        Sfax Sidi bouzid 
        512         153         167        5878         452          22 
    Siliana      Sousse   Tataouine      Tozeur       Tunis    Zaghouan 
         25        1378           6           9        8561         184 

## Data cleaning:

### Checking for missing values:

In [4]:
# TRUE when at least one cell anywhere in the data frame is NA
anyNA(data)

# Count of NA cells in each column
colSums(is.na(data))

### Checking for duplicates:

In [5]:
# Collect every row that repeats an earlier row; the first occurrence of a
# repeated row is not flagged by duplicated() and so is not included here.
duplicated_rows <- data[which(duplicated(data)), , drop = FALSE]

### Removing duplicates:

In [6]:
# Keep only the first occurrence of each distinct row
data <- data[!duplicated(data), , drop = FALSE]

### Converting data types:

In [7]:
# Convert columns to appropriate data types.

# Price is coerced to numeric so later summaries and plots always see a
# number, even if the CSV reader had returned it as text.
data$Price <- as.numeric(data$Price)

# Inserted.On is stored as day/month/year text (e.g. "30/10/2023"), so it has
# to be parsed with "%d/%m/%Y"; a year-first timestamp format would not match
# and would turn every value into NA. The time zone is pinned to UTC so the
# result does not depend on the machine's locale settings.
inserted_on <- as.POSIXct(data$Inserted.On, format = "%d/%m/%Y", tz = "UTC")

# Surface parsing failures instead of silently carrying NA dates forward.
if (any(is.na(inserted_on) & !is.na(data$Inserted.On))) {
  warning(
    "Some Inserted.On values did not match the dd/mm/YYYY format.",
    call. = FALSE
  )
}
data$Inserted.On <- inserted_on

## Descriptive Analysis:

### Importing libraries:

In [11]:
library(ggplot2)

"package 'ggplot2' was built under R version 4.3.3"


### Summary Statistics for Price Variable:

In [8]:
# Quartiles, mean, and range of Price
summary(data[["Price"]])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
       2     1500   100000   422246   428000 16000000 

### Frequency Table for Categorical Variables (Governorate, Delegation, Locality):

In [9]:
# Row counts per governorate, stored for reuse and printed
table_governorate <- table(data[["Governorate"]])
print(table_governorate)

# Row counts per delegation, stored for reuse and printed
table_delegation <- table(data[["Delegation"]])
print(table_delegation)

# Row counts per locality, stored for reuse and printed
table_locality <- table(data[["Locality"]])
print(table_locality)


     Ariana        Beja   Ben arous     Bizerte       Gabes       Gafsa 
       4605          86        2458         500          54          34 
   Jendouba    Kairouan   Kasserine      Kebili      Le Kef      Mahdia 
         35          18          13           3          69          95 
    Manouba    Medenine    Monastir      Nabeul        Sfax Sidi bouzid 
        512         153         167        5878         452          22 
    Siliana      Sousse   Tataouine      Tozeur       Tunis    Zaghouan 
         25        1378           6           9        8561         184 

                Agareb             Ain Draham           Ain Zaghouan 
                     6                      2                   1245 
                Akouda           Ariana Ville               Bab Bhar 
                   270                   2122                    788 
            Bab Souika                 Bargou              Beja Nord 
                    67                      1                   

### Bar Plots for Categorical Variables (Governorate, Delegation, Locality):

In [24]:
# Bar charts of row counts per Governorate, Delegation, and Locality.
# Each chart is kept in a plot_* variable and also written as a PNG file
# under plots/.

# ggsave() does not reliably create a missing output folder, so make sure
# plots/ exists before the first save (a no-op when it is already there).
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# Bar plot for Governorate
plot_governorate <- ggplot(data, aes(x = Governorate)) +
  geom_bar() +
  ggtitle("Frequency of Real Estate Transactions by Governorate") +
  xlab("Governorate") +
  ylab("Frequency")

# Save the bar plot for Governorate as an image
ggsave("plots/bar_plot_governorate.png", plot_governorate, width = 10, height = 6)

# Bar plot for Delegation
plot_delegation <- ggplot(data, aes(x = Delegation)) +
  geom_bar() +
  ggtitle("Frequency of Real Estate Transactions by Delegation") +
  xlab("Delegation") +
  ylab("Frequency")

# Save the bar plot for Delegation as an image
ggsave("plots/bar_plot_delegation.png", plot_delegation, width = 10, height = 6)

# Bar plot for Locality; the x-axis labels are rotated 90 degrees and the
# title/axis-title fonts are set explicitly.
plot_locality <- ggplot(data, aes(x = Locality)) +
  geom_bar() +
  ggtitle("Frequency of Real Estate Transactions by Locality") +
  xlab("Locality") +
  ylab("Frequency") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_text(size = 7, face = "bold"),
    axis.title.y = element_text(size = 10, face = "bold"),
    plot.title = element_text(size = 12, face = "bold")
  )

# Save the bar plot for Locality as an image
ggsave("plots/bar_plot_locality.png", plot_locality, width = 10, height = 6)


### Box Plot for Numerical Variable (Price) by Governorate:

In [22]:
# Box plot of Price for each Governorate, with reduced text sizes.
# NOTE(review): Price appears to mix Sale and Rental rows (see the Nature
# column), which are on very different scales -- consider splitting or
# faceting by Nature; confirm with the analysis owner.
plot_price_governorate <- ggplot(data, aes(x = Governorate, y = Price)) +
  geom_boxplot() +
  ggtitle("Price Distribution by Governorate") +
  xlab("Governorate") +
  ylab("Price") +
  scale_y_continuous(labels = scales::comma) +  # comma-separated tick labels
  theme(
    axis.text = element_text(size = 7),    # text size for axis tick labels
    axis.title = element_text(size = 10),  # text size for axis titles
    plot.title = element_text(size = 15)   # text size for the plot title
  )

# ggsave() does not reliably create a missing output folder, so make sure
# plots/ exists before saving (a no-op when it is already there).
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# Save the box plot with custom dimensions (width = 12 inches, height = 8 inches)
ggsave("plots/box_plot_price_governorate.png", plot_price_governorate, width = 12, height = 8)
