# Notebook cell: exploratory analysis of the ggplot2 diamonds dataset
# Load necessary libraries
library(ggplot2)     # for visualization
library(dplyr)       # for data manipulation
library(caret)       # for normalization
library(tidyr)       # for tidying data
library(ggcorrplot)  # for the correlation heatmap

# Load the diamonds dataset that ships with the ggplot2 package
data("diamonds", package = "ggplot2")

# 1. Data Summarization
# Compact overview: dimensions, column types and the first few values
str(diamonds)

# Per-column summary: min/quartiles/mean/max for numeric columns and
# level counts for the factor columns
summary(diamonds)
# 2. Data Visualization
# a. Histogram of the (numerical) price variable, split into 30 bins
diamonds %>%
  ggplot(mapping = aes(x = price)) +
  geom_histogram(color = "black", fill = "skyblue", bins = 30) +
  labs(title = "Distribution of Diamond Prices") +
  theme_minimal()

# Insight:
# - The price distribution is right-skewed, with most diamonds being priced lower.
# - A small number of diamonds are very expensive, creating a long tail on the right side.

# b. Price against carat; points are drawn semi-transparent (alpha = 0.1)
#    so that heavily overplotted regions remain readable
diamonds %>%
  ggplot(mapping = aes(x = carat, y = price)) +
  geom_point(color = "blue", alpha = 0.1) +
  labs(title = "Price vs Carat") +
  theme_minimal()

# Insight:
# - There is a strong positive correlation between carat and price.
# - As the carat size increases, the price of diamonds increases significantly.

# c. Boxplot for price by cut
# The fill colour repeats the grouping already shown on the x-axis, so the
# fill legend is suppressed: it adds no information and narrows the panel.
ggplot(diamonds, aes(x = cut, y = price, fill = cut)) +
  geom_boxplot(show.legend = FALSE) +
  ggtitle("Price Distribution by Cut Quality") +
  theme_minimal()

# Insight:
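# - Median price does not rise with cut quality: "Ideal" cuts have the lowest median price, while "Fair" and "Premium" cuts have the highest.
# - This is largely because cut is confounded with carat: lower-quality cuts tend to be larger stones, so cut should be compared at a fixed carat size.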

# d. Bar plot for count of diamonds by color
# Each bar is filled by its own color grade, which the x-axis already labels,
# so the redundant fill legend is suppressed.
ggplot(diamonds, aes(x = color, fill = color)) +
  geom_bar(show.legend = FALSE) +
  ggtitle("Count of Diamonds by Color") +
  theme_minimal()

# Insight:
# - G is the most common color grade, followed by E, F, and H; D and I are less common.
# - J is the rarest grade. The dataset only contains grades D (best) through J (worst), so there is no grade Z.

# 3. Data Normalization

# Standardize the numerical variables (price, carat, depth, table, x, y, z)
# Step 1: keep only the numerical columns, in this order
numerical_columns <- select(
  diamonds,
  all_of(c("price", "carat", "depth", "table", "x", "y", "z"))
)

# Step 2: let caret estimate the centering and scaling parameters per column
pre_process <- preProcess(
  x = numerical_columns,
  method = c("center", "scale")
)

# Step 3: apply the fitted transformation to the same columns
normalized_data <- predict(pre_process, newdata = numerical_columns)

# Insight:
# - Normalizing data ensures that all numerical variables are on the same scale.
# - This step is useful for algorithms that are sensitive to feature scaling (e.g., clustering, PCA).

# 4. Correlation Analysis

# Pairwise correlations between the selected numerical variables
# (cor() uses the Pearson method by default)
cor_matrix <- cor(numerical_columns)

# Plot the lower triangle of the correlation matrix as a labelled heatmap.
# ggcorrplot is attached with the other libraries at the top of the script.
# colors = c(low, mid, high): red for negative, white for zero, green for
# positive correlations.
ggcorrplot(cor_matrix, lab = TRUE, type = "lower", lab_size = 3,
           title = "Correlation Matrix of Numerical Variables",
           colors = c("red", "white", "green"))

# Insight:
# - Strong correlations are observed between 'carat' and 'price' (0.92).
# - 'x', 'y', and 'z' are highly correlated with each other and with 'carat', which is expected because they are the diamond's physical dimensions (length, width, and depth in mm).
# - 'depth' and 'table' have weaker correlations with other variables.

# 5. Handling Missing Data (if applicable)
# Total number of NA cells across the whole data frame
sum(is.na(diamonds))

# NA count per column, to locate where any gaps are
colSums(is.na(diamonds))

# Missing measurements can also be coded as 0 rather than NA: a diamond cannot
# have a zero length (x), width (y) or depth (z), so count the rows where any
# of the three dimensions is recorded as 0.
with(diamonds, sum(x == 0 | y == 0 | z == 0, na.rm = TRUE))

# Insight:
# - There are no NA values in the diamonds dataset. However, a handful of rows record 0 for x, y, or z, which is physically impossible and effectively marks a missing measurement.

# 6. Outlier Detection (optional)
# One boxplot over all prices: the constant x value "" places every row in a
# single box, and points drawn beyond the whiskers are candidate outliers
diamonds %>%
  ggplot(mapping = aes(x = "", y = price)) +
  geom_boxplot(fill = "salmon") +
  labs(title = "Outlier Detection for Price") +
  theme_minimal()

# Insight:
# - There are potential outliers in the price variable, but they could represent rare high-value diamonds.
# - We can choose to exclude or keep these outliers based on the context of the analysis.

# Final Insight:
# - The diamonds dataset contains rich information that can help in understanding the factors that influence diamond prices.
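# - Carat size is the dominant determinant of price. Cut is also associated with price, but the raw comparison is confounded by carat (better cuts tend to be smaller stones), so its effect should be judged at a fixed carat size.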
# - The dataset is free of missing values, but we do have potential outliers in price, which could be high-value diamonds or errors.
# - Normalizing the data helps to standardize the features, especially when using machine learning algorithms that require this step.
