<a href="https://colab.research.google.com/github/AzlinRusnan/Data-Mining/blob/main/Famous_Local_Brand_Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
library(rvest)
library(dplyr)
library(stringr)

In [3]:
pages <- paste0('https://www.rrbyrizmanruzaini.com/collections/rr-womenswear?page=', 0:2)

In [4]:
# Function to determine the category based on the product name
get_category <- function(product_name) {
  if (grepl("Kurung Pahang", product_name)) {
    "Kurung Pahang"
  } else if (grepl("Kurung Modern", product_name)) {
    "Kurung Moden"
  } else if (grepl("Kurung Kedah", product_name)) {
    "Kurung Kedah"
  } else if (grepl("Set Basic", product_name)) {
    "Set Basic"
  } else {
    "Other"
  }
}

In [5]:
# Function to scrape product names, before sale prices, and after sale prices
Price <- function(page) {
  url <- read_html(page)

  # Extracting product names
  product_nodes <- html_nodes(url, '.grid-product__title') # Adjust if needed
  products <- html_text(product_nodes, trim = TRUE)

  # Extracting prices
  price_nodes <- html_nodes(url, '.grid-product__price')
  prices <- html_text(price_nodes, trim = TRUE)

  # Clean and extract before and after sale prices
  prices_before_sale <- sapply(prices, function(x) {
    if (grepl("Regular price", x)) {
      sub("Regular price\\s+RM\\s*", "", str_extract(x, "Regular price\\s+RM\\s*[0-9,.]+"))
    } else {
      NA
    }
  })
  prices_after_sale <- sapply(prices, function(x) {
    if (grepl("Sale price", x)) {
      sub("Sale priceRM\\s*", "", str_extract(x, "Sale priceRM\\s*[0-9,.]+"))
    } else {
      NA
    }
  })

  # Determining the category for each product
  categories <- sapply(products, get_category)

  # Create a dataframe
  data_frame(Product = products, PriceBeforeSale = prices_before_sale, PriceAfterSale = prices_after_sale, Category = categories)
}

In [9]:
# Apply the function to each page and combine the results
product_data <- do.call(rbind, lapply(pages, Price))
head(product_data)

Product,PriceBeforeSale,PriceAfterSale,Category
<chr>,<chr>,<chr>,<chr>
RR Baju Kurung Pahang Songket in Navy Silver,399.0,239.4,Kurung Pahang
RR Baju Kurung Pahang Songket in Brown Black,399.0,239.4,Kurung Pahang
RR Baju Kurung Pahang Songket in Black Gold,399.0,239.4,Kurung Pahang
RR Baju Kurung Pahang Forest in Green,399.0,239.4,Kurung Pahang
RR Baju Kurung Pahang Forest in Brown,399.0,239.4,Kurung Pahang
RR Baju Kurung Modern Pleated Ivy in Light Blue,439.0,263.4,Kurung Moden


In [10]:
library(dplyr)
library(ggplot2)
library(RColorBrewer)

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
product <- read.csv(file.choose())
str(product)
summary(product)

In [12]:
##################################################
        ## Analysis by Category ##
##################################################
categories_analysis_actual <- product_data %>%
  filter(Category %in% c('Kurung Pahang', 'Kurung Moden')) %>%
  group_by(Category) %>%
  summarise(
    Count = n(),
    Average_PriceBeforeSale = mean(PriceBeforeSale, na.rm = TRUE),
    Average_PriceAfterSale = mean(PriceAfterSale, na.rm = TRUE)
  ) %>%
  arrange(desc(Count))

# Calculating the price reduction percentage
categories_analysis_actual$Price_Reduction_Percent <- round(
  (1 - (categories_analysis_actual$Average_PriceAfterSale / categories_analysis_actual$Average_PriceBeforeSale)) * 100, 2)

head(categories_analysis_actual)


[1m[22m[36mℹ[39m In argument: `Average_PriceBeforeSale = mean(PriceBeforeSale, na.rm = TRUE)`.
[36mℹ[39m In group 1: `Category = "Kurung Moden"`.
[33m![39m argument is not numeric or logical: returning NA


Category,Count,Average_PriceBeforeSale,Average_PriceAfterSale,Price_Reduction_Percent
<chr>,<int>,<dbl>,<dbl>,<dbl>
Kurung Moden,63,,,
Kurung Pahang,62,,,
