# **IDS Lab Week 5**

### ***1. Handles missing and invalid values using the vtreat package.***

In [None]:
# Install vtreat only when it is not already available, then attach it.
# Guarding the install avoids a slow, network-dependent reinstall on every
# run of this cell.
if (!requireNamespace("vtreat", quietly = TRUE)) {
  install.packages("vtreat")
}
library(vtreat)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘wrapr’


Loading required package: wrapr


Attaching package: ‘wrapr’


The following object is masked from ‘package:dplyr’:

    coalesce




In [None]:
# Build a reproducible synthetic customer table with 100 rows. Every measured
# column is sampled with replacement from an integer range that has a single
# NA appended, so a missing value can occasionally be drawn.
set.seed(123)
data <- local({
  n_obs <- 100L
  with_missing <- function(values) c(values, NA)
  data.frame(
    ID = seq_len(n_obs),
    Age = sample(with_missing(18:75), n_obs, replace = TRUE),
    Income = sample(with_missing(30000:120000), n_obs, replace = TRUE),
    Number_of_Vehicles = sample(with_missing(0:3), n_obs, replace = TRUE),
    Gas_Usage = sample(with_missing(50:200), n_obs, replace = TRUE)
  )
})


In [None]:
# Preview the first 10 rows of the generated data.
head(data, 10)

Unnamed: 0_level_0,ID,Age,Income,Number_of_Vehicles,Gas_Usage
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>
1,1,48,69894,1.0,57
2,2,32,39639,1.0,163
3,3,68,115277,3.0,54
4,4,31,39325,2.0,78
5,5,20,56509,3.0,99
6,6,59,50959,2.0,119
7,7,67,44402,2.0,123
8,8,71,94501,2.0,75
9,9,60,101502,,122
10,10,54,75220,2.0,60


In [None]:
# Design a missing-value treatment plan for the four measured columns.
# ID is not listed in varlist, so the plan does not cover it.
treat_plan <- vtreat::design_missingness_treatment(
  data,
  varlist = c("Age", "Income", "Number_of_Vehicles", "Gas_Usage")
)

In [None]:
# Print the treatment plan: one row per variable the plan will produce.
treat_plan

            origName                  varName    code
1                Age                      Age numeric
2             Income                   Income numeric
3 Number_of_Vehicles       Number_of_Vehicles numeric
4 Number_of_Vehicles Number_of_Vehicles_isBAD  is_bad
5          Gas_Usage                Gas_Usage numeric

In [None]:
# Apply the treatment plan to the data. In the recorded output the missing
# Number_of_Vehicles entries are filled with a numeric value and flagged with
# 1 in the Number_of_Vehicles_isBAD indicator column.
treated_data <- prepare(treat_plan, data)

In [None]:
# Display the treated data frame.
treated_data

ID,Age,Income,Number_of_Vehicles,Number_of_Vehicles_isBAD,Gas_Usage
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,48,69894,1.000000,0,57
2,32,39639,1.000000,0,163
3,68,115277,3.000000,0,54
4,31,39325,2.000000,0,78
5,20,56509,3.000000,0,99
6,59,50959,2.000000,0,119
7,67,44402,2.000000,0,123
8,71,94501,2.000000,0,75
9,60,101502,1.571429,1,122
10,54,75220,2.000000,0,60


### ***2. Demonstrates data transformation***

In [None]:
 # Mean of the treated Age column; reused below for centering and filtering.
 mean_age <- mean(treated_data$Age)
 mean_age

In [None]:
 # Sample standard deviation of the treated Age column; reused below as the
 # scaling unit.
 sd_age <- sd(treated_data$Age)
 sd_age

In [None]:
# Lower and upper bound of the interval one standard deviation either side
# of the mean age.
c(mean_age - sd_age, mean_age + sd_age)

In [None]:
# Standardize Age into z-scores: distance from the mean, measured in standard
# deviations. The result is stored as a new column and the frame is displayed.
treated_data[["scaled_age"]] <- (treated_data[["Age"]] - mean_age) / sd_age
treated_data

ID,Age,Income,Number_of_Vehicles,Number_of_Vehicles_isBAD,Gas_Usage,scaled_age
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,48,69894,1.000000,0,57,0.07683301
2,32,39639,1.000000,0,163,-0.92262076
3,68,115277,3.000000,0,54,1.32615022
4,31,39325,2.000000,0,78,-0.98508662
5,20,56509,3.000000,0,99,-1.67221109
6,59,50959,2.000000,0,119,0.76395748
7,67,44402,2.000000,0,123,1.26368436
8,71,94501,2.000000,0,75,1.51354780
9,60,101502,1.571429,1,122,0.82642334
10,54,75220,2.000000,0,60,0.45162817


In [None]:
# Base-R row filter: keep rows whose Age is strictly less than one standard
# deviation away from the mean (the mask is a plain logical vector).
treated_data[abs(treated_data$Age - mean_age) < sd_age, ]

Unnamed: 0_level_0,ID,Age,Income,Number_of_Vehicles,Number_of_Vehicles_isBAD,Gas_Usage,scaled_age,gp
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,48,69894,1.0,0,57,0.07683301,0.13338432
2,2,32,39639,1.0,0,163,-0.92262076,0.63822518
4,4,31,39325,2.0,0,78,-0.98508662,0.59270076
6,6,59,50959,2.0,0,119,0.76395748,0.50386834
9,9,60,101502,1.571429,1,122,0.82642334,0.7400158
10,10,54,75220,2.0,0,60,0.45162817,0.02553237
12,12,31,110685,2.0,0,55,-0.98508662,0.87516184
14,14,42,62605,1.571429,1,168,-0.29796216,0.77641018
15,15,43,46151,0.0,0,137,-0.23549629,0.85928046
16,16,44,55558,3.0,0,116,-0.17303043,0.284626


In [None]:
# dplyr version of the within-one-sd filter; shows Age beside its z-score
# for the first six matching rows.
treated_data %>%
  dplyr::filter(abs(Age - mean_age) < sd_age) %>%
  dplyr::select(Age, scaled_age) %>%
  utils::head(n = 6)


Unnamed: 0_level_0,Age,scaled_age
Unnamed: 0_level_1,<dbl>,<dbl>
1,48,0.07683301
2,32,-0.92262076
3,31,-0.98508662
4,59,0.76395748
5,60,0.82642334
6,54,0.45162817


In [None]:
# Complementary filter: rows whose Age is strictly more than one standard
# deviation away from the mean; first six rows of Age and its z-score.
treated_data %>%
  dplyr::filter(abs(Age - mean_age) > sd_age) %>%
  dplyr::select(Age, scaled_age) %>%
  utils::head(n = 6)


Unnamed: 0_level_0,Age,scaled_age
Unnamed: 0_level_1,<dbl>,<dbl>
1,68,1.32615
2,20,-1.672211
3,67,1.263684
4,71,1.513548
5,69,1.388616
6,71,1.513548


### ***3. Demonstrates Sampling***

In [None]:
# Random test/train split: every row gets a uniform draw in gp; rows with
# gp <= 0.1 form the test set and the remaining rows form the training set.
set.seed(25643)
treated_data[["gp"]] <- stats::runif(nrow(treated_data))
treated_test <- treated_data[treated_data[["gp"]] <= 0.1, ]
treated_train <- treated_data[treated_data[["gp"]] > 0.1, ]
dim(treated_train)
dim(treated_test)

### ***Group Level Sampling***


In [None]:
# Example customer table in which several customers share a household.
# Each customer_id has the form <household_id>_<member number>, so the
# household_id column is derived by stripping the trailing "_NN" suffix.
household_data <- local({
  customer_id <- c(
    "000000004_01", "000000023_01", "000000023_02", "000000327_01",
    "000000327_02", "000000328_01", "000000328_02", "000000404_01",
    "000000424_01", "000000424_02"
  )
  data.frame(
    household_id = sub("_[0-9]+$", "", customer_id),
    customer_id = customer_id,
    age = c(65, 43, 61, 30, 30, 62, 62, 82, 45, 38),
    income = c(
      940, 29000, 42000, 47000, 37400, 42500, 31800, 28600, 160000, 250000
    )
  )
})
household_data

household_id,customer_id,age,income
<chr>,<chr>,<dbl>,<dbl>
4,000000004_01,65,940
23,000000023_01,43,29000
23,000000023_02,61,42000
327,000000327_01,30,47000
327,000000327_02,30,37400
328,000000328_01,62,42500
328,000000328_02,62,31800
404,000000404_01,82,28600
424,000000424_01,45,160000
424,000000424_02,38,250000


In [None]:
# Group-level sampling key: one row per distinct household, each with a
# single uniform random draw (gp) shared by all members of that household.
set.seed(243674)
hh <- household_data$household_id[!duplicated(household_data$household_id)]
households <- data.frame(
  household_id = hh,
  gp = stats::runif(length(hh)),
  stringsAsFactors = FALSE
)
households


household_id,customer_id,age,income,gp
<chr>,<chr>,<dbl>,<dbl>,<dbl>
4,000000004_01,65,940,0.2063638
23,000000023_01,43,29000,0.4543296
23,000000023_02,61,42000,0.4543296
327,000000327_01,30,47000,0.9931105
327,000000327_02,30,37400,0.9931105
328,000000328_01,62,42500,0.6279021
328,000000328_02,62,31800,0.6279021
404,000000404_01,82,28600,0.5155618
424,000000424_01,45,160000,0.6541317
424,000000424_02,38,250000,0.6541317


In [None]:

# Attach each household's gp draw to every customer row of that household.
# This cell overwrites household_data, so a gp column left over from an
# earlier run is dropped first; otherwise re-running the cell would join gp
# a second time and yield suffixed duplicates (gp.x / gp.y).
household_data$gp <- NULL
household_data <- dplyr::left_join(
  household_data,
  households,
  by = "household_id"
)
household_data


### ***4. More Exercises on Data Cleaning:***

#### ***4.1 Handling Missing Values***

In [None]:
# Load the built-in airquality dataset into the workspace.
data(airquality)

In [None]:
# Inspect the column types and the first few values of each column.
str(airquality)

'data.frame':	153 obs. of  6 variables:
 $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
 $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
 $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
 $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
 $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
 $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...


In [None]:
# Per-column summary statistics; columns with missing values also report
# their NA count.
summary(airquality)

     Ozone           Solar.R           Wind             Temp      
 Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
 1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
 Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
 Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
 3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
 Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
 NA's   :37       NA's   :7                                       
     Month            Day      
 Min.   :5.000   Min.   : 1.0  
 1st Qu.:6.000   1st Qu.: 8.0  
 Median :7.000   Median :16.0  
 Mean   :6.993   Mean   :15.8  
 3rd Qu.:8.000   3rd Qu.:23.0  
 Max.   :9.000   Max.   :31.0  
                               

In [None]:
# Total number of missing cells across the whole data frame.
sum(is.na(airquality))

In [None]:
# Strategy 1 (deletion): drop every row that contains at least one NA.
cleaned_data <- na.omit(airquality)

In [None]:
# Strategy 2 (imputation): work on a copy and, in every column that has
# missing values, replace each NA with the mean of that column's observed
# values. Columns without NA are returned untouched.
imputed_data <- airquality
imputed_data[] <- lapply(imputed_data, function(column) {
  gaps <- is.na(column)
  if (any(gaps)) {
    column[gaps] <- mean(column, na.rm = TRUE)
  }
  column
})
# Confirm that no missing cells remain.
sum(is.na(imputed_data))


#### ***4.2 Removing Duplicates***

In [None]:
# Small example table in which the row with ID 5 appears twice.
# Value is ten times the ID in every row.
data <- data.frame(ID = c(1:5, 5, 6))
data$Value <- data$ID * 10
data

ID,Value
<dbl>,<dbl>
1,10
2,20
3,30
4,40
5,50
5,50
6,60


In [None]:
# Logical vector with one entry per row: TRUE when the row repeats an
# earlier row in every column.
duplicates <- duplicated(data)
duplicates

In [None]:
# Keep only the rows not flagged as duplicates (first occurrences).
# Note: this reuses the name cleaned_data and therefore replaces the
# na.omit() result created in section 4.1.
cleaned_data <- data[!duplicates, ]
cleaned_data

Unnamed: 0_level_0,ID,Value
Unnamed: 0_level_1,<dbl>,<dbl>
1,1,10
2,2,20
3,3,30
4,4,40
5,5,50
7,6,60


#### ***4.3 Correcting Data Types***

In [None]:
# Example table with deliberately wrong column types: numeric IDs stored as
# character strings and numeric values stored as a factor.
data <- data.frame(
  ID = c("1", "2", "3", "4", "5"),
  Value = factor(seq(10, 50, by = 10))
)
data

ID,Value
<chr>,<fct>
1,10
2,20
3,30
4,40
5,50


In [None]:
# Convert both columns to numeric. The values go through as.character()
# first because as.numeric() applied directly to a factor would return the
# internal level codes rather than the labels.
data[c("ID", "Value")] <- lapply(
  data[c("ID", "Value")],
  function(column) as.numeric(as.character(column))
)
data

ID,Value
<dbl>,<dbl>
1,10
2,20
3,30
4,40
5,50


#### ***4.4 Scaling and Normalization***

In [None]:
# Work on a copy of the built-in mtcars dataset.
data <- mtcars
data

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [None]:
# Standardize every column with scale()'s defaults: subtract the column mean
# and divide by the column standard deviation.
scaled_data <- scale(data)
scaled_data

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,0.15088482,-0.1049878,-0.57061982,-0.53509284,0.56751369,-0.610399567,-0.77716515,-0.8680278,1.1899014,0.4235542,0.7352031
Mazda RX4 Wag,0.15088482,-0.1049878,-0.57061982,-0.53509284,0.56751369,-0.349785269,-0.46378082,-0.8680278,1.1899014,0.4235542,0.7352031
Datsun 710,0.44954345,-1.2248578,-0.99018209,-0.78304046,0.47399959,-0.917004624,0.42600682,1.1160357,1.1899014,0.4235542,-1.1221521
Hornet 4 Drive,0.21725341,-0.1049878,0.22009369,-0.53509284,-0.96611753,-0.002299538,0.89048716,1.1160357,-0.8141431,-0.9318192,-1.1221521
Hornet Sportabout,-0.23073453,1.0148821,1.04308123,0.41294217,-0.83519779,0.227654255,-0.46378082,-0.8680278,-0.8141431,-0.9318192,-0.5030337
Valiant,-0.3302874,-0.1049878,-0.04616698,-0.60801861,-1.56460776,0.248094592,1.32698675,1.1160357,-0.8141431,-0.9318192,-1.1221521
Duster 360,-0.96078893,1.0148821,1.04308123,1.43390296,-0.72298087,0.360516446,-1.12412636,-0.8680278,-0.8141431,-0.9318192,0.7352031
Merc 240D,0.71501778,-1.2248578,-0.67793094,-1.23518023,0.17475447,-0.027849959,1.20387148,1.1160357,-0.8141431,0.4235542,-0.5030337
Merc 230,0.44954345,-1.2248578,-0.72553512,-0.75387015,0.60491932,-0.068730634,2.82675459,1.1160357,-0.8141431,0.4235542,-0.5030337
Merc 280,-0.1477738,-0.1049878,-0.50929918,-0.34548584,0.60491932,0.227654255,0.25252621,1.1160357,-0.8141431,0.4235542,0.7352031


In [None]:
#' Rescale a numeric vector to the [0, 1] range (min-max normalization).
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when computing the
#'   range and stay `NA` in the result. With the default `FALSE`, any `NA`
#'   in `x` makes the whole result `NA`, as in the original implementation.
#' @return A numeric vector the same length as `x`. A constant input maps to
#'   all zeros rather than `NaN` from the 0/0 division.
normalize <- function(x, na.rm = FALSE) {
  # Nothing to rescale: empty input, or only NAs once they are removed.
  if (length(x) == 0L || (na.rm && all(is.na(x)))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # A zero range would divide 0 by 0; map every observed value to 0 instead.
  if (!is.na(span) && span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - rng[1]) / span
}
# Apply normalize() to every column and rebuild a data frame from the
# resulting list. Note: the rebuilt frame gets default row names, so the car
# model names carried by mtcars are not kept (see the recorded output).
normalized_data <- as.data.frame(lapply(data, normalize))
normalized_data

mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.4510638,0.5,0.22175106,0.204947,0.52534562,0.28304781,0.23333333,0,1,0.5,0.4285714
0.4510638,0.5,0.22175106,0.204947,0.52534562,0.34824853,0.3,0,1,0.5,0.4285714
0.5276596,0.0,0.0920429,0.14487633,0.50230415,0.20634109,0.48928571,1,1,0.5,0.0
0.4680851,0.5,0.46620105,0.204947,0.14746544,0.43518282,0.58809524,1,0,0.0,0.0
0.3531915,1.0,0.72062859,0.43462898,0.1797235,0.49271286,0.3,0,0,0.0,0.1428571
0.3276596,0.5,0.38388626,0.18727915,0.0,0.49782664,0.68095238,1,0,0.0,0.0
0.1659574,1.0,0.72062859,0.6819788,0.20737327,0.52595244,0.15952381,0,0,0.0,0.4285714
0.5957447,0.0,0.1885757,0.03533569,0.42857143,0.42879059,0.6547619,1,0,0.5,0.1428571
0.5276596,0.0,0.17385882,0.15194346,0.53456221,0.41856303,1.0,1,0,0.5,0.1428571
0.3744681,0.5,0.24070841,0.25088339,0.53456221,0.49271286,0.45238095,1,0,0.5,0.4285714


#### ***4.5 Data Splitting***

In [None]:
# Work on a copy of the built-in iris dataset.
data <- iris
data

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
<dbl>,<dbl>,<dbl>,<dbl>,<fct>
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa


In [None]:
# Install caret only when it is not already available, then attach it.
# The recorded install output lists many dependencies, so skipping an
# unnecessary reinstall saves considerable time on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘shape’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘lava’, ‘prodlim’, ‘proxy’, ‘iterators’, ‘clock’, ‘gower’, ‘hardhat’, ‘ipred’, ‘timeDate’, ‘e1071’, ‘foreach’, ‘ModelMetrics’, ‘plyr’, ‘pROC’, ‘recipes’, ‘reshape2’


Loading required package: ggplot2

Loading required package: lattice



In [None]:
# Row indices for an 80% training partition sampled within each Species
# level; list = FALSE returns them as a matrix (column Resample1 in the
# recorded output) instead of a list.
set.seed(123)
trainIndex <- caret::createDataPartition(
  y = data$Species,
  p = 0.8,
  list = FALSE
)
trainIndex


Resample1
3
4
5
7
8
9
10
11
12
13


## **1. Handling Missing Values**

### ***Categorical Data:***
- **a. Load a dataset with categorical variables and introduce some missing values. How would you handle these missing values by:**
  - **Replacing them with the mode of the variable?**

    To replace missing values with the mode (most frequent category) in a categorical variable, you can use the following code:

    ```r
    # Sample dataset with categorical variables
    data <- data.frame(
      Category = sample(c("A", "B", "C", NA), 100, replace = TRUE)
    )

    # Find the mode (most frequent category)
    mode_value <- names(sort(table(data$Category), decreasing = TRUE))[1]

    # Replace NA with mode
    data$Category[is.na(data$Category)] <- mode_value
    ```

  - **Replacing them with a new category, such as "Unknown"?**

    To replace missing values in a categorical variable by assigning a new category (e.g., "Unknown"):

    ```r
    # Replace NA with "Unknown"
    data$Category[is.na(data$Category)] <- "Unknown"
    ```

- **b. How would you use the vtreat package to create a treatment plan for handling missing values in categorical data?**

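    The `vtreat` package can automatically handle missing values. The `design_missingness_treatment()` function creates a treatment plan that can then be applied to the dataset with `prepare()`. For a categorical variable, the plan replaces each `NA` with a dedicated marker level (by default `"_invalid_"`), so missingness becomes an explicit category instead of being dropped. Here's how you could use it: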

    ```r
    library(vtreat)
    
    # Sample dataset with missing categorical values
    data <- data.frame(
      Category = sample(c("A", "B", "C", NA), 100, replace = TRUE)
    )

    # Create treatment plan
    treat_plan <- design_missingness_treatment(data, varlist = c("Category"))

    # Apply treatment plan to handle missing values
    treated_data <- prepare(treat_plan, data)
    ```

### ***Numerical Data:***
- **c. Load a dataset with numerical variables and introduce some missing values. How would you handle these missing values by:**
  - **Replacing them with the mean of the variable?**

    In numerical data, you can replace missing values with the mean of the variable:

    ```r
    # Sample dataset with missing numerical values
    data <- data.frame(
      Value = sample(c(10:100, NA), 100, replace = TRUE)
    )

    # Replace NA with mean
    data$Value[is.na(data$Value)] <- mean(data$Value, na.rm = TRUE)
    ```

  - **Replacing them with the median of the variable?**

    To replace missing values with the median of the variable:

    ```r
    # Replace NA with median
    data$Value[is.na(data$Value)] <- median(data$Value, na.rm = TRUE)
    ```




## **2. Log Transformation**

- **Load a dataset with a skewed numerical variable. How would you apply a log transformation to reduce skewness?**

    You can apply a log transformation using the `log()` function:

    ```r
    # Sample dataset with skewed numerical variable
    data <- data.frame(val = c(1, 5, 10, 50, 100, 500, 1000))

    # Log transformation
    data$log_val <- log(data$val)
    ```

- **After applying the log transformation, how would you check if the skewness has been reduced?**

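    `summary()` does not report skewness directly, but it lets you compare the mean with the median: the closer they are, the more symmetric the distribution. A histogram shows the shape visually, so compare both before and after the transformation: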

    ```r
    # Check skewness
    summary(data$log_val)

    # Histogram to visualize distribution
    hist(data$log_val)
    ```





## **3. Sampling**

- **Load a dataset and perform simple random sampling to select 70% of the data for training and 30% for testing. How would you do this in R?**

    You can use `sample()` for random sampling:

    ```r
    set.seed(123)
    data <- data.frame(val = rnorm(100))  # Example dataset

    # 70% of the row indices, drawn at random without replacement
    n_train <- round(0.7 * nrow(data))
    train_idx <- sample(seq_len(nrow(data)), n_train)

    # drop = FALSE keeps train/test as data frames; with a single column,
    # data[idx, ] would otherwise collapse to a plain numeric vector
    train <- data[train_idx, , drop = FALSE]
    test <- data[-train_idx, , drop = FALSE]
    ```

- **How would you perform stratified sampling based on a categorical variable to ensure each category is proportionately represented in the sample?**

    You can use the `caret` package for stratified sampling:

    ```r
    library(caret)
    set.seed(123)

    # Stratified sampling based on a categorical variable (Species in iris dataset)
    train_idx <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
    train <- iris[train_idx, ]
    test <- iris[-train_idx, ]
    ```




## **4. Normalization**

- **Load a dataset with numerical variables. How would you normalize these variables to a range of [0, 1]?**

    You can create a custom normalization function:

    ```r
    # Sample dataset
    data <- mtcars

    # Normalize function
    # Min-max rescaling to [0, 1]. A constant column has zero range, which
    # would give 0 / 0 = NaN, so it is mapped to all zeros instead. Set
    # na.rm = TRUE to ignore missing values when computing the range.
    normalize <- function(x, na.rm = FALSE) {
      if (length(x) == 0L || (na.rm && all(is.na(x)))) {
        return(as.numeric(x))
      }
      rng <- range(x, na.rm = na.rm)
      span <- rng[2] - rng[1]
      if (!is.na(span) && span == 0) {
        return(ifelse(is.na(x), NA_real_, 0))
      }
      (x - rng[1]) / span
    }

    # Apply normalization
    norm_data <- as.data.frame(lapply(data, normalize))
    ```

- **After normalizing the variables, how would you verify that the transformed variables are within the [0, 1] range?**

    You can check using `summary()`:

    ```r
    # Verify normalization
    summary(norm_data)
    ```




## **5. Standardization**

- **Load a dataset with numerical variables. How would you standardize these variables to have a mean of 0 and a standard deviation of 1?**

    You can use the `scale()` function in R:

    ```r
    # Sample dataset
    data <- mtcars

    # Standardize the data
    std_data <- scale(data)
    ```

- **After standardizing the variables, how would you check if the mean and standard deviation are as expected (mean = 0, sd = 1)?**

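    You can check the mean and standard deviation of each column using `colMeans()` and `apply()`. The means will be zero only up to floating-point rounding error (tiny values such as `1e-17`), while the standard deviations should all print as 1: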

    ```r
    # Check mean
    colMeans(std_data)

    # Check standard deviation
    apply(std_data, 2, sd)
    ```

