Skip to content

BruceBuurstra/MovieDataCleaner

Repository files navigation

MovieDataCleaner

MovieDataCleaner is an R package for cleaning datasets that contain missing values, using NMF imputation and kNN imputation.

See pkgdown site here: https://brucebuurstra.github.io/MovieDataCleaner/

MovieDataCleaner easily:

  • drops rows and columns containing missing values
  • imputes missing values using NMF or kNN
  • clusters imputed datasets

Installation

You can install the released version of MovieDataCleaner from GitHub with:

devtools::install_github("BruceBuurstra/MovieDataCleaner")

kNN Imputation

Below we have a dataset of movie ratings with a lot of missing values.

library(MovieDataCleaner)
# Load the ratings data shipped in the package's data/ directory instead of
# reading it from a machine-specific absolute path, so the example runs for
# anyone who has installed the package.
data("new_data", package = "MovieDataCleaner")
head(new_data, n = c(5, 5))
#>     1   2  6 10 11
#> 1 4.0  NA  4 NA NA
#> 2 4.5  NA NA NA NA
#> 3 3.5 3.0  4 NA NA
#> 4 4.0 3.0 NA  2 NA
#> 5 3.5 3.5 NA  5 NA

The run_knn_imputation function fills in the missing values of a dataset using k-nearest-neighbour imputation.

# Impute the missing ratings using the 5 nearest neighbours.
knn_imputed <- run_knn_imputation(n = 5, data = new_data)
# Show the top-left 5 x 5 corner of the imputed matrix.
head(knn_imputed, n = c(5, 5))
#>        1   2        6       10    11
#> [1,] 4.0 5.0 4.000000 5.000000 4.000
#> [2,] 4.5 4.0 3.928571 2.750000 2.000
#> [3,] 3.5 3.0 4.000000 4.166667 3.625
#> [4,] 4.0 3.0 4.000000 2.000000 3.500
#> [5,] 3.5 3.5 3.750000 5.000000 3.500

We can also use round_to_movie_rating to make sure every imputed value is rounded to a valid movie rating.

# Apply round_to_movie_rating to every individual cell of the imputed matrix
# (sapply walks a matrix element by element, in column-major order).
knn_round <- sapply(knn_imputed, round_to_movie_rating)
# Rebuild a matrix with one column per column of new_data, filling column-wise
# to match the order sapply visited the cells. The original row/column names
# are not carried over, which is why the output below shows [,1], [,2], ...
ratings_knn_imputed <- matrix(unlist(knn_round),ncol = ncol(new_data), byrow = FALSE)
# Show the top-left 5 x 5 corner of the rounded matrix.
head(ratings_knn_imputed, c(5,5))
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]  4.0  5.0    4    5  4.0
#> [2,]  4.5  4.0    4    3  2.0
#> [3,]  3.5  3.0    4    4  3.5
#> [4,]  4.0  3.0    4    2  3.5
#> [5,]  3.5  3.5    4    5  3.5

Below is the source code of run_knn_imputation, together with its documentation.

#' Impute missing values with column-wise k-nearest neighbours
#'
#' `run_knn_imputation` fills the missing cells of a dataset with the average
#' of the values found in the same row of the `n` most similar columns.
#'
#' @details
#' The data is converted with [data.matrix()] and every `NA` is recoded as `0`;
#' from then on a `0` means "missing" (so genuine zeros in `data` are treated
#' as missing too). The distance between two columns is the Euclidean distance
#' over the rows in which both columns are non-zero. Pairs of columns that
#' share no such row have no defined distance (`NA`) and are never used as
#' neighbours of each other.
#'
#' For each missing cell, the `n` nearest columns are looked up and the mean of
#' their non-zero values in that row is imputed. If none of those neighbours
#' has a value in the row, the cell stays `0`.
#'
#' As a side effect the function also writes the column distance matrix to
#' `euclidean_matrix` and the result to `knn_imputed` in the global
#' environment.
#'
#' @param n The number of nearest neighbours (columns) to use; a single number
#'   greater than or equal to 1. Fewer are used when fewer are available.
#' @param data A dataset (data frame or matrix) with missing values as `NA`.
#' @return A numeric matrix with the dimensions of `data` in which missing
#'   values have been imputed; cells that could not be imputed are `0`.
#' @examples
#'
#' # movie ratings with missing values imputed from the 5 nearest neighbours
#' run_knn_imputation(5, new_data)
#'
#' @export
run_knn_imputation <- function(n, data) {
  stopifnot(is.numeric(n), length(n) == 1L, is.finite(n), n >= 1)
  n <- as.integer(n)

  # Convert first, then recode: this also catches NAs that the conversion
  # itself introduces, so the `!= 0` tests below never see an NA.
  data_matrix <- data.matrix(data)
  data_matrix[is.na(data_matrix)] <- 0
  n_cols <- ncol(data_matrix)
  # TRUE where a cell holds a real (non-missing) value
  observed <- data_matrix != 0

  # Pairwise column distances; NA marks pairs without any shared row.
  euclidean_matrix <- matrix(NA_real_, nrow = n_cols, ncol = n_cols)
  for (i in seq_len(n_cols)) {
    # the distance is symmetric, so only the upper triangle is computed
    for (j in seq.int(i, n_cols)) {
      shared <- observed[, i] & observed[, j]
      if (any(shared)) {
        diffs <- data_matrix[shared, i] - data_matrix[shared, j]
        # square each difference before summing, so that differences of
        # opposite sign cannot cancel each other out
        euc <- sqrt(sum(diffs^2))
        euclidean_matrix[i, j] <- euc
        euclidean_matrix[j, i] <- euc
      }
    }
  }

  knn_imputed <- data_matrix
  for (i in seq_len(n_cols)) {
    missing_rows <- which(!observed[, i])
    if (length(missing_rows) == 0L) {
      next
    }
    # Keep the real column indices of the candidates: positions within
    # euclidean_matrix[-i, i] are shifted by one for every column after i.
    candidates <- setdiff(seq_len(n_cols), i)
    candidate_dist <- euclidean_matrix[candidates, i]
    comparable <- !is.na(candidate_dist)
    candidates <- candidates[comparable]
    candidate_dist <- candidate_dist[comparable]
    if (length(candidates) == 0L) {
      next
    }
    # order() yields exactly k neighbours, even when distances are tied
    k <- min(n, length(candidates))
    neighbor_cols <- candidates[order(candidate_dist)[seq_len(k)]]

    # Values of the neighbours in the rows that need imputing. Always read
    # from the original data so imputed values never feed later imputations.
    neighbor_values <- data_matrix[missing_rows, neighbor_cols, drop = FALSE]
    n_rated <- rowSums(neighbor_values != 0)
    # zeros add nothing to the sum, so sum / n_rated is the non-zero mean
    has_rating <- n_rated > 0
    knn_imputed[missing_rows[has_rating], i] <-
      rowSums(neighbor_values)[has_rating] / n_rated[has_rating]
  }

  # NOTE(review): writing into the global environment is kept only for
  # backward compatibility with code that reads these objects; prefer using
  # the return value.
  assign("euclidean_matrix", euclidean_matrix, envir = .GlobalEnv)
  assign("knn_imputed", knn_imputed, envir = .GlobalEnv)
  knn_imputed
}

About

Cleaning Datasets with KNN and NMF Imputation

Resources

License

Unknown, MIT licenses found

Licenses found

Unknown
LICENSE
MIT
LICENSE.md

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages