# Part 1) RDDs

## Data Loading

In [None]:
// Load the review dev set from HDFS; spark.read.json parses the JSON file into a DataFrame.
val Path = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
val data = spark.read.json(Path)
// Build the pair RDD: key = category, value = reviewText.
// Only the two columns that are actually used are selected before switching to the RDD API.
// A missing (null) reviewText is replaced by "" so that string operations applied to the
// value later (e.g. toLowerCase) cannot fail with a NullPointerException on the executors.
val pairs = data.select("category", "reviewText").rdd.map { row =>
  val category = row.getAs[String]("category")
  val text = Option(row.getAs[String]("reviewText")).getOrElse("")
  (category, text)
}
// Total number of reviews (N in the chi-square formula).
// NOTE(review): count() returns a Long; toInt is kept so N stays an Int for existing users,
// which assumes the data set has fewer than Int.MaxValue reviews.
val N = pairs.count().toInt
// Read the stop-word file (one entry per line) and collect it into a local array.
// Lines are trimmed and blank lines dropped so stray whitespace cannot prevent a match.
val stopwordsPath = "Exercise2/stopwords.txt"
val stopwords = sc.textFile(stopwordsPath).map(_.trim).filter(_.nonEmpty).collect()

Intitializing Scala interpreter ...

## Data Cleaning

In [None]:
// Normalise every review text (the value of each pair):
//   1. convert it to lowercase,
//   2. split it on every run of non-alphabetic characters,
//   3. drop empty tokens and stop words,
//   4. join the remaining tokens with single spaces.
// A review with no remaining tokens becomes the empty string.
// The result is cached because it is reused by several downstream computations.
val clean_reviews = {
  // A Set gives constant-time membership tests instead of a linear scan of the array per token.
  val stopwordSet = stopwords.toSet
  pairs.mapValues { text =>
    text.toLowerCase
      .split("[^a-zA-Z]+")
      // split yields a leading "" when the text starts with a non-letter; it must not survive
      // as a token. Tokens are already lowercase here, so no second toLowerCase is needed.
      .filter(token => token.nonEmpty && !stopwordSet.contains(token))
      .mkString(" ")
  }.cache()
}

Intitializing Scala interpreter ...

## Chi-Square Value Calculation

In [None]:
// Splits a cleaned review into its terms, keeping each term at most once and dropping
// empty tokens. Chi-square works on document counts (countCategory and N both count
// reviews), so a term repeated inside one review must contribute only once; otherwise the
// per-category term count can exceed the number of reviews in the category and the
// derived contingency values become negative.
val distinctTerms: String => Seq[String] =
  text => text.split(" ").filter(_.nonEmpty).distinct.toSeq

// countWords: (term, number of reviews that contain the term), over all categories.
val countWords = clean_reviews
  .flatMap { case (_, text) => distinctTerms(text) }
  .map(term => (term, 1))
  .reduceByKey(_ + _)

// countCategory: (category, number of reviews in the category).
val countCategory = clean_reviews
  .map { case (category, _) => (category, 1) }
  .reduceByKey(_ + _)

// categoryTermCount: ((category, term), number of reviews of the category containing the term).
val categoryTermCount = clean_reviews
  .flatMapValues(text => distinctTerms(text))
  .map(categoryAndTerm => (categoryAndTerm, 1))
  .reduceByKey(_ + _)

// Re-key by term so the records can be joined with countWords:
// (term, (category, A)) where A is the per-category count from categoryTermCount.
val result_A = categoryTermCount.map { case ((category, term), count) =>
  (term, (category, count))
}


Intitializing Scala interpreter ...

In [None]:
// Attach the overall count of every term to its per-category count by joining on the term.
// Joined element: (term, ((category, A), termTotal))
val rddJoin = result_A.join(countWords)
// B is the term's count outside the category: the overall count minus A.
// The records are re-keyed by category: (category, (term, A, B))
val result_A_B = rddJoin.map { case (term, ((category, a), termTotal)) =>
  (category, (term, a, termTotal - a))
}

Intitializing Scala interpreter ...

In [None]:
// Attach the per-category total to every (term, A, B) record by joining on the category.
// Joined element: (category, ((term, A, B), categoryCount))
// The joined RDD is persisted at Spark's default storage level.
val rddJoin2 = result_A_B.join(countCategory).persist()
// Prints the RDD's string representation, not its elements.
print(rddJoin2)

Intitializing Scala interpreter ...

In [None]:
// Compute the chi-square score of every (category, term) pair from the contingency values
//   A = count of the term inside the category
//   B = count of the term outside the category
//   C = category total minus A
//   D = N - A - B - C
// using chi2 = N * (A*D - B*C)^2 / ((A+B) * (A+C) * (B+D) * (C+D)).
// Double is used instead of Float: the products A*D and B*C are large and nearly cancel,
// which a 24-bit Float mantissa cannot represent accurately.
// Result element: (category, (score, term))
val chi2 = rddJoin2.map { case (category, ((term, termInCategory, termElsewhere), categoryTotal)) =>
  val total = N.toDouble
  val a = termInCategory.toDouble
  val b = termElsewhere.toDouble
  val c = categoryTotal - a
  val d = total - a - b - c
  val diff = a * d - b * c
  val denominator = (a + b) * (a + c) * (b + d) * (c + d)
  // A zero marginal would give 0/0 = NaN, which also breaks the ordering used below;
  // such a term carries no information about the category, so it scores 0.
  val score = if (denominator == 0.0) 0.0 else total * diff * diff / denominator
  (category, (score, term))
}
// Collect all (score, term) pairs of a category and order them by descending score.
val grouped = chi2.groupByKey().mapValues(_.toList.sortBy { case (score, _) => -score })
// Keep the 75 best-scoring terms per category and order the categories by key.
val grouped_75 = grouped.mapValues(_.take(75)).sortByKey()

Intitializing Scala interpreter ...

## Export

In [None]:
import scala.reflect.io.File

// Render one line per category: "<category> term:score term:score ...".
// Each element of the per-category list is a (score, term) pair.
val output = grouped_75.map { case (category, scoredTerms) =>
  val formattedTerms = scoredTerms
    .map { case (score, term) => s"$term:$score" }
    .mkString(" ")

  s"<$category> $formattedTerms"
}

// Write the category lines to a local file, replacing any previous content.
val file = File("output_rdd.txt")
val categoryLines = output.collect()
file.writeAll(categoryLines.mkString("\n"))

// Gather every selected term once, across all categories, in sorted order.
val terms = grouped_75
  .flatMap { case (_, scoredTerms) => scoredTerms.map { case (_, term) => term } }
  .distinct()
  .collect()
  .sorted

// Append the sorted terms, space separated, as one final line of the file.
file.appendAll("\n" + terms.mkString(" "))

Intitializing Scala interpreter ...