-
Notifications
You must be signed in to change notification settings - Fork 4
Publish AnalyzedItems to Cassandra #28
Changes from 14 commits
bbd3053
3c84f12
7582e35
c75e93f
4e30b78
d6ff036
0808459
1a2d9bf
f1a5ad4
3738559
629e5d3
fb867d9
e736898
f34f55a
37c1983
a5ed3f5
32d0701
a46c701
8162fe3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.pipeline | ||
|
||
import java.time.Instant.now | ||
import java.util.UUID.randomUUID | ||
|
||
import com.microsoft.partnercatalyst.fortis.spark.dto.{Analysis, AnalyzedItem} | ||
import com.microsoft.partnercatalyst.fortis.spark.streamprovider.{ConnectorConfig, StreamProvider} | ||
|
@@ -17,6 +18,7 @@ object RadioPipeline extends Pipeline { | |
|
||
private def convertToSchema(stream: DStream[RadioTranscription], transformContext: TransformContext): DStream[AnalyzedItem] = { | ||
stream.map(transcription => AnalyzedItem( | ||
id = randomUUID(), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review comment: This should be called via the uuid() function in Cassandra. |
||
createdAtEpoch = now.getEpochSecond, | ||
body = transcription.text, | ||
title = "", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.pipeline | ||
|
||
import java.time.Instant.now | ||
import java.util.UUID.randomUUID | ||
|
||
import com.microsoft.partnercatalyst.fortis.spark.dto.{Analysis, AnalyzedItem} | ||
import com.microsoft.partnercatalyst.fortis.spark.streamprovider.{ConnectorConfig, StreamProvider} | ||
|
@@ -19,6 +20,7 @@ object TadawebPipeline extends Pipeline { | |
import transformContext._ | ||
|
||
stream.map(tada => AnalyzedItem( | ||
id = randomUUID(), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review comment: This should be called via the uuid() function in Cassandra. Reply: The advantage of creating the id early is that we have a way to track every event through the pipeline (e.g. useful when logging). Is this benefit worth explicitly creating the UUID? |
||
createdAtEpoch = now.getEpochSecond, | ||
body = tada.text, | ||
title = tada.title, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra | ||
|
||
import scala.util.Properties.envOrElse | ||
|
||
import org.apache.spark.SparkConf | ||
import org.apache.spark.streaming.Duration | ||
|
||
object CassandraConfig {
  /**
   * Fills in Cassandra connection settings on the given Spark configuration.
   *
   * Values already present on `conf` win; otherwise they are taken from the
   * FORTIS_CASSANDRA_* environment variables (defaulting to the empty string).
   * The keep-alive is sized at twice the streaming batch duration so the
   * connection survives between consecutive micro-batches.
   *
   * @param conf          the Spark configuration to augment (mutated and returned)
   * @param batchDuration the streaming batch interval used to derive the keep-alive
   * @return the same `conf` instance, for chaining
   */
  def init(conf: SparkConf, batchDuration: Duration): SparkConf = {
    val keepAliveMs = batchDuration.milliseconds * 2
    val defaults = Seq(
      "spark.cassandra.connection.host" -> envOrElse("FORTIS_CASSANDRA_HOST", ""),
      "spark.cassandra.auth.username" -> envOrElse("FORTIS_CASSANDRA_USER", ""),
      "spark.cassandra.auth.password" -> envOrElse("FORTIS_CASSANDRA_PASSWORD", ""),
      "spark.cassandra.connection.keep_alive_ms" -> keepAliveMs.toString)

    defaults.foldLeft(conf) { case (cfg, (key, value)) => cfg.setIfMissing(key, value) }
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra | ||
|
||
import java.time.Instant.now | ||
import java.util.UUID | ||
|
||
import com.microsoft.partnercatalyst.fortis.spark.dto.AnalyzedItem | ||
import com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra.Utils.{mean, rescale} | ||
import com.microsoft.partnercatalyst.fortis.spark.transforms.gender.GenderDetector.{Female, Male} | ||
import com.microsoft.partnercatalyst.fortis.spark.transforms.sentiment.SentimentDetector.Neutral | ||
|
||
/** Aggregated sentiment for one event; averages are in [0, 1], -1 when absent. */
case class Sentiment(
  pos_avg: Float,
  neg_avg: Float)

/** Mention counts per detected gender; -1 when the gender was not detected. */
case class Gender(
  male_mentions: Int,
  female_mentions: Int)

/** A single named entity detected in the event body, with its mention count. */
case class Entities(
  name: String,
  externalsource: String,
  externalrefid: String,
  count: Float)

/** Computed analytics attached to an event row. */
case class Features(
  mentions: Int,
  sentiment: Sentiment,
  gender: Gender,
  entities: Set[Entities])

/** One row of the Cassandra events table; field names match the column names. */
case class Event(
  pipeline: String,
  externalid: String,
  computedfeatures: Features,
  detectedkeywords: Set[String],
  detectedplaceids: Set[String],
  event_time: Long,
  eventlangcode: String,
  id: UUID,
  insertion_time: Long,
  messagebody: String,
  sourceid: String,
  sourceurl: String,
  title: String)
|
||
object CassandraSchema {
  /**
   * Maps an analyzed pipeline item onto the Cassandra `Event` row shape.
   *
   * @param item the fully analyzed item produced by the pipeline
   * @return an `Event` ready to be persisted via the Cassandra connector
   */
  def apply(item: AnalyzedItem): Event = {
    val analysis = item.analysis
    Event(
      pipeline = item.publisher,
      externalid = "", // todo
      computedfeatures = getFeature(item),
      detectedkeywords = analysis.keywords.map(_.name).toSet,
      detectedplaceids = analysis.locations.map(_.wofId).toSet,
      event_time = item.createdAtEpoch,
      eventlangcode = analysis.language.orNull,
      id = item.id,
      insertion_time = now.getEpochSecond,
      messagebody = item.body,
      sourceid = "", // todo
      sourceurl = item.sourceUrl,
      title = item.title)
  }

  /** Builds the aggregate `Features` (sentiment, gender, entities) for an item. */
  private def getFeature(item: AnalyzedItem): Features = {
    // Number of times each distinct name occurs in the list.
    def occurrences(names: Seq[String]): Map[String, Int] =
      names.groupBy(identity).map { case (name, hits) => name -> hits.size }

    // Mean of the sentiments rescaled onto [0, 1]; -1 marks "no data".
    def scaledAverage(sentiments: List[Double]): Float =
      if (sentiments.isEmpty) -1 else mean(rescale(sentiments, 0, 1)).toFloat

    val genderCounts = occurrences(item.analysis.genders.map(_.name))
    val entityCounts = occurrences(item.analysis.entities.map(_.name))

    Features(
      mentions = -1,
      sentiment = Sentiment(
        // Strictly above/below Neutral: neutral sentiments count toward neither side.
        pos_avg = scaledAverage(item.analysis.sentiments.filter(_ > Neutral)),
        neg_avg = scaledAverage(item.analysis.sentiments.filter(_ < Neutral))),
      gender = Gender(
        male_mentions = genderCounts.getOrElse(Male, -1),
        female_mentions = genderCounts.getOrElse(Female, -1)),
      entities = entityCounts.map { case (name, hits) =>
        Entities(
          name = name,
          count = hits,
          externalsource = "", // todo
          externalrefid = "" // todo
        )
      }.toSet)
  }
}
|
||
object Utils {
  /**
   * Arithmetic mean of `items`.
   *
   * Returns 0.0 for an empty list instead of the NaN that 0.0/0 would yield,
   * so downstream float columns never receive NaN.
   */
  def mean(items: List[Double]): Double =
    if (items.isEmpty) 0.0 else items.sum / items.length

  /**
   * Linearly rescales `items` from their observed range onto [min_new, max_new].
   *
   * Edge cases: an empty list yields an empty list (the original would throw on
   * `items.min`), and a list whose values are all equal maps every value to the
   * midpoint of the target range (the original would divide by zero).
   *
   * @see https://stats.stackexchange.com/a/25897
   */
  def rescale(items: List[Double], min_new: Double, max_new: Double): List[Double] = {
    if (items.isEmpty) {
      Nil
    } else {
      val min_old = items.min
      val max_old = items.max
      if (min_old == max_old) {
        // Degenerate range: no spread to rescale, so pin everything to the middle.
        items.map(_ => (min_new + max_new) / 2)
      } else {
        val coef = (max_new - min_new) / (max_old - min_old)
        items.map(v => coef * (v - max_old) + max_new)
      }
    }
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.sinks.cassandra | ||
|
||
import com.microsoft.partnercatalyst.fortis.spark.dto.AnalyzedItem | ||
import org.apache.spark.streaming.dstream.DStream | ||
import com.datastax.spark.connector.streaming._ | ||
|
||
object CassandraSink {
  /**
   * Persists each item of the stream (when one is present) to the given
   * Cassandra keyspace/table, mapping items to `Event` rows via CassandraSchema.
   *
   * A `None` stream is a no-op. Uses `Option.foreach` instead of the
   * `isDefined`/`get` pair, which is the idiomatic (and `.get`-free) form.
   *
   * @param dstream      the optional stream of analyzed items to persist
   * @param keyspaceName target Cassandra keyspace
   * @param tableName    target Cassandra table
   */
  def apply(dstream: Option[DStream[AnalyzedItem]], keyspaceName: String, tableName: String): Unit = {
    dstream.foreach { stream =>
      stream.map(CassandraSchema(_)).saveToCassandra(keyspaceName, tableName)
    }
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.transforms.gender | ||
|
||
object GenderDetector {
  /** Gender labels emitted by the detector. */
  // Single-letter codes are persisted downstream (e.g. in Cassandra rows);
  // NOTE(review): keep these stable unless the stored schema changes too.
  final val Male = "M"
  final val Female = "F"
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be called via the uuid() function in cassandra.