This repository has been archived by the owner on Mar 7, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from CatalystCode/more-sentiments
Add sentiment analysis capability for 68 more languages
- Loading branch information
Showing
19 changed files
with
453 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6 changes: 3 additions & 3 deletions
6
...s/spark/transforms/locations/Logger.scala → ...alyst/fortis/spark/logging/Loggable.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
62 changes: 62 additions & 0 deletions
62
...ala/com/microsoft/partnercatalyst/fortis/spark/transforms/locations/PlaceRecognizer.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.transforms.locations | ||
|
||
import java.io.{IOError, IOException} | ||
|
||
import com.microsoft.partnercatalyst.fortis.spark.logging.Loggable | ||
import com.microsoft.partnercatalyst.fortis.spark.transforms.ZipModelsProvider | ||
import com.microsoft.partnercatalyst.fortis.spark.transforms.nlp.OpeNER | ||
import ixa.kaflib.Entity | ||
|
||
import scala.collection.JavaConversions._ | ||
import scala.util.{Failure, Success, Try} | ||
|
||
/**
 * Extracts place names from free text using OpeNER annotation models.
 *
 * Models are obtained through a [[ZipModelsProvider]] that is created lazily on
 * first use (see `createModelsProvider`).
 *
 * @param modelsSource     optional models location handed to the [[ZipModelsProvider]]
 * @param enabledLanguages language codes for which extraction is attempted; text in
 *                         any other language yields an empty result
 */
@SerialVersionUID(100L)
class PlaceRecognizer(
  modelsSource: Option[String] = None,
  enabledLanguages: Set[String] = Set("de", "en", "es", "eu", "it", "nl")
) extends Serializable with Loggable {

  // Created on first access so that constructing a recognizer does not build the provider.
  @volatile private lazy val modelsProvider = createModelsProvider()

  /**
   * Returns the distinct place names mentioned in `text`.
   *
   * @param text     the text to analyze
   * @param language language code of `text`; must be one of `enabledLanguages`
   * @return the surface strings of all entities recognized as places, or an empty set
   *         when the language is not enabled, the models cannot be made available,
   *         or the annotation fails with one of the handled exceptions
   */
  def extractPlaces(text: String, language: String): Set[String] = {
    if (!enabledLanguages.contains(language)) {
      Set.empty[String]
    } else {
      Try(modelsProvider.ensureModelsAreDownloaded(language)) match {
        case Failure(ex) =>
          logError(s"Unable to load models for language $language", ex)
          Set.empty[String]

        case Success(resourcesDirectory) =>
          extractPlacesUsingModels(text, language, resourcesDirectory)
      }
    }
  }

  /**
   * Runs the tokenizer, part-of-speech and named-entity annotators over `text` and
   * collects the entities that are places.
   */
  private def extractPlacesUsingModels(text: String, language: String, resourcesDirectory: String): Set[String] = {
    try {
      // posAnnotate and nerAnnotate enrich the document produced by tokAnnotate in place.
      val kaf = OpeNER.tokAnnotate(resourcesDirectory, text, language)
      OpeNER.posAnnotate(resourcesDirectory, language, kaf)
      OpeNER.nerAnnotate(resourcesDirectory, language, kaf)

      logDebug(s"Analyzed text $text in language $language: $kaf")

      kaf.getEntities.toList.filter(entityIsPlace).map(_.getStr).toSet
    } catch {
      case ex @ (_ : NullPointerException | _ : IOError | _ : IOException) =>
        logError(s"Unable to extract places for language $language", ex)
        Set.empty[String]
    }
  }

  /**
   * Tells whether an entity is tagged as a location or geo-political entity.
   *
   * The comparison is case-insensitive and uses `Locale.ROOT`, so the result does not
   * depend on the JVM default locale (a Turkish default locale would otherwise turn
   * "LOCATION" into "locatıon" and miss the match). A null type is treated as no type.
   */
  private def entityIsPlace(entity: Entity): Boolean = {
    val entityType = Option(entity.getType).getOrElse("").toLowerCase(java.util.Locale.ROOT)
    entityType == "location" || entityType == "gpe"
  }

  /**
   * Builds the provider that supplies the per-language OpeNER model archives.
   * Protected so that subclasses can substitute a different provider.
   */
  protected def createModelsProvider(): ZipModelsProvider = {
    new ZipModelsProvider(
      language => s"https://fortismodels.blob.core.windows.net/public/opener-$language.zip",
      modelsSource)
  }
}
5 changes: 3 additions & 2 deletions
5
...osoft/partnercatalyst/fortis/spark/transforms/locations/client/FeatureServiceClient.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
...ark/transforms/locations/nlp/OpeNER.scala → .../fortis/spark/transforms/nlp/OpeNER.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
9 changes: 9 additions & 0 deletions
9
src/main/scala/com/microsoft/partnercatalyst/fortis/spark/transforms/nlp/Tokenizer.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.transforms.nlp | ||
|
||
/**
 * Splits a sentence into tokens at regular-expression word boundaries.
 */
object Tokenizer {
  // `\b` is a zero-width match between a word character and a non-word character.
  @transient private lazy val wordBoundary = """\b""".r

  /**
   * Tokenizes `sentence`.
   *
   * Because the split points are zero-width, nothing is discarded: the runs of
   * non-word characters (spaces, punctuation) between words are returned as
   * tokens alongside the words themselves.
   *
   * @param sentence the text to split
   * @return the pieces of `sentence` in their original order
   */
  def apply(sentence: String): Seq[String] = {
    val pieces = wordBoundary.split(sentence)
    pieces.toSeq
  }
}
49 changes: 49 additions & 0 deletions
49
...artnercatalyst/fortis/spark/transforms/sentiment/CognitiveServicesSentimentDetector.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.transforms.sentiment | ||
|
||
import net.liftweb.json | ||
|
||
import scalaj.http.Http | ||
|
||
/**
 * Connection settings for the sentiment API.
 *
 * @param key     subscription key sent in the `Ocp-Apim-Subscription-Key` header
 * @param apiHost host name of the API endpoint
 */
case class SentimentDetectorAuth(key: String, apiHost: String = "westus.api.cognitive.microsoft.com")

/**
 * Sentiment detector backed by the text analytics sentiment endpoint on `auth.apiHost`.
 *
 * Each call sends a single document and reads back its score. The three protected
 * steps (build request, call service, parse response) can be overridden individually.
 */
@SerialVersionUID(100L)
class CognitiveServicesSentimentDetector(
  auth: SentimentDetectorAuth
) extends DetectsSentiment {

  /**
   * Scores the sentiment of `text`.
   *
   * @param text     the text to score
   * @param language language code forwarded to the service
   * @return the score reported for the document, or `None` when the service reports
   *         an error for it or returns no matching document
   */
  def detectSentiment(text: String, language: String): Option[Double] = {
    // Only one document is sent per request, so a fixed id is enough to find it again.
    val documentId = "0"
    val apiResponse = callCognitiveServices(buildRequestBody(text, documentId, language))
    parseResponse(apiResponse, documentId)
  }

  /** Posts the JSON `requestBody` to the sentiment endpoint and returns the raw response body. */
  protected def callCognitiveServices(requestBody: String): String = {
    val endpoint = s"https://${auth.apiHost}/text/analytics/v2.0/sentiment"
    val request = Http(endpoint)
      .headers(
        "Content-Type" -> "application/json",
        "Ocp-Apim-Subscription-Key" -> auth.key)
      .postData(requestBody)
    request.asString.body
  }

  /** Renders a single-document sentiment request as compact JSON. */
  protected def buildRequestBody(text: String, textId: String, language: String): String = {
    implicit val formats: json.Formats = json.DefaultFormats
    val document = dto.JsonSentimentDetectionRequestItem(
      id = textId,
      language = language,
      text = text)
    val payload = dto.JsonSentimentDetectionRequest(documents = List(document))
    json.compactRender(json.Extraction.decompose(payload))
  }

  /**
   * Extracts the score of document `textId` from the service's JSON response.
   * An error entry for the document takes precedence over any score.
   */
  protected def parseResponse(apiResponse: String, textId: String): Option[Double] = {
    implicit val formats: json.Formats = json.DefaultFormats
    val parsed = json.parse(apiResponse).extract[dto.JsonSentimentDetectionResponse]
    val documentFailed = parsed.errors.exists(_.id == textId)
    if (documentFailed) None
    else parsed.documents.find(_.id == textId).map(_.score)
  }
}
62 changes: 28 additions & 34 deletions
62
...a/com/microsoft/partnercatalyst/fortis/spark/transforms/sentiment/SentimentDetector.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,49 +1,43 @@ | ||
package com.microsoft.partnercatalyst.fortis.spark.transforms.sentiment | ||
|
||
import net.liftweb.json | ||
import com.microsoft.partnercatalyst.fortis.spark.logging.Loggable | ||
|
||
import scalaj.http.Http | ||
|
||
case class SentimentDetectorAuth(key: String, apiHost: String = "westus.api.cognitive.microsoft.com") | ||
import scala.util.{Failure, Success, Try} | ||
|
||
@SerialVersionUID(100L) | ||
class SentimentDetector( | ||
auth: SentimentDetectorAuth | ||
) extends Serializable { | ||
) extends DetectsSentiment { | ||
|
||
private lazy val detectors = initializeDetectors() | ||
|
||
def detectSentiment(text: String, language: String): Option[Double] = { | ||
val textId = "0" | ||
val requestBody = buildRequestBody(text, textId, language) | ||
val response = callCognitiveServices(requestBody) | ||
parseResponse(response, textId) | ||
detectors.view.map(detector => { | ||
Try(detector.detectSentiment(text, language)) match { | ||
case Success(Some(sentimentScore)) => | ||
logDebug(s"Computed sentiment via ${detector.getClass}") | ||
Some(sentimentScore) | ||
case Success(None) | Failure(_) => | ||
logDebug(s"Unable to compute sentiment via ${detector.getClass}") | ||
None | ||
} | ||
}) | ||
.find(_.isDefined) | ||
.getOrElse(None) | ||
} | ||
|
||
protected def callCognitiveServices(requestBody: String): String = { | ||
Http(s"https://${auth.apiHost}/text/analytics/v2.0/sentiment") | ||
.headers( | ||
"Content-Type" -> "application/json", | ||
"Ocp-Apim-Subscription-Key" -> auth.key) | ||
.postData(requestBody) | ||
.asString | ||
.body | ||
protected def initializeDetectors(): Seq[DetectsSentiment] = { | ||
Seq(new CognitiveServicesSentimentDetector(auth), | ||
new WordListSentimentDetector()) | ||
} | ||
} | ||
|
||
protected def buildRequestBody(text: String, textId: String, language: String): String = { | ||
implicit val formats = json.DefaultFormats | ||
val requestBody = dto.JsonSentimentDetectionRequest(documents = List(dto.JsonSentimentDetectionRequestItem( | ||
id = textId, | ||
language = language, | ||
text = text))) | ||
json.compactRender(json.Extraction.decompose(requestBody)) | ||
} | ||
object SentimentDetector { | ||
val Positive: Double = 1.0 | ||
val Neutral: Double = 0.6 | ||
val Negative: Double = 0.0 | ||
} | ||
|
||
protected def parseResponse(apiResponse: String, textId: String): Option[Double] = { | ||
implicit val formats = json.DefaultFormats | ||
val response = json.parse(apiResponse).extract[dto.JsonSentimentDetectionResponse] | ||
if (response.errors.exists(_.id == textId)) { | ||
None | ||
} else { | ||
response.documents.find(_.id == textId).map(_.score) | ||
} | ||
} | ||
trait DetectsSentiment extends Serializable with Loggable { | ||
def detectSentiment(text: String, language: String): Option[Double] | ||
} |
Oops, something went wrong.