In [26]:
import pandas as pd
import numpy as np
import pyspark as py
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Build (or reuse) the notebook's Spark session.
# 'local[*]' runs Spark in-process using all available local cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Spark NLP')
    .config('spark.driver.memory', '6g')
    .config('spark.executor.memory', '6g')
    # Pulls the Spark NLP package at session start-up.
    .config('spark.jars.packages', 'JohnSnowLabs:spark-nlp:2.5.2')
    # NOTE(review): spark.serializer is not set in this cell; confirm Kryo is
    # actually enabled, otherwise this buffer limit may have no effect.
    .config('spark.kryoserializer.buffer.max', '600M')
    .getOrCreate()
)

sc = spark.sparkContext
# Retained only so any later cell that still references `sqlContext` keeps
# working; new code should go through `spark` directly.
sqlContext = SQLContext(sc)

# Load the semicolon-separated input file with a header row, letting Spark infer
# column types. Uses the built-in CSV reader on the SparkSession instead of the
# legacy 'com.databricks.spark.csv' format name.
df = spark.read.csv('demo.csv', header=True, inferSchema=True, sep=';')

In [27]:
import translate
from importlib import reload
# Re-import the module so edits made to translate.py since the kernel started
# are picked up without restarting.
reload(translate)

# Wrap the raw frame, pointing the helper at the free-text "Comments" column.
# NOTE(review): presumably this translates the comments to English -- confirm
# in translate.py.
translated = translate.Translate(df, text_column="Comments")
comments = translated.get_dataframe()

In [28]:
# Print the first 20 rows without truncating long cell values.
comments.show(n=20, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------+
|Comments                                                                                                                                           |
+---------------------------------------------------------------------------------------------------------------------------------------------------+
|treated with ampiclox                                                                                                                              |
|ketosis                                                                                                                                            |
|not treated                                                                                                                                        |
|abomasum to the left                                                                               

In [29]:
import preprocess

# Feed the frame produced by the translate step into the project's
# preprocessing helper and take its resulting frame for the classifiers below.
# NOTE(review): the later outputs show stemmed, lower-cased tokens
# (e.g. "ketosi", "mastiti"), so this step presumably tokenises/stems the
# text -- confirm in preprocess.py.
preprocessed = preprocess.Preprocess(comments)
prep_df = preprocessed.get_dataframe()

In [30]:
import classify

# Hand-built mapping from LDA topic index (0-7) to the category ids assigned
# to comments that fall under that topic.
# NOTE(review): the name suffix presumably encodes the run settings
# (8 topics, 250 passes) -- confirm what "w7" stands for.
# NOTE(review): no random seed is passed below; if the topic model is not
# seeded inside classify.py, topic indices can change between runs and this
# lookup would no longer line up.
lda_category_n8w7p250_lookup = {
    0: [1],
    1: [3, 8],
    2: [9, 12],
    3: [11, 12],
    4: [4, 7],
    5: [13],
    6: [8, 5, 6],
    7: [2, 4, 7],
}

classified = classify.LDAClassification(
    prep_df,
    lda_category_n8w7p250_lookup,
    topics=8,
    passes=250,
    decay=None,
    iterations=None,
)

# NOTE(review): the printed result concatenates the lists of several topics
# without de-duplicating (e.g. "[4, 7, 2, 4, 7]") -- check whether that is
# intended.
lda_result_df = classified.get_dataframe()
lda_result_df.show()

+--------------------+---------------+
|            Comments| classification|
+--------------------+---------------+
|      treat ampiclox|         [4, 7]|
|ketosi ketosi be ...|      [8, 5, 6]|
|               treat|         [4, 7]|
|abomasum leav the...|      [8, 5, 6]|
|heifer calf cattl...|   [1, 8, 5, 6]|
|calcium deposit l...|      [8, 5, 6]|
|coli mastiti bovi...|     [4, 7, 13]|
|tag photo exampl ...|            [1]|
|  keep well antibiot|  [1, 3, 8, 13]|
|la mastiti ubrole...|     [4, 7, 13]|
|mastiti rv ubrole...|     [4, 7, 13]|
|         much better|           [13]|
|dead heifer calf ...|            [1]|
|control metriti m...|[4, 7, 2, 4, 7]|
|treat anoth three...|         [4, 7]|
|temperatur c prop...|      [8, 5, 6]|
|la ubrolexin use ...|           [13]|
|     calv strong alf|            [1]|
|diarrhea problem ...|      [8, 5, 6]|
|correct saskia bl...|      [8, 5, 6]|
+--------------------+---------------+



In [31]:
# Label name -> category ids for the labelled-LDA classifier. There are seven
# labels, matching topics=7 in the call below; "Nothing" maps to an empty list.
llda_lookup = {
    "Calving problems": [1, 2, 3, 8],
    "Uterus problems": [2, 4, 7],
    "Mastitis": [13],
    "Intestine problems": [5, 6, 10],
    "Peritonitis": [11, 12],
    "Pneumonia": [9],
    "Nothing": [],
}

# NOTE: this rebinds `classified`, replacing the LDA classifier object from
# the previous cell.
classified = classify.LLDAClassification(
    prep_df,
    llda_lookup,
    topics=7,
    passes=250,
)

llda_result_df = classified.get_dataframe()
llda_result_df.show()

+--------------------+--------------+
|            Comments|classification|
+--------------------+--------------+
|      treat ampiclox|            []|
|ketosi ketosi be ...|  [1, 2, 3, 8]|
|               treat|            []|
|abomasum leav the...|    [5, 6, 10]|
|heifer calf cattl...|            []|
|calcium deposit l...|  [1, 2, 3, 8]|
|coli mastiti bovi...|          [13]|
|tag photo exampl ...|            []|
|  keep well antibiot|            []|
|la mastiti ubrole...|          [13]|
|mastiti rv ubrole...|          [13]|
|         much better|            []|
|dead heifer calf ...|            []|
|control metriti m...|     [2, 4, 7]|
|treat anoth three...|            []|
|temperatur c prop...|  [1, 2, 3, 8]|
|la ubrolexin use ...|          [13]|
|     calv strong alf|            []|
|diarrhea problem ...|  [1, 2, 3, 8]|
|correct saskia bl...|            []|
+--------------------+--------------+

