# Consumer 2

## Set Up

In [47]:
from pyspark.sql import SparkSession


# Spark session for this consumer, plus a handle on its SparkContext
spark = (
    SparkSession.builder
    # Maven coordinates of the Kafka source for Structured Streaming; Spark adds
    # the listed packages to the classpath when the session is created
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .appName('json-changes-event-consumer')  # application name for this session
    .master('local')  # run Spark locally instead of on a cluster
    .getOrCreate()
)
sc = spark.sparkContext

In [48]:
# Streaming DataFrame fed by the Kafka topic
df = (
    spark.readStream.format("kafka")
    # topic to read; must match the topic the producer writes to
    .option("subscribe", "ML_Topic")
    # "latest" = only messages that arrive after the query starts;
    # use "earliest" to replay the topic from the beginning
    .option("startingOffsets", "latest")
    # Kafka broker address
    .option("kafka.bootstrap.servers", "185.185.126.143:9092")
    .load()
)

In [49]:
from pyspark.sql.types import StringType

# Cast the key and value columns to strings so the payload can be parsed as text
df1 = df.withColumn("key", df.key.cast(StringType())) \
        .withColumn("value", df.value.cast(StringType()))

In [50]:
# The message value is comma separated, but the review text itself contains many
# commas. Splitting with a limit of 2 cuts only at the FIRST comma, so review_id is
# the part before it and text keeps everything after it (an unlimited split would
# truncate the text at its first comma).
# NOTE(review): assumes the payload is "<review_id>,<text>" with no further fields
# after the text - confirm against the producer.
df2 = df1.selectExpr(
    "split(value, ',', 2)[0] as review_id",
    "split(value, ',', 2)[1] as text",
)

-------------------------------------------
Batch: 34
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"JVxmVxL9SFRsqyuN...|Ordered through Y...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2344
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"JVxmVxL9SFRsqyuN...|Ordered through Y...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2351
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"JVxmVxL9SFRsqyuN...|Ordered through Y...|
+--------------------+--------------------+



# Clearing checkpoints
Set the Spark configuration property `spark.cleaner.referenceTracking.cleanCheckpoints` to `true`

or

delete the notebook checkpoint directory: `rm -rf .ipynb_checkpoints/`

In [40]:
# Check the data flow by printing each micro-batch of df2 to the console.
# The query is given a name and its handle is kept so it can be stopped later
# (console_query.stop()). Any earlier run of this cell is stopped first, so
# re-running the cell does not leave several console queries printing the same rows.
for active_query in spark.streams.active:
    if active_query.name == "review_console_preview":
        active_query.stop()

console_query = (
    df2.writeStream
    .queryName("review_console_preview")
    .outputMode("append")
    .format("console")
    .start()
)
console_query

-------------------------------------------
Batch: 2310
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"m4shKWOnjyzXBfu0...|This is my go to ...|
+--------------------+--------------------+



21/07/27 23:01:15 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-0c2d1692-5f4e-4a06-a2f7-889b6ebcc6f5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


<pyspark.sql.streaming.StreamingQuery at 0x7f0cd5450640>

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+----+
|review_id|text|
+---------+----+
+---------+----+



# Model Components 

#pip install tensorflow

In [41]:
import os
import pickle
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf 

In [42]:
# Restore the saved Keras model (architecture, weights and optimizer state) from its HDF5 file
model = tf.keras.models.load_model(filepath='NLP_Comments_Classification_20200531.h5')

# Print the layer-by-layer architecture
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           80000     
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                12544     
_________________________________________________________________
dense (Dense)                (None, 24)                1560      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 94,129
Trainable params: 94,129
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Load the pickled text tokenizer and label tokenizer.
# CAUTION: pickle.load can execute arbitrary code - only load pickle files from a trusted source.
with open('tokenizer_X_20200531.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)  # tokenizer used to turn review text into sequences

with open('tokenizer_Y_20200531.pickle', 'rb') as handle:
    label_tokenizer = pickle.load(handle)  # tokenizer for the labels

In [44]:
def modelevdh(textbody):
    """Score one piece of text with the loaded model and label the result.

    The text is tokenized with the module-level ``tokenizer``, padded/truncated
    to 120 tokens (the input length shown in the model summary) and passed to
    the module-level ``model``.

    Parameters
    ----------
    textbody : str
        The message text to classify.

    Returns
    -------
    tuple
        A three-element tuple:
        ``probability`` - one-element array holding the model's score,
        ``my_prediction`` - "No Classification" (score == 0), "Accepted"
        (score <= 0.3), "Reject" (score >= 0.7) or "Neutral" (otherwise),
        and a message combining the label with the score as a percentage.
    """
    phrase = [textbody]
    x = tokenizer.texts_to_sequences(phrase)
    paddedx = pad_sequences(x, maxlen=120, padding='post', truncating='post')

    # Sequential.predict_proba is deprecated and gone from newer TensorFlow
    # releases; predict() returns the same output-layer scores. verbose=0 keeps
    # the per-message call silent, as predict_proba was.
    probability = model.predict(paddedx, verbose=0).max(axis=1)

    # Compare the scalar score instead of relying on the truthiness of a one-element array
    score = probability[0]
    if score == 0:
        my_prediction = "No Classification"
    elif score <= .3:
        my_prediction = "Accepted"
    elif score >= .7:
        my_prediction = "Reject"
    else:
        my_prediction = "Neutral"

    return (
        probability,
        my_prediction,
        my_prediction + "  --->  Probability of Reject is: " + "{0:.0%}".format(score),
    )

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"rRQCXz5JXCAJXZqs...|My budgie was sic...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2318
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"rRQCXz5JXCAJXZqs...|My budgie was sic...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2311
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"rRQCXz5JXCAJXZqs...|My budgie was sic...|
+--------------------+--------------------+

----------------------------------------

## Stream Through ML Model 

In [None]:
# Score every message arriving on the Kafka topic with the ML model
from kafka import KafkaConsumer

consumer = KafkaConsumer('ML_Topic',bootstrap_servers=['185.185.126.143:9092'])

print("Starting ML predictions.")

try:
    # The loop keeps waiting for new messages, so it normally ends only when the
    # cell is interrupted or an error occurs.
    for message in consumer:
        ModelOutcome = modelevdh(message.value.decode('utf-8'))
        print(ModelOutcome[0])  # score returned by the model
        print(ModelOutcome[1])  # label derived from the score
finally:
    # Always release the broker connection, even on interrupt or error
    consumer.close()
 

Starting ML predictions.
-------------------------------------------
Batch: 20
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"22A9KhRpqenhDe3w...|We are driving ac...|
+--------------------+--------------------+

-------------------------------------------
Batch: 26
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"22A9KhRpqenhDe3w...|We are driving ac...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2415
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"22A9KhRpqenhDe3w...|We are driving ac...|
+--------------------+--------------------+

----------------

[Stage 4991:>                                                       (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 102
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"tyQh1sE_hE_w_lS-...|This is my favori...|
+--------------------+--------------------+

81
[0.0024671]
Accepted
-------------------------------------------
Batch: 2420
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"uSQC_451BxmBoEDn...|Very bad customer...|
+--------------------+--------------------+

-------------------------------------------
Batch: 31
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"uSQC_451BxmBoEDn...|Very bad customer...|
+--------------------+--------------------+

----------------

[Stage 5003:> (0 + 1) / 1][Stage 5004:> (0 + 0) / 1][Stage 5005:> (0 + 0) / 1]                                                                                

-------------------------------------------
Batch: 27
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"foa4cvp0SBT6PcOn...|I stayed here las...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2415
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"foa4cvp0SBT6PcOn...|I stayed here las...|
+--------------------+--------------------+

-------------------------------------------
Batch: 105
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"foa4cvp0SBT6PcOn...|I stayed here las...|
+--------------------+--------------------+

----------------------------------------

[Stage 5010:>               (0 + 1) / 1][Stage 5011:>               (0 + 0) / 1]                                                                                

-------------------------------------------
Batch: 2416
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"JlWpB49ONTbfEbbK...|New York Pizza Ex...|
+--------------------+--------------------+

-------------------------------------------
Batch: 2423
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"JlWpB49ONTbfEbbK...|New York Pizza Ex...|
+--------------------+--------------------+

100
[0.7649535]
Reject
-------------------------------------------
Batch: 35
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"qgd5H-gy-mUJAAp1...|Not the Fuddrucke...|
+--------------------+--------------------+

----------------

[Stage 5026:>                                                       (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 109
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"8NBZghVBF5i_sFbe...|I've visited this...|
+--------------------+--------------------+



# Save a results dataset to Parquet every x minutes