# Consumer 2

## Set Up

In [39]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session. The Kafka SQL connector and the
# MySQL JDBC driver are requested as extra jar packages.
spark = SparkSession.builder \
    .master('local') \
    .appName('json-changes-event-consumer') \
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,mysql:mysql-connector-java:8.0.11",
    ) \
    .getOrCreate()

# SparkContext that belongs to the session above.
sc = spark.sparkContext

-------------------------------------------
Batch: 9491
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"CuSn1JFQTEt2cK_C...|I went to New Cit...|
+--------------------+--------------------+



In [40]:
# Streaming DataFrame that reads records from the Kafka topic.
df = (
    spark.readStream.format("kafka")
    # "latest" consumes only records that arrive after the query starts;
    # switch to "earliest" to replay the topic from the beginning.
    .option("startingOffsets", "latest")
    # Topic name; it has to match the one the producer writes to.
    .option("subscribe", "ML_Topic")
    # Address of the Kafka broker.
    .option("kafka.bootstrap.servers", "185.185.126.143:9092")
    .load()
)

In [41]:
from pyspark.sql.types import StringType

# The Kafka source exposes key and value as binary columns; replace both
# with their string representation.
df1 = (
    df
    .withColumn("key", df["key"].cast("string"))
    .withColumn("value", df["value"].cast("string"))
)

In [42]:
# The message value is treated as "<review_id>,<text>". The review text itself
# contains many commas, so an unlimited split(value, ',') would cut the text at
# its first comma. Passing limit=2 (available in Spark SQL split() since
# Spark 3.0) splits on the first comma only and keeps the remainder as `text`.
# NOTE(review): the console output shows review_id values starting with a
# double quote, so the producer presumably quotes its fields -- confirm whether
# the quotes should be stripped here.
df2 = df1.selectExpr(
    "split(value, ',', 2)[0] as review_id",
    "split(value, ',', 2)[1] as text",
)

# Clearing checkpoints

Set the Spark configuration `spark.cleaner.referenceTracking.cleanCheckpoints` to `true`,

or

remove the checkpoint directory manually: `rm -rf .ipynb_checkpoints/`

In [43]:
# Sanity check: start a streaming query that echoes every micro-batch of
# the parsed stream to the console.
(
    df2.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

21/07/30 12:16:44 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ffc4085d-d23b-4148-b685-c0c4ff682740. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


<pyspark.sql.streaming.StreamingQuery at 0x7fe7d91d73a0>

-------------------------------------------
Batch: 9492
-------------------------------------------
-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"rRQCXz5JXCAJXZqs...|My budgie was sic...|
+--------------------+--------------------+

+---------+----+
|review_id|text|
+---------+----+
+---------+----+



[Stage 9494:>                                                       (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"s82DoR3FSIWbcmrK...|I have so many wa...|
+--------------------+--------------------+

-------------------------------------------
Batch: 9493
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"s82DoR3FSIWbcmrK...|I have so many wa...|
+--------------------+--------------------+



# Model Components 

In [None]:
pip install tensorflow

-------------------------------------------
Batch: 9496
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"N66pdRhzayxvq-5H...|We took our dog L...|
+--------------------+--------------------+

-------------------------------------------
Batch: 4
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"N66pdRhzayxvq-5H...|We took our dog L...|
+--------------------+--------------------+

-------------------------------------------
Batch: 9497
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"YP7UT1pILhR_QeuW...|Have been twice a...|
+--------------------+--------------------+

----------------------------------------

[Stage 9510:>                                                       (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"0DEp3FSfy3vRwunM...|I've only been he...|
+--------------------+--------------------+

-------------------------------------------
Batch: 9501
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"0DEp3FSfy3vRwunM...|I've only been he...|
+--------------------+--------------------+



In [27]:
import os
import pickle
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf 

In [28]:
# Restore the saved Keras model (architecture, weights and optimizer state)
# from its HDF5 file.
model = tf.keras.models.load_model(
    filepath='NLP_Comments_Classification_20200531.h5',
)

# Print the layer-by-layer architecture of the restored model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           80000     
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                12544     
_________________________________________________________________
dense (Dense)                (None, 24)                1560      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 94,129
Trainable params: 94,129
Non-trainable params: 0
_________________________________________________________________
-------------------------------------------
Batch: 3264
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"TlUwbT3IuHQ6cu9S

[Stage 3265:>                                                       (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 3265
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"GMr7pPENsEEwLTH1...|Via Lago is a gre...|
+--------------------+--------------------+



In [45]:
# Load the text tokenizer and the label tokenizer from their pickle files.
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as pickled_file:
        return pickle.load(pickled_file)


tokenizer = _load_pickle('tokenizer_X_20200531.pickle')
label_tokenizer = _load_pickle('tokenizer_Y_20200531.pickle')

In [46]:
# Returns three fields: the probability, the outcome label and the full message.

def modelevdh(textbody) : 
    """Score one text with the loaded model and classify the score.

    The text is tokenized with the module-level `tokenizer`, padded or
    truncated to 120 tokens (the model's input length) and passed to the
    module-level `model`.

    Parameters
    ----------
    textbody : str
        The text to score.

    Returns
    -------
    tuple
        ``(probability, my_prediction, message)`` where `probability` is the
        array produced by ``model.predict(...).max(axis=1)`` (one entry for
        the single input text), `my_prediction` is one of
        "No Classification", "Accepted", "Reject" or "Neutral", and
        `message` is the label followed by the probability as a percentage.
    """
    phrase = [textbody]
    sequences = tokenizer.texts_to_sequences(phrase)
    paddedx = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
    # `predict_proba` is deprecated on Keras Sequential models and was removed
    # in TensorFlow 2.6; `predict` returns the same values.
    probability = model.predict(paddedx).max(axis=1)

    if probability == 0:
        my_prediction = "No Classification"
    elif probability <= .3:
        my_prediction = "Accepted"
    elif probability >= .7:
        my_prediction = "Reject"
    else:
        my_prediction = "Neutral"

    message = my_prediction + "  --->  Probability of Reject is: " + "{0:.0%}".format(probability[0])
    return (probability, my_prediction, message)

-------------------------------------------
Batch: 9502
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"_r28zHxLg1m_m9mp...|J. GIlbert's has ...|
+--------------------+--------------------+

-------------------------------------------
Batch: 10
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"_r28zHxLg1m_m9mp...|J. GIlbert's has ...|
+--------------------+--------------------+



[Stage 9514:>                                                       (0 + 1) / 1]                                                                                

-------------------------------------------
Batch: 9503
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"_SpjTgW_h48I5CBI...|We were staying a...|
+--------------------+--------------------+

-------------------------------------------
Batch: 11
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"_SpjTgW_h48I5CBI...|We were staying a...|
+--------------------+--------------------+



In [47]:
# Single-field variants of modelevdh, meant to be wrapped as Spark UDFs:
# myprobability returns only the probability, myprediction only the label.

def _as_text(textbody):
    """Return `textbody` as str, decoding UTF-8 bytes when necessary."""
    if isinstance(textbody, (bytes, bytearray)):
        return textbody.decode('utf-8')
    return textbody


def myprobability(textbody) : 
    """Return the model probability for `textbody`.

    Accepts str as well as UTF-8 encoded bytes: values of a Spark string
    column reach a Python UDF as str, which has no `.decode` method.
    Returns None for a None input (a null column value) instead of raising.
    The scoring itself is delegated to `modelevdh`, so the thresholds and
    preprocessing live in one place.
    """
    if textbody is None:
        return None
    return modelevdh(_as_text(textbody))[0]


def myprediction(textbody) : 
    """Return the outcome label for `textbody`.

    The label is one of "No Classification", "Accepted", "Reject" or
    "Neutral", as computed by `modelevdh`. Accepts str or UTF-8 bytes and
    returns None for a None input.
    """
    if textbody is None:
        return None
    return modelevdh(_as_text(textbody))[1]

-------------------------------------------
Batch: 12
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"kl0aK2Yp4orCUZ_d...|So I was in town ...|
+--------------------+--------------------+

-------------------------------------------
Batch: 9504
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"kl0aK2Yp4orCUZ_d...|So I was in town ...|
+--------------------+--------------------+

-------------------------------------------
Batch: 13
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"0f4aJAsBmiGGcySt...|It was crazy busy...|
+--------------------+--------------------+

-----------------------------------------

In [48]:
from pyspark.sql.functions import udf
# Wrap the two scoring helpers as Spark UDFs (both declared to return
# strings) and append their results as columns computed from `text`.
# NOTE(review): the recorded run of this cell failed with
# "PicklingError: ... cannot pickle 'weakref' object", presumably because the
# wrapped functions reference the module-level Keras model -- confirm.
myprobability_udf = udf(f=myprobability, returnType=StringType())
myprediction_udf = udf(f=myprediction, returnType=StringType())

df2 = (
    df2
    .withColumn("probability", myprobability_udf("text"))
    .withColumn("prediction", myprediction_udf("text"))
)

Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/usr/local/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 72, in dumps
    cp.dump(obj)
  File "/usr/local/spark/python/pyspark/cloudpickle/cloudpickle_fast.py", line 540, in dump
    return Pickler.dump(self, obj)
TypeError: cannot pickle 'weakref' object


PicklingError: Could not serialize object: TypeError: cannot pickle 'weakref' object

-------------------------------------------
Batch: 9506
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"CuSn1JFQTEt2cK_C...|I went to New Cit...|
+--------------------+--------------------+

-------------------------------------------
Batch: 14
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"CuSn1JFQTEt2cK_C...|I went to New Cit...|
+--------------------+--------------------+

-------------------------------------------
Batch: 9507
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"b7gZ5MZLOP0HZLUj...|Breweries seem to...|
+--------------------+--------------------+

---------------------------------------

-------------------------------------------
Batch: 27
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"u4uT4HPhob3BkBAI...|Love this nail sa...|
+--------------------+--------------------+

-------------------------------------------
Batch: 9519
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"u4uT4HPhob3BkBAI...|Love this nail sa...|
+--------------------+--------------------+

-------------------------------------------
Batch: 28
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"0f4aJAsBmiGGcySt...|It was crazy busy...|
+--------------------+--------------------+

-----------------------------------------

In [None]:
# Sanity check: start a streaming query that prints each micro-batch of
# df2 to the console.
(
    df2.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

## Stream Through ML Model (Consuming from Kafka with the Python Client)

In [None]:
## Is the data continuous?
# 1. Avoid integrating Spark with the ML model.
# 2. How do we stream using the Python API?


# Score every message arriving on the Kafka topic and print the result.
from kafka import KafkaConsumer

consumer = KafkaConsumer('ML_Topic',bootstrap_servers=['185.185.126.143:9092'])

print("Starting ML predictions.")


for message in consumer:
    # Decode the payload once instead of once per use.
    payload = message.value.decode('utf-8')
    # NOTE(review): the whole payload (review id included) is scored, as in
    # the original code -- confirm whether only the text should be passed.
    ModelOutcome = modelevdh(payload)
    # Split on the first comma only: the review text itself contains commas,
    # and a payload without any comma yields an empty text instead of raising
    # IndexError.
    review_id, _, review_text = payload.partition(",")
    print(review_id)
    print(review_text)
    print(ModelOutcome[0])
    print(ModelOutcome[1])
 

In [None]:
import pandas as pd
# Score the streaming Kafka messages and collect the results in a DataFrame.
from kafka import KafkaConsumer

# End the iteration after this many milliseconds without a new message.
# Without a timeout the consumer iterator blocks forever and the DataFrame
# below is never built. Tune the value as needed.
IDLE_TIMEOUT_MS = 10000

consumer = KafkaConsumer(
    'ML_Topic',
    bootstrap_servers=['185.185.126.143:9092'],
    consumer_timeout_ms=IDLE_TIMEOUT_MS,
)

print("Starting ML predictions.")

# One record per message; the DataFrame is built once after the loop rather
# than overwriting a single dict on every iteration.
records = []
for message in consumer:
    payload = message.value.decode('utf-8')
    ModelOutcome = modelevdh(payload)
    # Split on the first comma only so commas inside the text are preserved.
    message_key, _, message_text = payload.partition(",")
    records.append({
        "message_key": message_key,
        "message_text": message_text,
        # modelevdh returns a one-element array; store the plain number.
        "model_score": float(ModelOutcome[0][0]),
        "model_outcome": ModelOutcome[1],
    })

dff = pd.DataFrame(
    records,
    columns=["message_key", "message_text", "model_score", "model_outcome"],
)
print(dff)
     

In [22]:
# Printing a Spark DataFrame shows only its column names and types, not rows.
print(df)

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]
-------------------------------------------
Batch: 155
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"1GqVyJsXMKAIbywa...|I've eaten here a...|
+--------------------+--------------------+

-------------------------------------------
Batch: 156
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"kBudygoaz2CbKLc_...|Love this place! ...|
+--------------------+--------------------+

-------------------------------------------
Batch: 157
-------------------------------------------
+--------------------+--------------------+
|           review_id|                text|
+--------------------+--------------------+
|"

# Alert assessment of scores > 80%

# Save the full results data set to a Parquet file every x minutes