# 1. Install Packages and Import Libraries #

In [1]:
pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install pyspark textblob
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, concat_ws
from pyspark.sql.types import FloatType
from textblob import TextBlob
import pandas as pd
import numpy as np

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Create the notebook's Spark session.
# This is the first getOrCreate() in the notebook, i.e. the call that actually
# starts the driver JVM. Driver memory therefore has to be requested here:
# the later builder cells get this same session back, and a heap size passed
# at that point can no longer be applied to the already-running JVM.
# '4g' mirrors the value requested in the "Initialize Spark Session" cell.
spark = (
    SparkSession.builder
    .appName('Your App')
    .config('spark.driver.memory', '4g')
    .config('spark.default.parallelism', 100)
    .config('spark.sql.shuffle.partitions', 100)
    .getOrCreate()
)


# 2. Initialize Spark Session and Context #

In [4]:
# Obtain the Spark session, requesting 4g for both driver and executors
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Obtain the Spark context, passing a SparkConf that names the application
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("New App"))

# 3. Read Data #

In [5]:
# Load the test and train review CSVs from HDFS with inferred column types.
# NOTE(review): header=True turns the first row of each file into column names;
# the rename cell that follows maps column names that look like review data,
# so confirm whether these files actually carry a header row.
test_data = spark.read.csv(
    path="hdfs://localhost:9000/user/hduser/CustomerReviews/test/test.csv",
    header=True,
    inferSchema=True,
)
train_data = spark.read.csv(
    path="hdfs://localhost:9000/user/hduser/CustomerReviews/train/train.csv",
    header=True,
    inferSchema=True,
)

                                                                                

# 4. Preprocess Data #

In [6]:
# Give the columns meaningful names. The current names are literal review
# values (apparently each file's first row, since the CSVs were read with
# header=True), so they are matched verbatim here.
test_data = (
    test_data
    .withColumnRenamed("1", "rating")
    .withColumnRenamed("mens ultrasheer", "title")
    .withColumnRenamed(
        "This model may be ok for sedentary types, but I'm active and get around alot in my job - consistently found these stockings rolled up down by my ankles! Not Good!! Solution: go with the standard compression stocking, 20-30, stock #114622. Excellent support, stays up and gives me what I need. Both pair of these also tore as I struggled to pull them up all the time. Good riddance/bad investment!",
        "review_text",
    )
)

# In the training file the review body arrives split over two columns.
train_data = (
    train_data
    .withColumnRenamed('3', 'rating')
    .withColumnRenamed('more like funchuck', 'title')
    .withColumnRenamed('"Gave this to my dad for a gag gift after directing ""Nunsense', 'review_part1')
    .withColumnRenamed('"" he got a reall kick out of it!"', 'review_part2')
)

In [7]:
# Join the two review fragments with a single space into one "review_text" column
train_data = train_data.withColumn(
    "review_text",
    concat_ws(" ", train_data["review_part1"], train_data["review_part2"]),
)

# 5. Text Sentiment Analysis UDF #

In [8]:
# Define a UDF to compute sentiment score
def sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: Review text. Spark passes None for null column values.

    Returns:
        The polarity as a Python float, or None when ``text`` is None so the
        UDF yields a null for that row instead of raising inside TextBlob.
    """
    if text is None:
        return None
    # Explicit float() keeps the value compatible with the FloatType declared below.
    return float(TextBlob(text).sentiment.polarity)


# Spark wrapper around sentiment_score; the result column is typed as FloatType.
sentiment_score_udf = udf(sentiment_score, FloatType())

# 6. Add Sentiment Score to DataFrame #

In [9]:
# Attach the UDF-computed polarity of every training review as "sentiment_score"
train_data_with_sentiment = train_data.withColumn(
    "sentiment_score",
    sentiment_score_udf(train_data["review_text"]),
)

# 7. Build ML Pipeline #

In [10]:
# Pipeline stages, in data-flow order:
#   review_text -> tokens -> rawFeatures -> tfidf_features
#   (tfidf_features + sentiment_score) -> final_features -> logistic regression
tokenizer = Tokenizer(
    inputCol="review_text",
    outputCol="tokens",
)
hashingTF = HashingTF(
    inputCol="tokens",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="tfidf_features",
)
assembler = VectorAssembler(
    inputCols=["tfidf_features", "sentiment_score"],
    outputCol="final_features",
)
lr = LogisticRegression(
    featuresCol="final_features",
    labelCol="label",
)

# Chain the stages into a single estimator
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        idf,
        assembler,
        lr,
    ]
)

### Checks

In [11]:
# Print column names, types and nullability of the sentiment-augmented training frame
train_data_with_sentiment.printSchema()

root
 |-- rating: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- review_part1: string (nullable = true)
 |-- review_part2: string (nullable = true)
 |-- review_text: string (nullable = false)
 |-- sentiment_score: float (nullable = true)



In [12]:
from pyspark.sql import SparkSession

# Smoke-test the sentiment UDF on a two-row toy frame
spark = (
    SparkSession.builder
    .appName("Sentiment Analysis")
    .getOrCreate()
)

df = spark.createDataFrame(
    data=[(1, "I love learning"), (2, "I hate bugs")],
    schema=["id", "text"],
)

# Score each toy sentence and display the result
df_with_sentiment = df.withColumn(
    "sentiment",
    sentiment_score_udf(df["text"]),
)
df_with_sentiment.show()

                                                                                

+---+---------------+---------+
| id|           text|sentiment|
+---+---------------+---------+
|  1|I love learning|      0.5|
|  2|    I hate bugs|     -0.8|
+---+---------------+---------+



In [13]:
# The classifier stages are configured with labelCol="label", so expose the rating under that name
train_data = train_data.withColumnRenamed(existing='rating', new='label')

In [14]:
# Sample 0.01% of the data for training (fraction 0.0001, without replacement).
# A fixed seed makes the drawn sample reproducible across kernel restarts.
sample_fraction = 0.0001
sampled_train_data = train_data.sample(
    withReplacement=False,
    fraction=sample_fraction,
    seed=42,
)

# Add sentiment score to the sampled data
sampled_train_data_with_sentiment = sampled_train_data.withColumn(
    "sentiment_score",
    sentiment_score_udf(sampled_train_data['review_text']),
)

In [15]:
# Sanity check: compare row counts before and after sampling
original_row_count = train_data.count()
print("Total records in original DataFrame: ", original_row_count)

sampled_row_count = sampled_train_data.count()
print("Total records in sampled DataFrame: ", sampled_row_count)

                                                                                

Total records in original DataFrame:  2999999




Total records in sampled DataFrame:  329




# Fit the model #

In [16]:
from pyspark.sql import SparkSession

# Request a session with 6g driver / 4g executor memory.
# NOTE(review): earlier cells already called getOrCreate(), so a session exists
# by the time this runs — confirm these memory settings actually take effect.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.driver.memory", "6g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [17]:
# Mark `df` (the two-row id/text toy frame) for caching.
# NOTE(review): the training frames are not cached here — confirm `df` is the intended target.
df.cache()

DataFrame[id: bigint, text: string]

In [18]:
# Re-obtain the session under the application name "App"
spark = (
    SparkSession.builder
    .appName("App")
    .getOrCreate()
)

In [19]:
# Set spark.sql.shuffle.partitions to 50 on the current session (the first session cell requested 100)
spark.conf.set("spark.sql.shuffle.partitions", 50)

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Re-obtain the session under the application name "Sentiment Analysis Optimized"
spark = (
    SparkSession.builder
    .appName("Sentiment Analysis Optimized")
    .getOrCreate()
)

In [21]:
# Keyword-based sentiment built from native column expressions (no Python UDF):
#   review_text LIKE '%good%' -> 1, else LIKE '%bad%' -> -1, else 0
keyword_sentiment = (
    F.when(F.col("review_text").like("%good%"), 1)
    .when(F.col("review_text").like("%bad%"), -1)
    .otherwise(0)
)
train_data_with_sentiment = train_data.withColumn("sentiment_score", keyword_sentiment)


In [22]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Stage definitions using "words"/"features" as the intermediate column names:
#   review_text -> words -> rawFeatures -> features
#   (features + sentiment_score) -> final_features -> logistic regression
tokenizer = Tokenizer(
    inputCol="review_text",
    outputCol="words",
)
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
assembler = VectorAssembler(
    inputCols=["features", "sentiment_score"],
    outputCol="final_features",
)
lr = LogisticRegression(
    featuresCol="final_features",
    labelCol="label",
)

pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        idf,
        assembler,
        lr,
    ]
)


In [23]:
import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    """Single-layer RNN followed by a linear read-out of the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project its final time step.

        Args:
            x: Tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Derive the zero initial state from x so it shares x's dtype and
        # device. A bare torch.zeros(...) uses the global defaults and stops
        # matching once the module or its input is moved to another device or
        # precision.
        h0 = x.new_zeros(1, x.size(0), self.hidden_size)
        out, _ = self.rnn(x, h0)
        # Keep only the last time step of every sequence for the read-out.
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters: features per time step, hidden-state width, outputs per sequence
input_size, hidden_size, output_size = 5, 64, 1

# Build the RNN from the hyperparameters above
model = SimpleRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)


### Text Preprocessing and Feature Extraction ###

In [24]:
# Text feature stages: review_text -> tokens -> rawFeatures -> tfidf_features
# NOTE(review): HashingTF is given no numFeatures; the HashingTF output shown
# elsewhere in this notebook is 262144-dimensional, and the pipeline fit later
# fails with a Java heap-space error — consider whether a smaller feature
# dimension is acceptable.
tokenizer = Tokenizer(
    inputCol="review_text",
    outputCol="tokens",
)
hashingTF = HashingTF(
    inputCol="tokens",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="tfidf_features",
)

### Vector Assembly

In [25]:
# Combine the TF-IDF vector and the scalar sentiment score into one feature vector
assembler = VectorAssembler(
    inputCols=["tfidf_features", "sentiment_score"],
    outputCol="final_features",
)

### Model Definition

In [26]:
# Logistic regression reading the assembled vector and the "label" column
lr = LogisticRegression(
    featuresCol="final_features",
    labelCol="label",
)

### Pipeline Creation

In [27]:
# Assemble the estimator from the stage objects currently bound to these names
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        idf,
        assembler,
        lr,
    ]
)

In [28]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row

# Sample data
# Kept under its own name: binding this list to `train_data` would replace the
# review DataFrame that the earlier cells build and sample from.
toy_rows = [Row(label=1, text="foo"), Row(label=0, text="bar")]
train_df = spark.createDataFrame(toy_rows)

# Define stages
# NOTE(review): tokenizer, hashingTF and lr are rebound here as well; the
# `pipeline` object created earlier keeps the stage instances it was built
# with, and the following cell relies on these toy versions of `lr` and
# `hashed_data`.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Instead of creating a pipeline with all stages, run each stage individually
tokenized_data = tokenizer.transform(train_df)
hashed_data = hashingTF.transform(tokenized_data)

# Check resources and data at each stage
print("After Tokenizer:")
tokenized_data.show()

print("After HashingTF:")
hashed_data.show()

# Finally, fit the model
lr_model = lr.fit(hashed_data)


After Tokenizer:
+-----+----+-----+
|label|text|words|
+-----+----+-----+
|    1| foo|[foo]|
|    0| bar|[bar]|
+-----+----+-----+

After HashingTF:
+-----+----+-----+--------------------+
|label|text|words|            features|
+-----+----+-----+--------------------+
|    1| foo|[foo]|(262144,[215198],...|
|    0| bar|[bar]|(262144,[111892],...|
+-----+----+-----+--------------------+



2023-09-22 14:01:29,110 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2023-09-22 14:01:29,110 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [29]:
from pyspark.storagelevel import StorageLevel

# Persist intermediate DataFrames to disk
# (both toy frames are marked with the DISK_ONLY storage level)
tokenized_data.persist(StorageLevel.DISK_ONLY)
hashed_data.persist(StorageLevel.DISK_ONLY)

# Unpersist the previous DataFrames to free up memory
# NOTE(review): no persist()/cache() call on train_df is visible in this
# notebook, and tokenized_data is unpersisted here with no action between its
# persist() above and this line — confirm these two calls are needed.
train_df.unpersist()
tokenized_data.unpersist()

# Run the next stage
# (refits the toy logistic regression on the persisted hashed frame)
lr_model = lr.fit(hashed_data)

# Unpersist the last DataFrame to free up memory
# As the cell's last expression, the returned DataFrame is what the cell displays.
hashed_data.unpersist()


DataFrame[label: bigint, text: string, words: array<string>, features: vector]

### Train the Model

In [30]:
# Fit the pipeline (tokenizer -> HashingTF -> IDF -> assembler -> logistic regression)
# on the sampled, sentiment-scored training frame.
# NOTE(review): this rebinds `model`, which previously held the SimpleRNN instance.
model = pipeline.fit(sampled_train_data_with_sentiment)

2023-09-22 14:01:42,994 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-22 14:02:01,304 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-22 14:02:03,647 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 20.2 MiB
2023-09-22 14:02:25,155 ERROR executor.Executor: Exception in task 1.0 in stage 63.0 (TID 248)
java.lang.OutOfMemoryError: GC overhead limit exceeded
	at java.lang.AbstractStringBuilder.<init>(AbstractStringBuilder.java:68)
	at java.lang.StringBuilder.<init>(StringBuilder.java:106)
	at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3561)
	at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:3377)
	at java.io.ObjectInputStream.readString(ObjectInputStream.java:2049)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1651)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.i

2023-09-22 14:02:25,347 WARN storage.BlockManager: Putting block rdd_227_3 failed due to exception org.apache.spark.TaskKilledException.
2023-09-22 14:02:25,356 WARN storage.BlockManager: Block rdd_227_2 could not be removed as it was not found on disk or in memory
2023-09-22 14:02:25,350 WARN storage.BlockManager: Putting block rdd_227_5 failed due to exception org.apache.spark.TaskKilledException.
2023-09-22 14:02:25,920 WARN storage.BlockManager: Block rdd_227_5 could not be removed as it was not found on disk or in memory
2023-09-22 14:02:25,925 WARN storage.BlockManager: Block rdd_227_3 could not be removed as it was not found on disk or in memory
2023-09-22 14:02:26,003 WARN storage.BlockManager: Putting block rdd_227_0 failed due to exception org.apache.spark.TaskKilledException.
2023-09-22 14:02:26,005 WARN storage.BlockManager: Block rdd_227_0 could not be removed as it was not found on disk or in memory
2023-09-22 14:02:26,012 ERROR util.Instrumentation: org.apache.spark.Spar

2023-09-22 14:02:26,120 WARN scheduler.TaskSetManager: Lost task 2.0 in stage 63.0 (TID 249) (fabio-poli-vm executor driver): TaskKilled (Stage cancelled)
2023-09-22 14:02:26,154 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 63.0 (TID 247) (fabio-poli-vm executor driver): TaskKilled (Stage cancelled)
2023-09-22 14:02:26,214 WARN scheduler.TaskSetManager: Lost task 5.0 in stage 63.0 (TID 252) (fabio-poli-vm executor driver): TaskKilled (Stage cancelled)
2023-09-22 14:02:26,214 WARN scheduler.TaskSetManager: Lost task 3.0 in stage 63.0 (TID 250) (fabio-poli-vm executor driver): TaskKilled (Stage cancelled)
2023-09-22 14:02:26,893 WARN storage.BlockManager: Putting block rdd_227_7 failed due to exception org.apache.spark.TaskKilledException.
2023-09-22 14:02:26,904 WARN storage.BlockManager: Block rdd_227_7 could not be removed as it was not found on disk or in memory
2023-09-22 14:02:26,908 WARN scheduler.TaskSetManager: Lost task 7.0 in stage 63.0 (TID 254) (fabio-poli-vm execut

Py4JJavaError: An error occurred while calling o206.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 63.0 failed 1 times, most recent failure: Lost task 4.0 in stage 63.0 (TID 251) (fabio-poli-vm executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.io.ObjectInputStream$HandleTable$HandleList.add(ObjectInputStream.java:4018)
	at java.io.ObjectInputStream$HandleTable.markDependency(ObjectInputStream.java:3837)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2433)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2119)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1657)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2450)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2399)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2398)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2398)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1156)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1156)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1156)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2638)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2580)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2569)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2224)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2319)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:61)
	at org.apache.spark.ml.optim.loss.RDDLossFunction.calculate(RDDLossFunction.scala:47)
	at breeze.optimize.CachedDiffFunction.calculate(CachedDiffFunction.scala:24)
	at breeze.optimize.FirstOrderMinimizer.calculateObjective(FirstOrderMinimizer.scala:50)
	at breeze.optimize.FirstOrderMinimizer.initialState(FirstOrderMinimizer.scala:44)
	at breeze.optimize.FirstOrderMinimizer.iterations(FirstOrderMinimizer.scala:96)
	at org.apache.spark.ml.classification.LogisticRegression.trainImpl(LogisticRegression.scala:999)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:628)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:495)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:286)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.io.ObjectInputStream$HandleTable$HandleList.add(ObjectInputStream.java:4018)
	at java.io.ObjectInputStream$HandleTable.markDependency(ObjectInputStream.java:3837)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2433)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:2119)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1657)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2431)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2355)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2213)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1669)


### Model Evaluation

In [None]:
# The pipeline's VectorAssembler reads a "sentiment_score" column, which
# test_data (rating / title / review_text) does not have, so score the test
# reviews with the same UDF that was applied to the training sample.
# NOTE(review): rows with a null review_text are not filtered here — confirm
# the test file contains none, or drop/fill them before scoring.
test_data_with_sentiment = test_data.withColumn(
    "sentiment_score",
    sentiment_score_udf(test_data["review_text"]),
)
predictions = model.transform(test_data_with_sentiment)

### Evaluation Metrics

### Overfitting check

In [None]:
# Plot training vs. validation loss from the `history.history` mapping.
# NOTE(review): neither `history` nor `plt` is defined or imported in the cells
# visible here (the model above is a Spark pipeline fit, which does not bind a
# `history` name) — confirm where these are meant to come from.
loss = history.history['loss']
val_loss = history.history['val_loss']

# Blue: training curve; orange: validation curve
plt.plot(loss , 'b' , label = 'train loss')
plt.plot(val_loss , 'orange' , label = 'validation loss')

plt.legend()
plt.show()

In [None]:
# Plot training vs. validation accuracy from the `history.history` mapping.
# NOTE(review): neither `history` nor `plt` is defined or imported in the cells
# visible here — confirm where these are meant to come from.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Blue: training curve; orange: validation curve
plt.plot(acc , 'b' , label = 'train accuracy')
plt.plot(val_acc , 'orange' , label = 'validation accuracy')

plt.legend()
plt.show()