In [2]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report


AttributeError: module 'inspect' has no attribute 'ArgSpec'

In [None]:
# Start (or reuse) the Spark session that the Spark-based cells rely on
appName = "Sentiment Analysis with RNN in Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    .getOrCreate()
)


In [None]:
# Read the tweets CSV into a Spark DataFrame with an automatically inferred schema.
# The path must be a raw string: in a regular literal "\U" starts a unicode
# escape (a SyntaxError here) and "\t" would silently become a tab character.
tweets_csv = spark.read.csv(
    r'C:\Users\DELL\Downloads\sparkproject\sparkproject\project\dataset\tweets.csv',
    inferSchema=True,
    header=True,
)

# Keep only the tweet text and its sentiment; the sentiment is cast to an
# integer and exposed under the column name "label".
data = tweets_csv.select(
    "SentimentText",
    col("Sentiment").cast("Int").alias("label"),
)


In [None]:
# Divide data: 70% for training, 30% for testing.
# A fixed seed makes the split reproducible across kernel restarts (42 mirrors
# the random_state used by the pandas train/test split in this notebook).
dividedData = data.randomSplit([0.7, 0.3], seed=42)

# Each split is collected several times further down (texts and labels are
# fetched by separate actions), so cache both halves instead of recomputing
# the split for every action.
trainingData = dividedData[0].cache()
testingData = dividedData[1].cache()

In [None]:
# Bring the training tweets onto the driver as a plain Python list
texts = [
    row["SentimentText"]
    for row in trainingData.select("SentimentText").collect()
]

# Fit a Keras tokenizer on the training tweets only
keras_tokenizer = KerasTokenizer()
keras_tokenizer.fit_on_texts(texts)

# Vocabulary size: one more than the number of entries in the fitted word index
vocab_size = 1 + len(keras_tokenizer.word_index)

# Encode the training and testing tweets as sequences of word indices
train_sequences = keras_tokenizer.texts_to_sequences(texts)
test_texts = [
    row["SentimentText"]
    for row in testingData.select("SentimentText").collect()
]
test_sequences = keras_tokenizer.texts_to_sequences(test_texts)


In [None]:
# Length of the longest tokenized training tweet; both splits are padded to it
maxlen = max(map(len, train_sequences))
train_data, test_data = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_sequences, test_sequences)
)

# Hyperparameters of the RNN model
embedding_dim, rnn_units = 50, 64

In [None]:
# Embedding -> SimpleRNN -> single sigmoid unit
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    SimpleRNN(units=rnn_units),
    Dense(1, activation='sigmoid'),
])
model.summary()

# Binary cross-entropy pairs with the single sigmoid output above
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Collect the training labels to the driver, then fit the model;
# 20% of the training rows are held out by Keras for validation.
train_labels = np.array(trainingData.select("label").collect())
history = model.fit(
    train_data,
    train_labels,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

In [7]:
# Stand-alone pandas/Keras LSTM experiment on the same tweets CSV.
#
# The names defined here are deliberately distinct from the Spark RNN pipeline
# (`tweets_csv`, `data`, `model`, `history`, pyspark's `Tokenizer`): the cells
# after this one save, evaluate and plot `model`/`history` together with
# `test_data` and `keras_tokenizer`, so overwriting them here would make those
# cells silently operate on the wrong model.
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer as LstmTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Vocabulary cap shared by the tokenizer and the embedding layer
MAX_WORDS = 5000

# Read the CSV file
tweets_pdf = pd.read_csv(r'C:\Users\DELL\Downloads\sparkproject\sparkproject\project\dataset\tweets.csv')

# Select relevant columns and convert sentiment to integers.
# `.assign` builds a new frame, avoiding assignment into a slice of
# `tweets_pdf` (which triggers pandas' SettingWithCopyWarning).
lstm_data = tweets_pdf[['SentimentText', 'Sentiment']].assign(
    Sentiment=lambda frame: frame['Sentiment'].astype(int)
)

# Tokenization and padding
tokenizer = LstmTokenizer(num_words=MAX_WORDS, split=' ')
tokenizer.fit_on_texts(lstm_data['SentimentText'].values)
X = tokenizer.texts_to_sequences(lstm_data['SentimentText'].values)
X = pad_sequences(X)

# Encode target labels
encoder = LabelEncoder()
Y = encoder.fit_transform(lstm_data['Sentiment'])

# Train-test split (70/30, fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Define the model: Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax
lstm_model = Sequential()
lstm_model.add(Embedding(MAX_WORDS, 128, input_length=X.shape[1]))
lstm_model.add(SpatialDropout1D(0.4))
lstm_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(2, activation='softmax'))

# Sparse categorical cross-entropy takes the integer labels produced by the encoder
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split is also passed as validation data, so the
# accuracy printed below is measured on data already seen during validation.
lstm_history = lstm_model.fit(X_train, Y_train, epochs=5, batch_size=128, validation_data=(X_test, Y_test), verbose=2)

# Evaluate the model; score is [loss, accuracy] given the compile() metrics above
score = lstm_model.evaluate(X_test, Y_test, verbose=0)
print("Test Accuracy:", score[1])

ImportError: cannot import name 'int4' from 'tensorflow.python.framework.dtypes' (C:\Users\DELL\anaconda3\Lib\site-packages\tensorflow\python\framework\dtypes.py)

In [None]:
# Persist the trained model to an HDF5 file in the working directory
model_path = "rnn.h5"
model.save(model_path)

In [None]:

# Score the held-out tweets, then threshold the model outputs at 0.5
# to obtain hard 0/1 class predictions.
predictions_proba = model.predict(test_data)
predictions = np.where(predictions_proba > 0.5, 1, 0)

In [None]:
# Ground-truth labels of the test split, collected to the driver
true_labels = np.array(testingData.select("label").collect())

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 summary
report = classification_report(true_labels, predictions)
print("Classification Report:", report, sep="\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class order shared by the matrix rows/columns and the tick labels
class_ids = [0, 1]
class_names = ['Negative', 'Positive']

# Generate confusion matrix.
# `labels` pins the matrix to 2x2 in a fixed order, so the hard-coded tick
# labels below stay aligned even if one class is missing from the test split
# or from the predictions.
cm = confusion_matrix(true_labels, predictions, labels=class_ids)

# Plot confusion matrix on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()


In [None]:
# Draw one figure per tracked metric (loss first, then accuracy), each
# comparing the training curve with the validation curve across epochs.
for metric, title in (('loss', 'Loss'), ('accuracy', 'Accuracy')):
    plt.plot(history.history[metric], label=f'Training {title}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {title}')
    plt.title(f'Training and Validation {title}')
    plt.xlabel('Epochs')
    plt.ylabel(title)
    plt.legend()
    plt.show()

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Define the input text data
input_text = ["This is a great movie!",
              "I hate you!."]

# Convert text data to sequences using the Keras tokenizer fitted on the training tweets
input_sequences = keras_tokenizer.texts_to_sequences(input_text)

# Pad to the same `maxlen` the training data was padded to. Recomputing
# `maxlen` from these few sentences would feed the model inputs of a different
# length than it was trained on and would overwrite the notebook-wide value.
padded_input_sequences = pad_sequences(input_sequences, maxlen=maxlen)

# Make predictions using the trained model. Separate names keep the test-set
# `predictions` / `predictions_proba` used by the evaluation cells intact.
input_proba = model.predict(padded_input_sequences)
input_predictions = (input_proba > 0.5).astype(int)

# Print the predictions; column 0 is the model's single output unit
for text, label, proba in zip(input_text, input_predictions[:, 0], input_proba[:, 0]):
    sentiment = 'Positive' if label == 1 else 'Negative'
    print(f"Input Text: {text}")
    print(f"Predicted Sentiment: {sentiment} (Probability: {proba:.4f})")
    print()


In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark`, `trainingData` or `testingData` cannot be re-run after this.
spark.stop()