In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import gensim
from fuzzywuzzy import fuzz
import itertools
import matplotlib.pyplot as plt
import time

# NLTK Data
# Fetch the NLTK "stopwords" corpus; stopwords.words("english") is read from
# it further down when the stop-word collection is built.
nltk.download('stopwords')

# Settings
# Pattern whose matches are replaced by a space during text cleaning.
# Written as a raw string: "\S" is a regex class, not a Python string escape,
# and a plain literal triggers an invalid-escape warning on recent Pythons.
# Alternatives, in order: @mentions, http(s) URLs, "htt:"/"http:" followed by
# one more character, and any run of non-alphanumeric characters.
# NOTE(review): the third alternative looks like a typo of the second; it is
# kept unchanged so cleaning results stay identical.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
SEQUENCE_LENGTH = 300  # length every token sequence is padded/truncated to
EPOCHS = 8  # epochs passed to model.fit
BATCH_SIZE = 1024  # batch size for model.fit and model.evaluate
TRAIN_SIZE = 0.8  # fraction of rows kept for training (test gets the rest)
W2V_SIZE = 300  # Word2Vec vector dimensionality (also the embedding width)
W2V_WINDOW = 7  # Word2Vec context window
W2V_EPOCH = 32  # Word2Vec training epochs
W2V_MIN_COUNT = 10  # Word2Vec min_count
# (negative_upper_bound, positive_lower_bound) used by decode_sentiment.
SENTIMENT_THRESHOLDS = (0.4, 0.7)
# Artifact file names; not referenced by the code visible in this cell.
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# Load Dataset
# Column names are supplied explicitly, so every row of the CSV (including the
# first) is read as data.
df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', names=["id", "category", "label", "text"])

# Preprocess Text
# Kept as a set: preprocess() tests every token of every tweet for membership,
# which is O(1) on a set but a linear scan on the list NLTK returns.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    """Normalise one raw tweet into a space-separated string of kept tokens.

    The text is lower-cased, every match of ``TEXT_CLEANING_RE`` is replaced
    by a space, and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (a missing cell) yield an empty string
            instead of the literal tokens "none" / "nan".
        stem: When true, each kept token is reduced with ``stemmer``.

    Returns:
        The kept tokens joined by single spaces; may be "".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

# Clean every tweet, then hold out (1 - TRAIN_SIZE) of the rows for testing.
# The fixed random_state keeps the split reproducible between runs.
df["text"] = df["text"].apply(preprocess)

df_train, df_test = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)

# Word2Vec Model
# One token list per training tweet; only the training split is used.
documents = [_text.split() for _text in df_train.text]

# gensim 4.0 renamed the constructor argument `size` to `vector_size` and
# replaced `wv.vocab` with `wv.key_to_index`. Pick the spelling that matches
# the installed release so this cell runs on both 3.x and 4.x.
_gensim_4_or_newer = int(gensim.__version__.split(".")[0]) >= 4
_size_kwarg = "vector_size" if _gensim_4_or_newer else "size"

w2v_model = gensim.models.Word2Vec(window=W2V_WINDOW,
                                   min_count=W2V_MIN_COUNT,
                                   workers=8,
                                   **{_size_kwarg: W2V_SIZE})
w2v_model.build_vocab(documents)
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

_w2v_vocab = w2v_model.wv.key_to_index if _gensim_4_or_newer else w2v_model.wv.vocab
print("Vocab size", len(_w2v_vocab))
# Similarity queries live on the KeyedVectors object (`wv`); the shortcut on
# the model itself was deprecated in gensim 3.x and removed in 4.0.
print(w2v_model.wv.most_similar("love"))

# Tokenization and Label Encoding
# The word index is fitted on the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)
# One more than the number of indexed words (presumably because index 0 is
# left for padding -- confirm against the Tokenizer documentation).
vocab_size = 1 + len(tokenizer.word_index)

# Turn each tweet into a list of word indices, then pad/truncate every list
# to SEQUENCE_LENGTH.
train_sequences = tokenizer.texts_to_sequences(df_train.text)
x_train = pad_sequences(train_sequences, maxlen=SEQUENCE_LENGTH)
test_sequences = tokenizer.texts_to_sequences(df_test.text)
x_test = pad_sequences(test_sequences, maxlen=SEQUENCE_LENGTH)

labels = df_train.label.unique().tolist()

# Integer-encode the labels; the encoder only sees the training labels.
# NOTE(review): the network defined below ends in a single sigmoid unit
# trained with binary cross-entropy, which assumes these codes are only 0 and
# 1 -- confirm the label column really has exactly two classes.
train_labels = df_train.label.tolist()
encoder = LabelEncoder()
encoder.fit(train_labels)

# Reshape the encoded labels into column vectors of shape (n, 1).
y_train = encoder.transform(train_labels).reshape(-1, 1)
y_test = encoder.transform(df_test.label.tolist()).reshape(-1, 1)

# Embedding Layer
# Row i holds the Word2Vec vector of the word the tokenizer indexed as i.
# Rows for words that have no Word2Vec vector stay all-zero.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]

# The pretrained matrix is passed as the initial weights with trainable=False.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

# Build and Compile the Neural Network Model
# Embedding -> dropout -> LSTM -> one sigmoid output unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Training the Model
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as "val_accuracy" (the same key the history plot reads). Monitoring
# "val_acc" would leave early stopping without a metric to watch.
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

# validation_split=0.1 holds out a tenth of the training data for the
# val_* metrics monitored above.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

# Model Evaluation
# score[0] is reported as the loss and score[1] as the accuracy.
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
test_loss, test_accuracy = score[0], score[1]
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)

# Plot training history
training_log = history.history
acc, val_acc = training_log['accuracy'], training_log['val_accuracy']
loss, val_loss = training_log['loss'], training_log['val_loss']

# One x position per recorded epoch.
epochs = range(len(acc))

# Accuracy curves are drawn on the current figure ...
for curve, colour, legend_label in ((acc, 'b', 'Training acc'),
                                    (val_acc, 'r', 'Validation acc')):
    plt.plot(epochs, curve, colour, label=legend_label)
plt.title('Training and validation accuracy')
plt.legend()

# ... and the loss curves on a new one.
plt.figure()

for curve, colour, legend_label in ((loss, 'b', 'Training loss'),
                                    (val_loss, 'r', 'Validation loss')):
    plt.plot(epochs, curve, colour, label=legend_label)
plt.title('Training and validation loss')
plt.legend()

plt.show()

# Predict and Decode Sentiment
def decode_sentiment(score, include_neutral=True):
    """Translate a model score into a sentiment name.

    Args:
        score: Numeric model output.
        include_neutral: When true, use the two cut-offs in
            ``SENTIMENT_THRESHOLDS`` and allow a "NEUTRAL" band between
            them; when false, split at 0.5 into two classes only.

    Returns:
        "NEGATIVE", "NEUTRAL" or "POSITIVE".
    """
    if not include_neutral:
        # Two-class mode: anything below 0.5 is negative.
        if score < 0.5:
            return "NEGATIVE"
        return "POSITIVE"

    # Three-class mode: both cut-offs are inclusive.
    if score <= SENTIMENT_THRESHOLDS[0]:
        return "NEGATIVE"
    if score >= SENTIMENT_THRESHOLDS[1]:
        return "POSITIVE"
    return "NEUTRAL"

def predict(text, include_neutral=True):
    """Score one piece of raw text with the trained model.

    Args:
        text: Raw input string.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        dict with "label" (decoded sentiment name), "score" (the model
        output as a Python float) and "elapsed_time" (seconds spent in this
        call).
    """
    start_at = time.time()
    # Apply the same cleaning the training tweets went through, so mentions
    # and URLs are not handed to the tokenizer as ordinary words.
    cleaned = preprocess(text)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=SEQUENCE_LENGTH)
    # predict() returns one row per input and the network has a single output
    # unit. Extract that value explicitly: calling float() on the 1-element
    # row itself is deprecated in recent NumPy releases.
    score = float(model.predict(padded)[0][0])
    label = decode_sentiment(score, include_neutral=include_neutral)
    return {"label": label, "score": score, "elapsed_time": time.time() - start_at}

# Smoke-test the predictor on a few hand-written sentences.
for sample_text in ("I love the music",
                    "I hate the rain",
                    "I don't know what I'm doing"):
    print(predict(sample_text))

# Confusion Matrix and Classification Report
# Score the whole test set, then decode every score in two-class mode.
# NOTE(review): the decoded names are "NEGATIVE"/"POSITIVE" while y_test_1d
# holds the raw values of the label column -- confirm both use the same
# spelling, otherwise the report below compares disjoint label sets.
test_scores = model.predict(x_test, verbose=1, batch_size=8000)
y_pred_1d = [decode_sentiment(row, include_neutral=False) for row in test_scores]
y_test_1d = list(df_test["label"])

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix on the current matplotlib figure.

    Args:
        cm: 2-D array of counts. Rows are plotted as "True label" and
            columns as "Predicted label".
        classes: Tick labels, in the same order as the rows/columns of cm.
        title: Plot title.
        cmap: Matplotlib colormap used for the heat map.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # Normalise each row by its total. A class with no true samples has a
    # zero total; leave that row at 0 instead of producing NaN cells.
    cm = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)
    fmt = '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

# By default confusion_matrix orders rows/columns by the sorted labels found
# in y_true and y_pred, whereas Series.unique() returns labels in order of
# first appearance. Build one explicit ordering and use it for both the
# matrix and the tick labels so the axes cannot be mislabelled.
class_names = sorted(set(y_test_1d) | set(y_pred_1d))
cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d, labels=class_names)
plt.figure(figsize=(12, 12))
plot_confusion_matrix(cnf_matrix, classes=class_names, title="Confusion matrix")
plt.show()

print(classification_report(y_test_1d, y_pred_1d))
print("Accuracy Score:", accuracy_score(y_test_1d, y_pred_1d))

# Fuzzy Logic Integration using fuzzywuzzy
def fuzzy_sentiment_analysis(score):
    """Map a sentiment score to one of three constants chosen by threshold band.

    Three values (``low``, ``medium``, ``high``) are obtained from
    ``fuzz.ratio`` by comparing ``score`` itself with the strings
    'NEGATIVE', 'NEUTRAL' and 'POSITIVE'. The result is created with
    ``np.zeros_like(score)``; positions below ``SENTIMENT_THRESHOLDS[0]``
    receive ``low``, positions above ``SENTIMENT_THRESHOLDS[1]`` receive
    ``high``, and positions in between (bounds inclusive) receive ``medium``.

    NOTE(review): ``fuzz.ratio`` is a string-similarity measure, yet the
    caller visible in this file passes a float (``prediction["score"]``).
    Comparing a number with label strings looks unintended and may raise
    inside fuzzywuzzy -- confirm the intended input before relying on this.
    """
    # Similarity of `score` to each label name (see the review note above).
    low = fuzz.ratio(score, 'NEGATIVE')
    medium = fuzz.ratio(score, 'NEUTRAL')
    high = fuzz.ratio(score, 'POSITIVE')
    fuzzy_score = np.zeros_like(score)
    # Boolean-mask assignment: each band overwrites only its own positions.
    fuzzy_score[score < SENTIMENT_THRESHOLDS[0]] = low
    fuzzy_score[(score >= SENTIMENT_THRESHOLDS[0]) & (score <= SENTIMENT_THRESHOLDS[1])] = medium
    fuzzy_score[score > SENTIMENT_THRESHOLDS[1]] = high
    return fuzzy_score

# Example fuzzy analysis
# Run one sentence through the predictor and feed its numeric score to the
# fuzzy post-processing step.
text = "I love the music"
prediction = predict(text)
predicted_score = prediction["score"]
fuzzy_score = fuzzy_sentiment_analysis(predicted_score)
print(f"Fuzzy analysis for '{text}': {fuzzy_score}")


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import gensim
from fuzzywuzzy import fuzz
import itertools
import matplotlib.pyplot as plt
import time

# NLTK Data
# Fetch the NLTK "stopwords" corpus; stopwords.words("english") is read from
# it further down when the stop-word collection is built.
nltk.download('stopwords')

# Settings
# Pattern whose matches are replaced by a space during text cleaning.
# Written as a raw string: "\S" is a regex class, not a Python string escape,
# and a plain literal triggers an invalid-escape warning on recent Pythons.
# Alternatives, in order: @mentions, http(s) URLs, "htt:"/"http:" followed by
# one more character, and any run of non-alphanumeric characters.
# NOTE(review): the third alternative looks like a typo of the second; it is
# kept unchanged so cleaning results stay identical.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
SEQUENCE_LENGTH = 300  # length every token sequence is padded/truncated to
EPOCHS = 8  # epochs passed to model.fit
BATCH_SIZE = 1024  # batch size for model.fit and model.evaluate
TRAIN_SIZE = 0.8  # fraction of rows kept for training (test gets the rest)
W2V_SIZE = 300  # Word2Vec vector dimensionality (also the embedding width)
W2V_WINDOW = 7  # Word2Vec context window
W2V_EPOCH = 32  # Word2Vec training epochs
W2V_MIN_COUNT = 10  # Word2Vec min_count
# (negative_upper_bound, positive_lower_bound) used by decode_sentiment.
SENTIMENT_THRESHOLDS = (0.4, 0.7)
# Artifact file names; not referenced by the code visible in this cell.
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# Load Dataset
# Column names are supplied explicitly, so every row of the CSV (including the
# first) is read as data.
df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv', names=["id", "category", "label", "text"])

# Preprocess Text
# Kept as a set: preprocess() tests every token of every tweet for membership,
# which is O(1) on a set but a linear scan on the list NLTK returns.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    """Normalise one raw tweet into a space-separated string of kept tokens.

    The text is lower-cased, every match of ``TEXT_CLEANING_RE`` is replaced
    by a space, and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw text. Non-string values are converted with ``str()``.
            ``None`` and float NaN (a missing cell) yield an empty string
            instead of the literal tokens "none" / "nan".
        stem: When true, each kept token is reduced with ``stemmer``.

    Returns:
        The kept tokens joined by single spaces; may be "".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

# Clean every tweet, then hold out (1 - TRAIN_SIZE) of the rows for testing.
# The fixed random_state keeps the split reproducible between runs.
df["text"] = df["text"].apply(preprocess)

df_train, df_test = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)

# Word2Vec Model
# One token list per training tweet; only the training split is used.
documents = [_text.split() for _text in df_train.text]

# gensim 4.0 renamed the constructor argument `size` to `vector_size` and
# replaced `wv.vocab` with `wv.key_to_index`. Pick the spelling that matches
# the installed release so this cell runs on both 3.x and 4.x.
_gensim_4_or_newer = int(gensim.__version__.split(".")[0]) >= 4
_size_kwarg = "vector_size" if _gensim_4_or_newer else "size"

w2v_model = gensim.models.Word2Vec(window=W2V_WINDOW,
                                   min_count=W2V_MIN_COUNT,
                                   workers=8,
                                   **{_size_kwarg: W2V_SIZE})
w2v_model.build_vocab(documents)
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

_w2v_vocab = w2v_model.wv.key_to_index if _gensim_4_or_newer else w2v_model.wv.vocab
print("Vocab size", len(_w2v_vocab))
# Similarity queries live on the KeyedVectors object (`wv`); the shortcut on
# the model itself was deprecated in gensim 3.x and removed in 4.0.
print(w2v_model.wv.most_similar("love"))

# Tokenization and Label Encoding
# The word index is fitted on the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)
# One more than the number of indexed words (presumably because index 0 is
# left for padding -- confirm against the Tokenizer documentation).
vocab_size = 1 + len(tokenizer.word_index)

# Turn each tweet into a list of word indices, then pad/truncate every list
# to SEQUENCE_LENGTH.
train_sequences = tokenizer.texts_to_sequences(df_train.text)
x_train = pad_sequences(train_sequences, maxlen=SEQUENCE_LENGTH)
test_sequences = tokenizer.texts_to_sequences(df_test.text)
x_test = pad_sequences(test_sequences, maxlen=SEQUENCE_LENGTH)

labels = df_train.label.unique().tolist()

# Integer-encode the labels; the encoder only sees the training labels.
# NOTE(review): the network defined below ends in a single sigmoid unit
# trained with binary cross-entropy, which assumes these codes are only 0 and
# 1 -- confirm the label column really has exactly two classes.
train_labels = df_train.label.tolist()
encoder = LabelEncoder()
encoder.fit(train_labels)

# Reshape the encoded labels into column vectors of shape (n, 1).
y_train = encoder.transform(train_labels).reshape(-1, 1)
y_test = encoder.transform(df_test.label.tolist()).reshape(-1, 1)

# Embedding Layer
# Row i holds the Word2Vec vector of the word the tokenizer indexed as i.
# Rows for words that have no Word2Vec vector stay all-zero.
word_vectors = w2v_model.wv
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]

# The pretrained matrix is passed as the initial weights with trainable=False.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

# Build and Compile the Neural Network Model
# Embedding -> dropout -> LSTM -> one sigmoid output unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Training the Model
# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as "val_accuracy" (the same key the history plot reads). Monitoring
# "val_acc" would leave early stopping without a metric to watch.
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
             EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

# validation_split=0.1 holds out a tenth of the training data for the
# val_* metrics monitored above.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

# Model Evaluation
# score[0] is reported as the loss and score[1] as the accuracy.
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
test_loss, test_accuracy = score[0], score[1]
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)

# Plot training history
training_log = history.history
acc, val_acc = training_log['accuracy'], training_log['val_accuracy']
loss, val_loss = training_log['loss'], training_log['val_loss']

# One x position per recorded epoch.
epochs = range(len(acc))

# Accuracy curves are drawn on the current figure ...
for curve, colour, legend_label in ((acc, 'b', 'Training acc'),
                                    (val_acc, 'r', 'Validation acc')):
    plt.plot(epochs, curve, colour, label=legend_label)
plt.title('Training and validation accuracy')
plt.legend()

# ... and the loss curves on a new one.
plt.figure()

for curve, colour, legend_label in ((loss, 'b', 'Training loss'),
                                    (val_loss, 'r', 'Validation loss')):
    plt.plot(epochs, curve, colour, label=legend_label)
plt.title('Training and validation loss')
plt.legend()

plt.show()

# Predict and Decode Sentiment
def decode_sentiment(score, include_neutral=True):
    """Translate a model score into a sentiment name.

    Args:
        score: Numeric model output.
        include_neutral: When true, use the two cut-offs in
            ``SENTIMENT_THRESHOLDS`` and allow a "NEUTRAL" band between
            them; when false, split at 0.5 into two classes only.

    Returns:
        "NEGATIVE", "NEUTRAL" or "POSITIVE".
    """
    if not include_neutral:
        # Two-class mode: anything below 0.5 is negative.
        if score < 0.5:
            return "NEGATIVE"
        return "POSITIVE"

    # Three-class mode: both cut-offs are inclusive.
    if score <= SENTIMENT_THRESHOLDS[0]:
        return "NEGATIVE"
    if score >= SENTIMENT_THRESHOLDS[1]:
        return "POSITIVE"
    return "NEUTRAL"

def predict(text, include_neutral=True):
    """Score one piece of raw text with the trained model.

    Args:
        text: Raw input string.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        dict with "label" (decoded sentiment name), "score" (the model
        output as a Python float) and "elapsed_time" (seconds spent in this
        call).
    """
    start_at = time.time()
    # Apply the same cleaning the training tweets went through, so mentions
    # and URLs are not handed to the tokenizer as ordinary words.
    cleaned = preprocess(text)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=SEQUENCE_LENGTH)
    # predict() returns one row per input and the network has a single output
    # unit. Extract that value explicitly: calling float() on the 1-element
    # row itself is deprecated in recent NumPy releases.
    score = float(model.predict(padded)[0][0])
    label = decode_sentiment(score, include_neutral=include_neutral)
    return {"label": label, "score": score, "elapsed_time": time.time() - start_at}

# Smoke-test the predictor on a few hand-written sentences.
for sample_text in ("I love the music",
                    "I hate the rain",
                    "I don't know what I'm doing"):
    print(predict(sample_text))

# Confusion Matrix and Classification Report
# Score the whole test set, then decode every score in two-class mode.
# NOTE(review): the decoded names are "NEGATIVE"/"POSITIVE" while y_test_1d
# holds the raw values of the label column -- confirm both use the same
# spelling, otherwise the report below compares disjoint label sets.
test_scores = model.predict(x_test, verbose=1, batch_size=8000)
y_pred_1d = [decode_sentiment(row, include_neutral=False) for row in test_scores]
y_test_1d = list(df_test["label"])

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix on the current matplotlib figure.

    Args:
        cm: 2-D array of counts. Rows are plotted as "True label" and
            columns as "Predicted label".
        classes: Tick labels, in the same order as the rows/columns of cm.
        title: Plot title.
        cmap: Matplotlib colormap used for the heat map.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # Normalise each row by its total. A class with no true samples has a
    # zero total; leave that row at 0 instead of producing NaN cells.
    cm = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)
    fmt = '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

# By default confusion_matrix orders rows/columns by the sorted labels found
# in y_true and y_pred, whereas Series.unique() returns labels in order of
# first appearance. Build one explicit ordering and use it for both the
# matrix and the tick labels so the axes cannot be mislabelled.
class_names = sorted(set(y_test_1d) | set(y_pred_1d))
cnf_matrix = confusion_matrix(y_test_1d, y_pred_1d, labels=class_names)
plt.figure(figsize=(12, 12))
plot_confusion_matrix(cnf_matrix, classes=class_names, title="Confusion matrix")
plt.show()

print(classification_report(y_test_1d, y_pred_1d))
print("Accuracy Score:", accuracy_score(y_test_1d, y_pred_1d))

# Fuzzy Logic Integration using fuzzywuzzy
def fuzzy_sentiment_analysis(score):
    """Map a sentiment score to one of three constants chosen by threshold band.

    Three values (``low``, ``medium``, ``high``) are obtained from
    ``fuzz.ratio`` by comparing ``score`` itself with the strings
    'NEGATIVE', 'NEUTRAL' and 'POSITIVE'. The result is created with
    ``np.zeros_like(score)``; positions below ``SENTIMENT_THRESHOLDS[0]``
    receive ``low``, positions above ``SENTIMENT_THRESHOLDS[1]`` receive
    ``high``, and positions in between (bounds inclusive) receive ``medium``.

    NOTE(review): ``fuzz.ratio`` is a string-similarity measure, yet the
    caller visible in this file passes a float (``prediction["score"]``).
    Comparing a number with label strings looks unintended and may raise
    inside fuzzywuzzy -- confirm the intended input before relying on this.
    """
    # Similarity of `score` to each label name (see the review note above).
    low = fuzz.ratio(score, 'NEGATIVE')
    medium = fuzz.ratio(score, 'NEUTRAL')
    high = fuzz.ratio(score, 'POSITIVE')
    fuzzy_score = np.zeros_like(score)
    # Boolean-mask assignment: each band overwrites only its own positions.
    fuzzy_score[score < SENTIMENT_THRESHOLDS[0]] = low
    fuzzy_score[(score >= SENTIMENT_THRESHOLDS[0]) & (score <= SENTIMENT_THRESHOLDS[1])] = medium
    fuzzy_score[score > SENTIMENT_THRESHOLDS[1]] = high
    return fuzzy_score

# Example fuzzy analysis
# Run one sentence through the predictor and feed its numeric score to the
# fuzzy post-processing step.
text = "I love the music"
prediction = predict(text)
predicted_score = prediction["score"]
fuzzy_score = fuzzy_sentiment_analysis(predicted_score)
print(f"Fuzzy analysis for '{text}': {fuzzy_score}")
