In [None]:
import pandas as pd
import numpy as np
import re
import tqdm as tqdm
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')
import matplotlib.pyplot as plt
import wordcloud
from wordcloud import WordCloud
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer



import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fetch the WordNet corpus; `lemmatizing` below relies on WordNetLemmatizer.
import nltk
nltk.download('wordnet')

In [None]:
# Load the dataset from an Excel workbook in the working directory.
# Later cells read its 'TEXT' and 'LABEL' columns.
data = pd.read_excel("A1_dataset.xlsx")

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# ASCII punctuation characters stripped by remove_punct.
# (Previously misspelled `apunctuations`, which made remove_punct raise NameError.)
punctuations = string.punctuation


def remove_punct(sent):
    """Return `sent` with every ASCII punctuation character removed.

    Args:
        sent: Input text (any iterable of characters works).

    Returns:
        A new string containing only the characters of `sent` that are not
        in `string.punctuation`. Whitespace is left untouched.
    """
    return "".join(letter for letter in sent if letter not in punctuations)

In [None]:
def remove_white_spaces(sent):
    """Collapse every run of whitespace in `sent` into a single space.

    Leading and trailing whitespace is collapsed to one space as well; it is
    not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', sent)

In [None]:
def tokenization(sent):
    """Split `sent` into a list of tokens using NLTK's `word_tokenize`."""
    return word_tokenize(sent)

In [None]:
def remove_stop_words(tokens):
    """Drop very short tokens and join the remaining ones with single spaces.

    Despite the name, no stop-word list is consulted. A token is removed when
    it begins with one or two word characters immediately followed by a word
    boundary (e.g. "a", "is", "to"). The token "no" is deliberately kept so
    that negation survives.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The surviving tokens joined into one space-separated string
        (empty string when nothing survives).
    """
    # Raw string: the previous non-raw literal relied on the invalid escape
    # "\w", which newer Python versions flag with a SyntaxWarning.
    short_word = re.compile(r'(?!no)\w{1,2}\b')
    kept = [word for word in tokens if not short_word.match(word + ' ')]
    return ' '.join(kept)

In [None]:
def remove_url_html(sent):
    """Replace HTML tags, URLs and @mentions in `sent` with a single space.

    The patterns are applied in a fixed order:

    1. HTML tags first, so a URL inside an attribute disappears with its tag
       instead of swallowing the text that follows it.
    2. `http://` / `https://` URLs before bare `www.` hosts. Running the
       `www` pattern first turned "https://www.site.com" into a dangling
       "https://" that the URL pattern could no longer match.
    3. Bare `www.` hosts, with the dot escaped so ordinary words that merely
       contain "www" are left alone.
    4. @mentions last, so an "@" inside a URL is removed with the URL.

    Args:
        sent: Input text.

    Returns:
        The cleaned text as `str`; each removed span is replaced by one space
        (surrounding whitespace is not collapsed here).
    """
    patterns = (
        r"<.*?>",           # HTML tags (non-greedy: one tag per match)
        r"https?://\S+",    # URLs with an explicit scheme
        r"www\.\S+",        # scheme-less URLs
        r"@[a-zA-Z0-9_]+",  # @username: at least one letter, digit or underscore
    )
    cleaned = sent
    for pattern in patterns:
        cleaned = re.sub(pattern, ' ', cleaned)
    return str(cleaned)

In [None]:
def spelling_correction(sent):
    """Run TextBlob's spelling correction on `sent` and return the result as a plain `str`."""
    return str(TextBlob(sent).correct())

In [None]:
def lemmatizing(row):
    """Lemmatize each space-separated word of `row` and re-join with single spaces.

    Words are obtained with `row.split(" ")`, so consecutive spaces yield
    empty strings that are passed through the lemmatizer like any other word.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(token) for token in row.split(" "))

In [None]:
def preprocessing_pipeline(row):
    """Clean one raw text and return the processed string.

    Stages, in order: URL/HTML/mention removal, punctuation removal,
    whitespace collapsing, tokenization, short-word filtering, spelling
    correction, lemmatization.
    """
    cleaned = remove_white_spaces(remove_punct(remove_url_html(row)))
    filtered = remove_stop_words(tokenization(cleaned))
    return lemmatizing(spelling_correction(filtered))

In [None]:
def text_preprocessing_on_one_sentence(sentence):
    """Run the preprocessing stages on `sentence`, printing the result of each.

    Intended for demonstration: every stage's output is printed with a label
    and nothing is returned.
    """
    # (label printed after the stage, function implementing the stage)
    stages = (
        ("after url-html removal :", remove_url_html),
        ("after punctuation removal :", remove_punct),
        ("after white space removal :", remove_white_spaces),
        ("after tokenization :", tokenization),
        ("after stop words removal :", remove_stop_words),
        ("after spelling  correction :", spelling_correction),
        ("after lemmatizing :", lemmatizing),
    )
    current = sentence
    for label, stage in stages:
        current = stage(current)
        print(label, current)

In [None]:
# Show the full row with index label 322 (text and label).
print(data.loc[322]) #sentence belonging to class 1

In [None]:
# Walk the row-322 text through every preprocessing stage, printing each result.
sentence = data.loc[322,'TEXT']
print("Before preprocessing \n",sentence)
text_preprocessing_on_one_sentence(sentence)

In [None]:
# Show the full row with index label 865 (text and label).
print(data.loc[865]) #sentence belonging to class 0

In [None]:
# Walk the row-865 text through every preprocessing stage, printing each result.
sentence = data.loc[865,'TEXT']
print("Before preprocessing \n",sentence)
text_preprocessing_on_one_sentence(sentence)

In [None]:
# Run the full cleaning pipeline over every value of the TEXT column, with a progress bar.
# The cleaned values are built in one pass and assigned as a whole column: this avoids a
# per-row `.loc` read/write, which is slow and fails when the index has duplicate labels.
# NOTE: the raw TEXT is overwritten in place, so re-running this cell processes the
# already-cleaned text again; reload `data` first if the cell must be re-run.
data['TEXT'] = [preprocessing_pipeline(text) for text in tqdm.tqdm(data['TEXT'])]

In [None]:
# Preview the first rows after preprocessing.
data.head()

In [None]:
# Concatenate all texts of each class (LABEL 1 / LABEL 0) into one
# space-separated string; these feed the word clouds below.
text_pos = " ".join(data.loc[data['LABEL'] == 1, 'TEXT'])
text_neg = " ".join(data.loc[data['LABEL'] == 0, 'TEXT'])

In [None]:
# Word cloud built from the LABEL == 1 texts, using WordCloud's default settings.
word_cloud = WordCloud().generate(text_pos)

In [None]:
# Render the current word cloud as an image; axes are hidden because they carry no meaning here.
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

In [None]:
# Same visualisation for the LABEL == 0 texts (rebinds `word_cloud`).
word_cloud = WordCloud().generate(text_neg)
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

In [None]:
# Upper bound on vocabulary size: passed as `max_tokens` to TextVectorization and,
# plus one, used as the Embedding layer's input dimension.
Max_features = 2000000 # maximum number of words in the vocabulary


In [None]:
# Text -> integer-sequence layer: vocabulary capped at Max_features tokens,
# every text emitted as a sequence of exactly 350 integer token ids.
vectorizer = TextVectorization(max_tokens=Max_features,
                               output_sequence_length=350,
                               output_mode='int')

In [None]:
# Build the vectorizer's vocabulary from the TEXT column.
vectorizer.adapt(data['TEXT'].values)

In [None]:
# Convert every text in the TEXT column into its integer token-id sequence.
vectorized_text = vectorizer(data['TEXT'].values)

In [None]:
# Display the vectorized tensor for a quick sanity check.
vectorized_text

In [None]:
# Target labels taken from the LABEL column; the bare name displays them.
labels= data['LABEL'].values
labels

In [None]:
# Baseline: a small fully connected network trained with an increasing learning rate.
tf.random.set_seed(42)

# Four Dense layers (32 -> 16 -> 8 -> 1); the sigmoid output is a probability for binary classification.
# NOTE(review): the network is fed the raw integer token ids from `vectorized_text`
# directly (no embedding layer) — confirm this is intended for the baseline.
model11=tf.keras.Sequential([
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(16,activation="relu"),
    tf.keras.layers.Dense(8,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid")
])

model11.compile(loss="binary_crossentropy",
               optimizer='Adam',
               metrics=["accuracy"])


# Learning-rate callback: lr = 1e-4 * 10**(epoch/20), i.e. it grows tenfold every 20 epochs.
lr_scheduler=tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-4 * 10**(epoch/20))

# Fit on the entire dataset for 100 epochs; no validation data is passed.
history11=model11.fit(vectorized_text,labels,epochs=100,callbacks=[lr_scheduler])

In [None]:
# Plot every metric recorded in history11 against the epoch number.
pd.DataFrame(history11.history).plot()

In [None]:
tf.random.set_seed(42)

# Same architecture as model11, built fresh so it is trained independently.
model1=tf.keras.Sequential([
    tf.keras.layers.Dense(32,activation="relu"),
    tf.keras.layers.Dense(16,activation="relu"),
    tf.keras.layers.Dense(8,activation="relu"),
    tf.keras.layers.Dense(1,activation="sigmoid")
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
model1.compile(loss="binary_crossentropy",
               optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
               metrics=["accuracy"])


# Learning-rate callback: lr = 1e-4 * 10**(epoch/20).
# NOTE(review): the scheduler sets the learning rate at the start of every epoch,
# which overrides the 0.01 configured on the optimizer. Drop the callback from
# fit() if a fixed learning rate was intended.
lr_scheduler=tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-4 * 10**(epoch/20))

# Train model1. This call previously targeted model11, which left model1
# untrained and gave model11 a further 100 epochs.
history1=model1.fit(vectorized_text,labels,epochs=100,callbacks=[lr_scheduler])

In [None]:
# Create a confusion matrix for the dense baseline (model11).
from sklearn.metrics import confusion_matrix

# Predict on rows 500-1499 of the vectorized data.
# NOTE(review): model11 was fit on all of `vectorized_text`, so this slice is
# training data, not a held-out set.
y_pred = model11.predict(vectorized_text[500:1500])

In [None]:
# Confusion matrix for rows 500-1499; the sigmoid outputs are rounded to 0/1 first.
confusion_matrix(labels[500:1500],tf.round(y_pred))

In [None]:
# tf.data pipeline order: (map) -> cache -> shuffle -> batch -> prefetch,
# starting from from_tensor_slices.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, labels))
dataset = dataset.cache()
# Shuffle once with a fixed seed and keep that order on every pass. With the
# default reshuffle_each_iteration=True the order changes each epoch, so a
# take()/skip() split of this dataset would hand different examples to
# "train" and "test" every epoch and leak test data into training.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)
dataset = dataset.batch(64)
dataset = dataset.prefetch(64)

In [None]:
# 70/30 split, counted in batches because `dataset` is already batched.
# The test set is "everything after the training batches": computing it as
# int(n*.7) + int(n*.3) could add up to less than n and silently drop a batch.
# NOTE: the two parts are only disjoint if `dataset` yields the same order on every pass.
n_batches = len(dataset)
n_train = int(n_batches * .7)
train = dataset.take(n_train)
test = dataset.skip(n_train)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding,Input

In [None]:
# Bidirectional-LSTM classifier, declared as a single layer list.
# NOTE(review): the embedding table has Max_features+1 rows, which is probably far
# larger than the adapted vocabulary — consider sizing it from the vectorizer instead.
model = Sequential([
    # Token id -> 32-dimensional embedding vector
    Embedding(Max_features+1, 32),
    # Sequence encoder: bidirectional LSTM with 32 units per direction
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor with dropout between the layers
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(128, activation='relu'),
    # Sigmoid output for binary classification
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary cross-entropy loss, Adam optimizer with its default settings, accuracy as the reported metric.
model.compile(loss='BinaryCrossentropy', optimizer='Adam',metrics=["accuracy"])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 20 epochs on `train`, using `test` as the validation data.
history = model.fit(train, epochs=20, validation_data=test)

In [None]:
# Training vs. validation accuracy per epoch, read from `history`.
plt.figure(figsize=(15,7))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch, read from `history`.
plt.figure(figsize=(15,7))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss Function')
plt.ylabel('Loss')
plt.xlabel('epoch')
plt.legend(['train', 'Validation'], loc='upper left')
# A stray `input='I am not unhappy person'` had been pasted onto this line, making the
# cell a SyntaxError; the example sentence is assigned in its own cell further down.
plt.show()

In [None]:
# Loss and accuracy on `test` (the same data that was passed as validation_data during fit).
results = model.evaluate(test)

In [None]:
# Persist the trained model to 'analysis.h5' in the current working directory.
model.save('analysis.h5')

In [None]:
# Reload the model from the same relative path it was saved to. The previous
# hardcoded '/content/analysis.h5' only resolved when the working directory was /content.
model = tf.keras.models.load_model('analysis.h5')

In [None]:
# Example sentence used for the prediction demo below.
# NOTE(review): the name `input` shadows Python's built-in input(); renaming it
# requires updating the later cells that read it.
input='I am not unhappy person'

In [None]:
# Show what each preprocessing stage does to the example sentence.
print("Before preprocessing \n",input)
text_preprocessing_on_one_sentence(input)

In [None]:
# Classify the example sentence. It goes through the same cleaning pipeline and
# vectorizer as the training texts before reaching the model. This cell previously
# predicted on the whole training matrix, so `k[0][0]` described the first training
# row instead of `input`.
cleaned_input = preprocessing_pipeline(input)
res = model.predict(vectorizer(tf.constant([cleaned_input])))
# Round the sigmoid probability to a hard 0/1 class.
k = res.round()

In [None]:
# Display names indexed by predicted class: 0 -> negative, 1 -> positive.
# NOTE(review): the second entry starts with a space; confirm whether that is intentional.
classes_name=['Negative Comment',' Positive Comment']

In [None]:
# Look up the display name for the first row of the rounded predictions `k`.
classes_name[int(k[0][0])]

In [None]:
import gradio as gr
from transformers import pipeline

# Load sentiment analysis model
# NOTE(review): no model name is passed, so this presumably loads the transformers
# default for the task — it is not the BiLSTM trained earlier in this notebook.
# Confirm that is intended.
sentiment_analysis = pipeline("sentiment-analysis")

def analyze_sentiment(text):
    """Classify `text` with the `sentiment_analysis` pipeline.

    Returns:
        A string of the form 'Sentiment: <label>', where <label> is the
        'label' field of the pipeline's first result.
    """
    top_result = sentiment_analysis(text)[0]
    return f'Sentiment: {top_result["label"]}'

# Create Gradio interface: one text input, one text output, backed by analyze_sentiment.
iface = gr.Interface(fn=analyze_sentiment, inputs="text", outputs="text", title="Sentiment Analysis")
# Start serving the interface.
iface.launch()