In [None]:
import pandas as pd
import tensorflow 
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
import re
import sklearn
from sklearn.model_selection import train_test_split
import tqdm
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
from gensim.models import Word2Vec
from tensorflow.keras import Sequential
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.pipeline import make_pipeline

In [None]:
# Sentiment140 ships without a header row, so column names are supplied here.
columns = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "latin-1"

# Load only the two columns used downstream, shuffle with a fixed seed so the
# notebook is reproducible under Restart & Run All, and map the positive label
# 4 -> 1 so the target is binary {0, 1}.  Building the frame in one chain
# avoids assigning into a column subset (SettingWithCopyWarning).
dataset = (
    pd.read_csv(
        '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
        encoding=DATASET_ENCODING,
        names=columns,
        usecols=['sentiment', 'text'],
    )
    .sample(frac=1, random_state=42)
    .assign(sentiment=lambda frame: frame['sentiment'].replace(4, 1))
)



In [None]:
# Show the rows labelled negative (sentiment == 0).
dataset.loc[dataset['sentiment'].eq(0)]

In [None]:
# Distinct label values present after the 4 -> 1 remapping.
pd.unique(dataset['sentiment'])

In [None]:
plt.rcParams['font.family'] = 'DejaVu Sans'
sentiment_labels = {'Negative': 0, 'Positive': 1}

# value_counts() orders by frequency, not by label value, so the counts are
# re-indexed in the same order as sentiment_labels to guarantee that each bar
# carries the right name.  A label missing from the data is plotted as 0.
sentiment_counts = (
    dataset['sentiment']
    .value_counts()
    .reindex(list(sentiment_labels.values()), fill_value=0)
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(
    list(sentiment_labels.keys()),
    sentiment_counts.to_numpy(),
    color=['red', 'green'],  # one colour per class: Negative, Positive
)
ax.set_xlabel("Frequency")
ax.set_title("Sentiment Frequencies")
ax.grid(axis='x', linestyle='--', alpha=0.6)

plt.show()

In [None]:
# Count missing values per column.
dataset.isnull().sum()

In [None]:
x, y = dataset['text'], dataset['sentiment']

# 80/20 hold-out split.  The seed is fixed so the split (and therefore the
# reported test metrics) is identical on every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Number of samples in the training and test splits.
tuple(len(split) for split in (x_train, x_test))

In [None]:
# Cache of loaded spaCy pipelines, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def process_text(document,stopwords=STOPWORDS):
    """Normalise one document into a space-separated string of lemmas.

    Steps, in order: coerce to ``str``, collapse whitespace runs, replace
    every non-word character with a space, drop single ASCII letters that
    stand alone between whitespace, lowercase, lemmatise with spaCy, then
    discard lemmas that are in ``stopwords`` or are 3 characters or shorter.

    Args:
        document: Text to clean. Non-string values are converted with ``str``.
        stopwords: Collection of lemmas to remove; defaults to the
            module-level ``STOPWORDS`` set.

    Returns:
        The remaining lemmas joined by single spaces (possibly an empty string).
    """
    # Coerce first so that every regex below receives a string.
    document = str(document)
    document = re.sub(r'\s+', ' ', document)
    document = re.sub(r'\W', ' ', document)
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    document = document.lower()

    # The pipeline is loaded once and reused; loading it on every call
    # dominates the runtime when this function is mapped over a corpus.
    nlp = _get_spacy_pipeline()
    lemmas = (token.lemma_ for token in nlp(document))
    kept = [lemma for lemma in lemmas if lemma not in stopwords and len(lemma) > 3]
    return ' '.join(kept)

# NOTE(review): process_text is defined but not applied here -- the raw
# training texts are used as-is under the "preprocessed" name. Confirm this
# is intentional before relying on the name.
x_train_preprocessed= x_train

In [None]:
# Display the training texts that will be fed to the tokenizer.
x_train_preprocessed

In [None]:
# Persist the training texts to the Kaggle working directory as CSV.
x_train_preprocessed.to_csv('/kaggle/working/x_train_preprocessed.csv')


In [None]:
# Fit the vocabulary on the training texts only; words not seen during
# fitting are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(x_train_preprocessed)

# Encode both splits with the same fitted tokenizer.
X_train_sequences = tokenizer.texts_to_sequences(x_train_preprocessed)
X_test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad (or truncate) every sequence to the longest training sequence.
max_sequence_length = max(map(len, X_train_sequences))
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_sequences, X_test_sequences)
)

# Side-by-side view of each decoded training text and its integer encoding.
pd.DataFrame({
    'words': tokenizer.sequences_to_texts(list(X_train_sequences)),
    'sequances': X_train_sequences,
})

In [None]:
# Display the training labels.
y_train

In [None]:
# Number of entries in the fitted tokenizer's word index.
len(tokenizer.word_index)

In [None]:
# 1D-CNN binary classifier: embedding -> two conv/pool stages -> dense head.
model = Sequential()

# +1 on the vocabulary size because index 0 is used for padding by pad_sequences.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64))

model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2, strides=2))

model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2, strides=2))

model.add(Flatten())
model.add(Dense(units=8, activation='relu'))
# Single sigmoid unit: output is the predicted probability of class 1.
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# EarlyStopping watches 'val_loss' by default, which only exists when fit()
# is given validation data.  Without it the callback never triggers, so 10%
# of the training data is held out here to give it a signal.
Early_Stopping = tensorflow.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1
)
history = model.fit(
    X_train_padded,
    np.asarray(y_train),  # plain array so validation_split can slice it alongside the inputs
    epochs=10,
    batch_size=128,
    validation_split=0.1,
    callbacks=[Early_Stopping],
)

In [None]:
# Evaluate on the held-out test split; with the compile() settings above the
# result holds the loss followed by the accuracy metric.
test_history=model.evaluate(X_test_padded,y_test)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Display the padded integer matrix that was fed to the model.
X_train_padded

In [None]:
def predict_sentiment(input_text, model, tokenizer,max_sequence_length=max_sequence_length):
    """Classify one or more texts as 'Negative' or 'Positive'.

    Args:
        input_text: A single string or an iterable of strings. A bare string
            is treated as one document rather than being iterated character
            by character.
        model: Object with a ``predict`` method that returns one probability
            per input row (the positive-class probability for the sigmoid
            model built in this notebook).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_sequence_length: Length every sequence is padded/truncated to;
            defaults to the value computed when the training data was padded.

    Returns:
        A single label string when exactly one text is supplied (this keeps
        the original single-item behaviour), a list of labels when several
        texts are supplied, or an empty list for empty input.
    """
    classes = ['Negative', 'Positive']

    texts = [input_text] if isinstance(input_text, str) else list(input_text)
    if not texts:
        return []

    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_sequence_length, truncating='pre')

    # Flatten the prediction array to one probability per text so that each
    # value can be rounded and mapped to a class on its own; converting the
    # whole array to a single int only works for exactly one prediction.
    probabilities = np.asarray(model.predict(padded)).reshape(-1)
    labels = [classes[int(np.round(probability))] for probability in probabilities]

    return labels[0] if len(labels) == 1 else labels

# Smoke-test the prediction helper on a single short sentence.
input_text = ['Does feel good']
predictions = predict_sentiment(input_text, model, tokenizer)
print(predictions)

In [None]:
 # Example prediction on a noisy, misspelled sentence.
 predict_sentiment(['ifeel that i didnt drink any coffe from 10 months'], model, tokenizer)

In [None]:
 # Example prediction on a sentence that contains the word "happy".
 predict_sentiment(['ifeel that drink any coffe ,iam happy and have big contractions '], model, tokenizer)

In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history):
    """Plot training accuracy and training loss per epoch, side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch lists under the keys 'accuracy' and 'loss'.
    """
    # Read both curves up front so a missing key fails before any figure exists.
    panels = (
        (history.history['accuracy'], 'Accuracy', 'Training Accuracy'),
        (history.history['loss'], 'Loss', 'Training Loss'),
    )

    plt.figure(figsize=(10, 4))
    for position, (values, y_label, caption) in enumerate(panels, start=1):
        epochs = range(1, len(values) + 1)  # epochs are numbered from 1
        plt.subplot(1, 2, position)
        plt.plot(epochs, values, label=caption)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.title(caption)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Render the accuracy/loss curves for the training run above.
plot_training_history(history)