<a href="https://colab.research.google.com/github/Ahmedabdelwaly/sentiment-analysis-nlp/blob/main/NlpTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Loading

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

folder_path = "/content/drive/MyDrive/Sentimental/Sentimental"
if os.path.exists(folder_path):
    print("Nested folder exists. Listing files:")
    print(os.listdir(folder_path))
else:
    print("Nested folder not found! Check the folder structure in Google Drive.")

Nested folder exists. Listing files:
['training.csv']


In [4]:
file_path = "/content/drive/MyDrive/Sentimental/Sentimental/training.csv"  # Adjust if needed
df = pd.read_csv(file_path, encoding='latin-1', usecols=[0, 5], names=['Sentiment', 'Text'])

**Preprocessing**

In [5]:
df['Sentiment'] = df['Sentiment'].replace({0: 0, 4: 1})

In [6]:
from nltk.stem import WordNetLemmatizer

In [7]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [8]:
contractions = {
    "can't": "cannot", "won't": "will not", "i'm": "i am", "he's": "he is", "she's": "she is",
    "it's": "it is", "that's": "that is", "what's": "what is", "where's": "where is", "who's": "who is",
    "n't": " not", "'re": " are", "'ve": " have", "'ll": " will", "'d": " would"
}

In [11]:
from imblearn.over_sampling import RandomOverSampler

def expand_contractions(text):
    for key, value in contractions.items():
        text = text.replace(key, value)
    return text

# Improved Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = expand_contractions(text)  # Expand contractions
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)  # Remove hashtags
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Remove non-alphabetic characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words and len(word) > 2])  # Lemmatization & stopwords removal
    return text

df['cleaned_text'] = df['Text'].astype(str).apply(clean_text)

# Balance the dataset using oversampling
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(df[['cleaned_text']], df['Sentiment'])
df_balanced = pd.DataFrame({'cleaned_text': X_resampled['cleaned_text'], 'Sentiment': y_resampled})

In [12]:
max_words = 10000
max_length = 50
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df_balanced['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df_balanced['cleaned_text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df_balanced['Sentiment'], test_size=0.2, random_state=42)

#Rnn Model

In [13]:
model = Sequential([
    Embedding(max_words, 128, input_length=max_length),
    SimpleRNN(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
epochs = 5
batch_size = 64
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))



Epoch 1/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m727s[0m 36ms/step - accuracy: 0.6611 - loss: 0.6205 - val_accuracy: 0.7234 - val_loss: 0.5613
Epoch 2/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m695s[0m 34ms/step - accuracy: 0.7278 - loss: 0.5605 - val_accuracy: 0.6844 - val_loss: 0.5924
Epoch 3/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 34ms/step - accuracy: 0.7006 - loss: 0.5822 - val_accuracy: 0.5016 - val_loss: 0.6931
Epoch 4/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m677s[0m 34ms/step - accuracy: 0.6169 - loss: 0.6516 - val_accuracy: 0.6911 - val_loss: 0.5955
Epoch 5/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m687s[0m 34ms/step - accuracy: 0.6979 - loss: 0.5873 - val_accuracy: 0.7141 - val_loss: 0.5714


In [14]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 7ms/step - accuracy: 0.7137 - loss: 0.5720
Accuracy: 71.41%
