<a href="https://colab.research.google.com/github/Asaedd/TaskNLP/blob/main/TaskNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at the path used by the cells below.
mount_point = '/content/drive'
drive.mount(mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Folder on the mounted Drive that holds the dataset CSV files.
folder_path = "/content/drive/My Drive/Sentimental/Sentimental"

# List the folder contents as a quick check that the files are visible.
dataset_files = os.listdir(folder_path)
print(dataset_files)


['twitter_validation.csv', 'twitter_training.csv']


In [None]:
import pandas as pd

def _load_tweets(csv_name):
    """Read one headerless CSV from ``folder_path`` and keep the label and text columns.

    The file is decoded as latin-1 and read without a header row. Its four
    columns are named ID, Topic, Sentiment and Text, and only Sentiment and
    Text are returned (ID and Topic are dropped).
    """
    frame = pd.read_csv(f"{folder_path}/{csv_name}", encoding='latin-1', header=None)
    frame.columns = ['ID', 'Topic', 'Sentiment', 'Text']
    return frame[['Sentiment', 'Text']]


train_df = _load_tweets("twitter_training.csv")
val_df = _load_tweets("twitter_validation.csv")

print(train_df.head())

  Sentiment                                               Text
0  Positive  im getting on borderlands and i will murder yo...
1  Positive  I am coming to the borders and I will kill you...
2  Positive  im getting on borderlands and i will kill you ...
3  Positive  im coming on borderlands and i will murder you...
4  Positive  im getting on borderlands 2 and i will murder ...


In [None]:
import re

def clean_text(text):
    """Strip tweet noise from ``text`` and return the result lowercased.

    Applied in order: URLs, @mentions and #hashtags are removed, then every
    character that is not an ASCII letter or whitespace is dropped. Finally
    the string is lowercased and surrounding whitespace is stripped.
    Whitespace inside the string is left as-is.
    """
    noise_patterns = (
        r'http\S+',      # URLs
        r'@\w+',         # mentions
        r'#\w+',         # hashtags
        r'[^a-zA-Z\s]',  # anything that is not a letter or whitespace
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text.lower().strip()

# Any missing Text value is NaN after read_csv. A bare astype(str) would turn
# it into the literal string "nan", which then survives cleaning as a bogus
# token, so missing values are replaced with an empty string first.
train_df['cleaned_text'] = train_df['Text'].fillna('').astype(str).apply(clean_text)
val_df['cleaned_text'] = val_df['Text'].fillna('').astype(str).apply(clean_text)


In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English list as a set
# so membership checks are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stop_words``.

    The remaining tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Strip stop words from the cleaned text of both splits.
for split_df in (train_df, val_df):
    split_df['cleaned_text'] = split_df['cleaned_text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk.stem import WordNetLemmatizer

# Download the WordNet corpus, then create the lemmatizer used below.
wordnet_package = 'wordnet'
nltk.download(wordnet_package)
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with single spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Lemmatize the cleaned text of both splits.
for split_df in (train_df, val_df):
    split_df['cleaned_text'] = split_df['cleaned_text'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer settings
max_features = 5000  # vocabulary size cap passed to the tokenizer
max_length = 100  # every sequence is padded or truncated to this many tokens

# Build the vocabulary from the training text only
tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['cleaned_text'])


def _encode(texts):
    """Convert texts to integer sequences, padded/truncated at the end to ``max_length``."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


X_train = _encode(train_df['cleaned_text'])
X_val = _encode(val_df['cleaned_text'])


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode the sentiment labels as integer class ids. The encoder is fit on the
# training labels only and reused for the validation labels.
label_encoder = LabelEncoder()
train_df['Sentiment'] = label_encoder.fit_transform(train_df['Sentiment'])
val_df['Sentiment'] = label_encoder.transform(val_df['Sentiment'])

# One-hot encode the class ids. to_categorical otherwise infers the width from
# the largest id it sees, so the width is pinned to the number of classes the
# encoder learned. This keeps y_train and y_val the same shape even if the
# highest class id is absent from the validation split.
n_label_classes = len(label_encoder.classes_)
y_train = to_categorical(train_df['Sentiment'], num_classes=n_label_classes)
y_val = to_categorical(val_df['Sentiment'], num_classes=n_label_classes)



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Number of output units = width of the one-hot label matrix
num_classes = y_train.shape[1]  # Should be 4 if you have Positive, Negative, Neutral, Irrelevant

# Define the RNN model.
# mask_zero=True treats index 0 (the value pad_sequences fills with) as padding
# that the recurrent layer skips. The inputs are padded at the end, so without
# masking the SimpleRNN's final state is computed after a long run of padding
# steps instead of right after the last real token.
# `input_length` is omitted: it is deprecated in Keras 3 and the sequence
# length is inferred from the data on the first call.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128, mask_zero=True),
    SimpleRNN(128, return_sequences=False),
    Dense(num_classes, activation='softmax')  # softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# model.summary()  # informative once the model has been built, e.g. after the first fit




In [None]:
# Training hyperparameters
batch_size = 32
epochs = 5

# Fit on the training split; the validation split is evaluated after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
)


Epoch 1/5
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 52ms/step - accuracy: 0.2860 - loss: 1.3834 - val_accuracy: 0.2780 - val_loss: 1.3733
Epoch 2/5
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 50ms/step - accuracy: 0.2937 - loss: 1.3796 - val_accuracy: 0.2880 - val_loss: 1.3719
Epoch 3/5
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 50ms/step - accuracy: 0.3047 - loss: 1.3708 - val_accuracy: 0.2930 - val_loss: 1.3847
Epoch 4/5
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 51ms/step - accuracy: 0.3087 - loss: 1.3680 - val_accuracy: 0.2890 - val_loss: 1.3823
Epoch 5/5
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 50ms/step - accuracy: 0.3048 - loss: 1.3712 - val_accuracy: 0.2700 - val_loss: 1.3966
