In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
# Load the train/test splits (tab-separated) and build one free-text column per row.
# NOTE(review): DATA_DIR is a machine-specific absolute path; adjust it for your environment.
DATA_DIR = '/Users/azaruddin.sk/Desktop/YUbi/NLP Task'
TEXT_COLUMNS = ['News Title', 'News Description']

train_df = pd.read_csv(f'{DATA_DIR}/train.tsv', sep='\t')
# The test file carries a 'Sector' column that is not used downstream.
test_df = pd.read_csv(f'{DATA_DIR}/test.tsv', sep='\t').drop(columns=['Sector'])

# Replace missing titles/descriptions with empty strings so the concatenation
# below never produces NaN. Assigning the result back (instead of calling
# `df[col].fillna(..., inplace=True)`) avoids chained in-place assignment,
# which pandas deprecates and which does not modify the frame under
# Copy-on-Write.
train_df[TEXT_COLUMNS] = train_df[TEXT_COLUMNS].fillna('')
test_df[TEXT_COLUMNS] = test_df[TEXT_COLUMNS].fillna('')

# Model input: title and description joined by a single space.
train_df['text'] = train_df['News Title'] + " " + train_df['News Description']
test_df['text'] = test_df['News Title'] + " " + test_df['News Description']

In [3]:
# NLTK resources used by the text-cleaning step.
from nltk.tokenize import RegexpTokenizer  # regex-based tokenizer, used to drop punctuation
import nltk
nltk.download('wordnet')  # corpus backing WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
# English stop words, kept as a set: this removes duplicates and makes every
# `token not in stop_words` membership test a hash lookup instead of a linear
# scan over a list.
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def text_preprocess(input_text):
    """Normalise one raw text string into a space-joined string of cleaned tokens.

    Steps: lowercase, turn newlines into spaces, split into ``\\w+`` tokens
    (which discards punctuation), drop tokens that are in ``stop_words`` or
    have two characters or fewer, and lemmatize what remains with the
    module-level ``lemmatizer``.

    Args:
        input_text: The text to clean. Any non-``str`` value is treated as
            missing.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when ``input_text``
        is not a string or no token survives the filtering.
    """
    if not isinstance(input_text, str):
        return ""

    normalized = input_text.lower().replace("\n", " ")
    words = RegexpTokenizer(r'\w+').tokenize(normalized)

    # Filtering is applied to the surface form, before lemmatization.
    kept = [
        lemmatizer.lemmatize(word)
        for word in words
        if len(word) > 2 and word not in stop_words
    ]
    return " ".join(kept)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/azaruddin.sk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/azaruddin.sk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Clean every combined title+description string and store the result next to
# the raw text, for the training and the test frame alike.
clean_text = [text_preprocess(raw) for raw in train_df['text'].values]
train_df['clean_text'] = clean_text

pre_text = [text_preprocess(raw) for raw in test_df['text'].values]
test_df['clean_text'] = pre_text

In [14]:
# Encode the sentiment labels as integer class ids for the classifier.
# The original labels are snapshotted once into 'Sentiment_raw' and the encoder
# is always fitted on that snapshot. Without this, re-running the cell would
# refit the encoder on the already-encoded integers, and a later
# `label_encoder.inverse_transform` would return integers instead of the
# original label values.
if 'Sentiment_raw' not in train_df.columns:
    train_df['Sentiment_raw'] = train_df['Sentiment']

label_encoder = LabelEncoder()
train_df['Sentiment'] = label_encoder.fit_transform(train_df['Sentiment_raw'])

In [6]:
# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(train_df['clean_text'], train_df['Sentiment'], test_size=0.2, random_state=42)


max_words = 10000  # Number of words to consider as features
max_len = 150  # Maximum length of sequences

# The vocabulary is fitted on the training split only, so validation and test
# text cannot influence the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Map each cleaned text to a sequence of integer word ids.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(test_df['clean_text'])

# Pad/truncate every sequence to exactly max_len ids.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [7]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Size the output layer from the fitted encoder instead of hard-coding it, so
# every predicted index is guaranteed to map back to a known label.
num_classes = len(label_encoder.classes_)

# Embedding -> two stacked LSTMs -> softmax over the sentiment classes.
model = Sequential()
# `input_length` is deprecated in current Keras; the sequence length is
# inferred from the padded input.
model.add(Embedding(max_words, 128))
model.add(LSTM(128, return_sequences=True))  # emits the full sequence for the next LSTM
model.add(LSTM(64))
model.add(Dense(num_classes, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once validation loss stops improving and keep the best-epoch weights,
# rather than the weights from the final (possibly overfitted) epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
history = model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val_pad, y_val),
    callbacks=[early_stopping],
)

Epoch 1/5




[1m4544/4544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m711s[0m 156ms/step - accuracy: 0.7837 - loss: 0.5413 - val_accuracy: 0.8280 - val_loss: 0.4140
Epoch 2/5
[1m4544/4544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m942s[0m 207ms/step - accuracy: 0.8444 - loss: 0.3749 - val_accuracy: 0.8272 - val_loss: 0.4165
Epoch 3/5
[1m4544/4544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m909s[0m 200ms/step - accuracy: 0.8640 - loss: 0.3286 - val_accuracy: 0.8308 - val_loss: 0.4105
Epoch 4/5
[1m4544/4544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m775s[0m 170ms/step - accuracy: 0.8831 - loss: 0.2843 - val_accuracy: 0.8286 - val_loss: 0.4442
Epoch 5/5
[1m4544/4544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m785s[0m 173ms/step - accuracy: 0.9034 - loss: 0.2385 - val_accuracy: 0.8245 - val_loss: 0.4866


In [8]:
# Score the padded test sequences; each row of y_test_pred holds one score per class.
y_test_pred = model.predict(X_test_pad)
# The highest-scoring class index per row is the predicted class id.
y_test_pred_labels = y_test_pred.argmax(axis=1)
# Map class ids back through the label encoder and store them on the test frame.
test_df['Sentiment'] = label_encoder.inverse_transform(y_test_pred_labels)

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step


In [23]:
# Submission file: one row per test article, holding its id and predicted sentiment.
submission = test_df[['News ID', 'Sentiment']].copy()

file_path = '/Users/azaruddin.sk/Desktop/YUbi/NLP Task/submission_LSTM.csv'

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv(path_or_buf=file_path, index=False)