<a href="https://colab.research.google.com/github/B1aCkManTa/jetbrains-sentiment-task/blob/main/task_jetbrains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing necessary libraries



In [1]:
!pip install num2words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import nltk
from nltk.corpus import stopwords

import inflect

import pandas as pd

import gspread
from textblob import Word
from google.colab import auth
from google.auth import default

from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score


In [3]:
# Download the stopwords and wordnet corpus from NLTK
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading the dataset

In [4]:
url = 'https://raw.githubusercontent.com/B1aCkManTa/jetbrains-sentiment-task/main/fb_sentiment.csv'
df = pd.read_csv(url)

df = df.iloc[:,1:]      # Removes the first column of the DataFrame
df.columns = ['FBPost', 'Label']  # Sets the column names of the DataFrame to 'FBPost' and 'Label'

In [5]:
df

Unnamed: 0,FBPost,Label
0,Drug Runners and a U.S. Senator have somethin...,O
1,"Heres a single, to add, to Kindle. Just read t...",O
2,If you tire of Non-Fiction.. Check out http://...,O
3,Ghost of Round Island is supposedly nonfiction.,O
4,Why is Barnes and Nobles version of the Kindle...,N
...,...,...
995,I liked it. Its youth oriented and I think th...,P
996,"I think the point of the commercial is that, e...",P
997,Kindle 3 is such a great product. I could not ...,P
998,develop a way to share books! that is a big d...,N


## Pre-Processing the text 

In [6]:
# Define a function to clean the text data
from num2words import num2words

def replace_numbers_with_words(text):
    words = text.split()
    for i, word in enumerate(words):
        if word.isnumeric():
            words[i] = num2words(int(word))
    return ' '.join(words)
    
def cleaning(df, column, stop_words):
    # Convert the text to lowercase
    df[column] = df[column].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    # Replace numbers with words
    df[column] = df[column].apply(replace_numbers_with_words)
    # Remove stop words
    df[column] = df[column].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    # Lemmatize the text
    df[column] = df[column].apply(lambda x: ' '.join([Word(x).lemmatize() for x in x.split()]))
    # Return the cleaned DataFrame
    return df


# Get the list of English stop words
stop_words = stopwords.words('english')

# Print the list of stop words
print(stop_words)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
# Clean the text data using the cleaning function
data_cleaned = cleaning(df, "FBPost", stop_words)
data_cleaned

Unnamed: 0,FBPost,Label
0,drug runner u.s. senator something murder http...,O
1,"here single, add, kindle. read 19th century st...",O
2,tire non-fiction.. check http://www.amazon.com...,O
3,ghost round island supposedly nonfiction.,O
4,barnes noble version kindle much expensive kin...,N
...,...,...
995,liked it. youth oriented think widen appeal.,P
996,"think point commercial that, even border closi...",P
997,kindle three great product. could happier mine...,P
998,develop way share books! big drawback. love ki...,N


## Generating Embeddings using tokenizer

In [8]:
# Define the maximum number of words to keep
num_words = 500

# Define the tokenizer
tokenizer = Tokenizer(num_words=num_words, split=' ')

# Fit the tokenizer on the text data
tokenizer.fit_on_texts(data_cleaned['FBPost'].values)

# Convert the text data to sequences of integers
X = tokenizer.texts_to_sequences(data_cleaned['FBPost'].values)

# Pad the sequences to have the same length
X = pad_sequences(X)

# Perform one-hot encoding on the 'Label' column
one_hot = pd.get_dummies(data_cleaned['Label'])


In [9]:
print(X.shape)

X

(1000, 92)


array([[  0,   0,   0, ...,   9, 240, 165],
       [  0,   0,   0, ..., 243, 335,  94],
       [  0,   0,   0, ..., 419, 420, 420],
       ...,
       [  0,   0,   0, ..., 405,  45,  28],
       [  0,   0,   0, ...,  32, 308,   3],
       [  0,   0,   0, ...,   0,   2,   1]], dtype=int32)

In [10]:
print(one_hot.shape)

one_hot

(1000, 3)


Unnamed: 0,N,O,P
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,1,0,0
...,...,...,...
995,0,0,1
996,0,0,1
997,0,0,1
998,1,0,0


## Splitting the data into trainig and testing

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, one_hot, test_size=0.2, random_state=5)

print(X_train)
print(Y_train)
print(X_train.shape)
print(Y_train.shape)

[[  0   0   0 ... 181 222 253]
 [  0   0   0 ...  39  61  11]
 [  0   0   0 ...  29  93 388]
 ...
 [  0   0   0 ...   0   2   1]
 [  0   0   0 ...  54 425 160]
 [  0   0   0 ... 180 411 311]]
     N  O  P
40   0  0  1
977  0  0  1
829  0  0  1
973  0  0  1
874  1  0  0
..  .. .. ..
400  1  0  0
118  0  0  1
701  0  0  1
206  0  0  1
867  1  0  0

[800 rows x 3 columns]
(800, 92)
(800, 3)


## Model Building

In [12]:
# Create a sequential model
model = Sequential()

# Add an embedding layer with input size 500, output size 120, and input length equal to the number of columns in the X array
model.add(Embedding(500, 120, input_length=X.shape[1]))

# Add a 1D spatial dropout layer with a rate of 0.4
model.add(SpatialDropout1D(0.4))

# Add a LSTM layer with 256 units, dropout of 0.2, recurrent dropout of 0.2, and return sequences set to True
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))

# Add a second LSTM layer with 128 units, dropout of 0.2, and recurrent dropout of 0.2
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Add a dense layer with 64 units and ReLU activation function
model.add(Dense(64, activation='relu'))

# Add a dropout layer with a rate of 0.4
model.add(Dropout(0.4))

# Add a dense layer with 3 units and softmax activation function
model.add(Dense(3, activation='softmax'))

# Use a custom optimizer with a learning rate of 0.001
adam = Adam(learning_rate=0.001)

# Compile the model with categorical crossentropy loss function, the custom optimizer, and accuracy metric
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 92, 120)           60000     
                                                                 
 spatial_dropout1d (SpatialD  (None, 92, 120)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 92, 256)           386048    
                                                                 
 lstm_1 (LSTM)               (None, 128)               197120    
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                        

## Model Training

In [13]:
# Use k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True)
cnt = 0
for train_index, val_index in kf.split(X_train):
  print(f'Fold number {cnt}')
  cnt+=1
  X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
  y_fold_train, y_fold_val = Y_train.values[train_index], Y_train.values[val_index]
  model.fit(X_fold_train, y_fold_train, epochs=10, batch_size=32, validation_data=(X_fold_val, y_fold_val), verbose=1)
  print()

Fold number 0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold number 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold number 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold number 3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Fold number 4
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



## Model evaluation

In [14]:
model.evaluate(X_train, Y_train)
model.evaluate(X_test, Y_test)



[1.9540480375289917, 0.7350000143051147]