## Mouinting Drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential
from keras.layers import Dense,SimpleRNN,Embedding,Flatten
from keras.losses import  CategoricalCrossentropy

## Reading CSV

[Dataset](https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis)

In [3]:
df1 = pd.read_csv('twitter_training.csv')
df2 = pd.read_csv('twitter_validation.csv')
df = pd.concat([df1, df2], axis=0)
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom‚Äôs great auntie as ‚ÄòHayley can‚Äôt get out of bed‚Äô and told to his grandma, who now thinks I‚Äôm a lazy, terrible person ü§£"
0,2401.0,Borderlands,Positive,I am coming to the borders and I will kill you...,,,,
1,2401.0,Borderlands,Positive,im getting on borderlands and i will kill you ...,,,,
2,2401.0,Borderlands,Positive,im coming on borderlands and i will murder you...,,,,
3,2401.0,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,,,,
4,2401.0,Borderlands,Positive,im getting into borderlands and i can murder y...,,,,
...,...,...,...,...,...,...,...,...
994,,,,,4891.0,GrandTheftAuto(GTA),Irrelevant,‚≠êÔ∏è Toronto is the arts and culture capital of ...
995,,,,,4359.0,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,,,,,2652.0,Borderlands,Positive,Today sucked so it‚Äôs time to drink wine n play...
997,,,,,8069.0,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [4]:
words = df.iloc[:,3]
words = np.array(words, 'str')

In [5]:
words

array(['I am coming to the borders and I will kill you all,',
       'im getting on borderlands and i will kill you all,',
       'im coming on borderlands and i will murder you all,', ..., 'nan',
       'nan', 'nan'], dtype='<U957')

## Splitting the Data

In [6]:
train_X = df1.iloc[:,3]
test_X = df2.iloc[:,3]
train_y = df1.iloc[:,2]
test_y = df2.iloc[:,2]

In [7]:
train_X.head()

0    I am coming to the borders and I will kill you...
1    im getting on borderlands and i will kill you ...
2    im coming on borderlands and i will murder you...
3    im getting on borderlands 2 and i will murder ...
4    im getting into borderlands and i can murder y...
Name: im getting on borderlands and i will murder you all ,, dtype: object

In [8]:
test_y.head()

0     Neutral
1    Negative
2    Negative
3     Neutral
4    Negative
Name: Irrelevant, dtype: object

In [9]:
train_X = np.array(train_X, 'str')
test_X = np.array(test_X, 'str')

## Count number of words

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)
len(tokenizer.word_index)

33784

In [11]:
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)
test_X[0]

[1828,
 189,
 87,
 1773,
 1951,
 2980,
 5620,
 1688,
 348,
 5456,
 30,
 5,
 22508,
 24527,
 1828,
 75,
 342,
 189,
 6523,
 24528]

## Use encoding technique on Y axis

In [12]:
label_encoder = preprocessing.LabelEncoder()
test_y = label_encoder.fit_transform(test_y)
train_y = label_encoder.fit_transform(train_y)

## Padding to make every sentence with equal length

In [14]:
train_X = pad_sequences(train_X,padding='post',maxlen=50)
test_X = pad_sequences(test_X,padding='post',maxlen=50)

In [15]:
train_X.shape

(74681, 50)

In [16]:
train_X[12]

array([ 1680,   345,  1681, 18783,  3319,   904,  1923,  7813,   141,
          71,    91,   581,    11, 10157,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)

In [18]:
test_X[0]

array([ 1828,   189,    87,  1773,  1951,  2980,  5620,  1688,   348,
        5456,    30,     5, 22508, 24527,  1828,    75,   342,   189,
        6523, 24528,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)

In [17]:
model = Sequential()
model.add(Embedding(33784, 4))
model.add(SimpleRNN(32,return_sequences=False))
model.add(Dense(1, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 4)           135136    
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                1184      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 136353 (532.63 KB)
Trainable params: 136353 (532.63 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(train_X, train_y, epochs=5, validation_data=(test_X,test_y))

Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
