# Twitter Sentiment Analysis

## 1. Import necessary libraries:

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [2]:
# Show full cell contents (no column-width truncation) so whole tweets are visible
pd.options.display.max_colwidth = None

## 2. Prepare the data:

The `keyword` and `location` columns are not needed for this model, so we remove them after loading the data.

In [3]:
# Directory holding train.csv and test.csv (absolute Kaggle input path; change it when running elsewhere)
root_folder = '/kaggle/input/nlp-getting-started/'

In [4]:
# Load the train and test splits from the input folder
df_train = pd.read_csv(os.path.join(root_folder, 'train.csv'))
df_test = pd.read_csv(os.path.join(root_folder, 'test.csv'))

In [5]:
# Preview the first rows of the training data as loaded
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [6]:
# Remove the columns the model does not use.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to the same names, a second run would otherwise raise KeyError since
# the columns are already gone.
df_train = df_train.drop(columns=['keyword', 'location'], errors='ignore')
df_test = df_test.drop(columns=['keyword', 'location'], errors='ignore')

In [7]:
# Check the training frame again after dropping keyword and location
df_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,"13,000 people receive #wildfires evacuation orders in California",1
4,7,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


## 3. Setting the dataset variables:

In [8]:
# Order both splits by tweet id (ascending is the default)
x = df_train.sort_values('id')
x_test = df_test.sort_values('id')

In [9]:
# Retrieve the tweet text from each dataframe as a 1-D numpy array.
# Selecting the column by name raises KeyError if 'text' is missing, whereas a
# boolean mask over the column names would silently produce an empty array.
twitts = x['text'].to_numpy()
twitts_test = x_test['text'].to_numpy()

In [10]:
# Show the first ten collected tweets:
print(twitts[:10])

['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'
 'Forest fire near La Ronge Sask. Canada'
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
 '13,000 people receive #wildfires evacuation orders in California '
 'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '
 '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires'
 '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas'
 "I'm on top of the hill and I can see a fire in the woods..."
 "There's an emergency evacuation happening now in the building across the street"
 "I'm afraid that the tornado is coming to our area..."]


## 4. Applying NLP:
We need to use the usual tokenization techniques in order to translate the text data to numerical vectors that we can feed to a neural network.

In [11]:
# Shared tokenizer: fitted once by preprocess() and reused to encode both splits
t = Tokenizer()
def preprocess(text):
    """Fit the module-level tokenizer ``t`` on ``text`` and encode it.

    Args:
        text: iterable of strings used to build the tokenizer's vocabulary.

    Returns:
        The result of ``t.texts_to_sequences(text)``: one integer sequence
        per input string.
    """
    # fit_on_texts only updates the tokenizer in place and returns None, so
    # returning its result (as before) always handed None back to the caller.
    t.fit_on_texts(text)
    return t.texts_to_sequences(text)

In [12]:
# Fit the tokenizer on the training tweets only (the test tweets are not passed in here)
tokens=preprocess(twitts)

In [13]:
# Encode every tweet of both splits as a sequence of word indices
X_train, X_test = (t.texts_to_sequences(batch) for batch in (twitts, twitts_test))

In [14]:
# Pad sequences.
# Both splits are padded to one shared length taken from the training set.
# Padding each split to its own longest tweet (as before) could give train and
# test a different number of timesteps and a different amount of zero padding.
# Test tweets longer than max_len are truncated by pad_sequences.
max_len = max(len(seq) for seq in X_train)
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len)

In [15]:
# This is a single encoded (and padded) tweet:
print(X_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0  119 4633   24    4  868    8   21  263
  138 1619 4634   89   40]


In [16]:
# Size of the vocabulary: the number of entries in the fitted tokenizer's word_index
vocab_size = len(t.word_index)
print('Size of vocabulary:', vocab_size)

Size of vocabulary: 22700


In [17]:
# Target training variable.
# Read the labels from `x`, the id-sorted frame the tweets were extracted from,
# so that y_train[i] is the label of twitts[i] even when train.csv is not
# stored in id order (df_train itself is never sorted).
y_train = x.target.values

In [18]:
# y_train holds the labels (1 for a true disaster, 0 otherwise); show the first one:
print(y_train[0])

1


## 5. Training the RNN model:

In [19]:
# Build the network: embedding -> LSTM -> dropout -> dense layer -> sigmoid output
model = Sequential([
    Embedding(input_dim=vocab_size + 2, output_dim=32, name='Embedding'),
    LSTM(64),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Binary classification: cross-entropy loss, accuracy reported during training
model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
model.summary()


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=false
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hyper'
   KMP_REDUCTION_BARRIER='1,1'
   KMP_REDUCTION_BAR

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, None, 32)          726464    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 755,521
Trainable params: 755,521
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train for 10 epochs in shuffled mini-batches of 64 tweets
history = model.fit(x=X_train, y=y_train, batch_size=64, epochs=10, shuffle=True)

Epoch 1/10


2022-01-06 01:47:06.053583: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# NOTE: this scores the model on the same data it was fitted on, so the result
# does not show how well it generalizes to unseen tweets.
train_score = model.evaluate(X_train, y_train)



## 6. Making predictions:
The results are collected in a dataframe containing the tweets and their corresponding predicted labels.

In [22]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels as a flat int32 array
preds = (model.predict(X_test) > 0.5).astype('int32').flatten()

In [23]:
# Pair every test tweet with its predicted label and preview the result
results = dict(text=twitts_test, target=preds)
df_results = pd.DataFrame(data=results)
df_results.head()

Unnamed: 0,text,target
0,Just happened a terrible car crash,1
1,"Heard about #earthquake is different cities, stay safe everyone.",0
2,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",1
3,Apocalypse lighting. #Spokane #wildfires,1
4,Typhoon Soudelor kills 28 in China and Taiwan,1


## 7. Compiling the submission file:
We need to do a bit of manipulation, since the submission file must contain only each tweet's id with its corresponding prediction.

In [24]:
# `preds` was computed from `twitts_test`, which was read row by row from
# `x_test`, so prediction i belongs to row i of `x_test`. Attach the
# predictions positionally instead of joining back on the tweet text:
# text is not a unique key, so that join is many-to-many and relied on
# drop_duplicates to undo the row blow-up.
submission = x_test[['id', 'text']].assign(target=preds)
# Every test tweet must receive exactly one prediction
assert len(submission) == len(df_test), "submission must have one row per test tweet"
submission.drop(columns=['text']).to_csv('submission.csv', index=False)