# Classification of twitter water events
### - using deep neural networks

Author: Fadoua Ghourabi (fadouaghourabi@gmail.com)

Date: July 17, 2019

In [5]:
import os
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import losses
from ipynb.fs.full.fr_twitter_water_classification_ML import train_test_datasets
from ipynb.fs.full.fr_twitter_water_datasets import tweet_avg_w2v, tweet_avg_w2v_tfidf, tweet_d2v, tweet_avg_ft
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

## Water corpus

### Strategy 1: Averaging word2vectors

In [6]:
vecs, labels = pd.DataFrame(tweet_avg_w2v["TwVec"].values.tolist()), tweet_avg_w2v["Event"]
X, y, X_train, X_test, y_train, y_test = train_test_datasets(vecs,labels,stratify=True,random=43)

In [7]:
model = tf.keras.Sequential([keras.layers.Dense(units=5, input_shape=[50], activation=tf.nn.relu),
                             keras.layers.Dense(units=1, input_shape=[5], activation=tf.nn.softmax)])
model.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

W0718 10:28:03.680335 140735831102336 deprecation.py:506] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0718 10:28:03.771684 140735831102336 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [8]:
model.fit(X_train, y_train, epochs=10)

W0718 10:28:03.860865 140735831102336 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c2d55ce10>

In [9]:
model.evaluate(X_train, y_train),model.evaluate(X_test, y_test)



(2.9728002386170778e-08, 3.0247133169601214e-08)

In [10]:
y_pred = model.predict(X_test)

In [11]:
confusion_matrix(y_test, y_pred)

array([[  0, 100],
       [  0,  34]])

### Strategy 2: Averaging word2vectors with TF-IDF

In [12]:
vecs, labels = pd.DataFrame(tweet_avg_w2v_tfidf["TwVec"].values.tolist()), tweet_avg_w2v_tfidf["Event"]
X, y, X_train, X_test, y_train, y_test = train_test_datasets(vecs,labels,stratify=True,random=43)

In [13]:
model_tfidf = tf.keras.Sequential([keras.layers.Dense(units=5, input_shape=[50], activation=tf.nn.relu),
                                   keras.layers.Dense(units=1, input_shape=[5], activation=tf.nn.softmax)])
model_tfidf.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

In [14]:
model_tfidf.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c2d6c0630>

In [15]:
model.evaluate(X_train, y_train),model.evaluate(X_test, y_test)



(2.9728002386170778e-08, 3.0247133169601214e-08)

In [16]:
y_pred = model.predict(X_test)

In [17]:
confusion_matrix(y_test, y_pred)

array([[  0, 100],
       [  0,  34]])

### Strategy 3: doc2vec

In [18]:
vecs, labels = pd.DataFrame(tweet_d2v["TwVec"].values.tolist()), tweet_d2v["Event"]
X, y, X_train, X_test, y_train, y_test = train_test_datasets(vecs, labels, stratify=True, random=43)

In [19]:
model_d2v = tf.keras.Sequential([keras.layers.Dense(units=5, input_shape=[300], activation=tf.nn.relu),
                                 keras.layers.Dense(units=5, input_shape=[5], activation=tf.nn.relu),
                                 keras.layers.Dense(units=5, input_shape=[5], activation=tf.nn.relu),
                                 keras.layers.Dense(units=1, input_shape=[5], activation=tf.nn.softmax)])
model_d2v.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

In [20]:
model_d2v.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c143e5630>

In [21]:
model_d2v.evaluate(X_train, y_train),model_d2v.evaluate(X_test, y_test)



(2.9728002386170778e-08, 3.0247133169601214e-08)

In [22]:
y_pred = model_d2v.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[  0, 100],
       [  0,  34]])

## FastText corpus

In [24]:
vecs, labels = pd.DataFrame(tweet_avg_ft["TwVec"].values.tolist()), tweet_avg_ft["Event"]
X, y, X_train, X_test, y_train, y_test = train_test_datasets(vecs,labels,stratify=True,random=43)

In [25]:
model_ft = tf.keras.Sequential([keras.layers.Dense(units=10, input_shape=[300], activation=tf.nn.relu),
                                keras.layers.Dense(units=1, input_shape=[10], activation=tf.nn.softmax)])
model_ft.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

In [26]:
model_ft.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a589ef438>

In [27]:
model_ft.evaluate(X_train, y_train),model_ft.evaluate(X_test, y_test)



(2.9728002386170778e-08, 3.0247133169601214e-08)

In [28]:
y_pred = model_ft.predict(X_test)

In [29]:
confusion_matrix(y_test, y_pred)

array([[  0, 100],
       [  0,  34]])