# Classification of Twitter water events
### - using deep neural networks

Author: Fadoua Ghourabi (fadouaghourabi@gmail.com)

Date: version @ July 17, 2019 

In [None]:
import os
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import losses
from sklearn.metrics import accuracy_score
from ipynb.fs.full.fr_twitter_water_classification_ML import train_test_datasets
from ipynb.fs.full.fr_twitter_water_datasets import tweet_avg_w2v, tweet_avg_w2v_tfidf, tweet_d2v, tweet_avg_ft
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

## Water corpus

### Strategy 1: Averaging word2vec vectors

In [None]:
# Expand the per-tweet vectors in the "TwVec" column into one DataFrame column
# per component (assumes every entry is a sequence of the same length), and
# take the "Event" column as the target.
vecs = pd.DataFrame(tweet_avg_w2v["TwVec"].values.tolist())
labels = tweet_avg_w2v["Event"]

# NOTE(review): train_test_datasets is a project helper imported from another
# notebook; presumably a stratified train/test split with a fixed seed -- confirm.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [None]:
# Feed-forward binary classifier: 50 inputs -> three tanh layers of 5 units
# -> a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(keras.layers.Dense(units=5, input_shape=[50], activation=tf.nn.tanh))
for _ in range(2):
    model.add(keras.layers.Dense(units=5, input_shape=[5], activation=tf.nn.tanh))
model.add(keras.layers.Dense(units=1, input_shape=[5], activation=tf.nn.sigmoid))

# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# tracked so that fit/evaluate report it alongside the loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train the strategy-1 network on the training split for 10 epochs.
model.fit(X_train, y_train, epochs=10)

In [None]:
# Evaluate on both splits; the cell displays the pair (train result, test result).
model.evaluate(X_train, y_train),model.evaluate(X_test, y_test)

In [None]:
def y_class(threshold, y, class1, class2):
    """Turn a sequence of scores into a list of class labels.

    Each element of ``y`` strictly below ``threshold`` is mapped to
    ``class1``; every other element (including one equal to the threshold)
    is mapped to ``class2``. The result has one label per element of ``y``,
    in the same order.
    """
    return [class1 if score < threshold else class2 for score in y]

In [None]:
# Raw outputs of the sigmoid unit (scores in [0, 1]) for both splits.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Sequential.predict_classes was removed from tf.keras (TF 2.6). For a single
# sigmoid output it was equivalent to thresholding the prediction at 0.5, so
# derive the class labels from the scores computed above instead of running a
# second forward pass.
y_train_pred_class = (y_train_pred > 0.5).astype("int32")
y_test_pred_class = (y_test_pred > 0.5).astype("int32")

In [None]:
# Accuracy of the predicted classes on the training split.
accuracy_score(y_train, y_train_pred_class)

In [None]:
# Accuracy of the predicted classes on the held-out test split.
accuracy_score(y_test, y_test_pred_class)

In [None]:
# Confusion matrix on the test split (rows: true labels, columns: predictions).
confusion_matrix(y_test, y_test_pred_class)

In [None]:
# Confusion matrix on the training split (rows: true labels, columns: predictions).
confusion_matrix(y_train, y_train_pred_class)

### Strategy 2: Averaging word2vec vectors with TF-IDF

In [None]:
# Expand the per-tweet vectors in the "TwVec" column into one DataFrame column
# per component (assumes every entry is a sequence of the same length), and
# take the "Event" column as the target.
vecs = pd.DataFrame(tweet_avg_w2v_tfidf["TwVec"].values.tolist())
labels = tweet_avg_w2v_tfidf["Event"]

# NOTE(review): train_test_datasets is a project helper imported from another
# notebook; presumably a stratified train/test split with a fixed seed -- confirm.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [None]:
model_tfidf = tf.keras.Sequential([keras.layers.Dense(units=5, input_shape=[50], activation=tf.nn.relu),
                                   keras.layers.Dense(units=1, input_shape=[5], activation=tf.nn.softmax)])
model_tfidf.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

In [None]:
model_tfidf.fit(X_train, y_train, epochs=10)

In [None]:
model_tfidf.evaluate(X_train, y_train),model_tfidf.evaluate(X_test, y_test)

In [None]:
y_train_pred = model_tfidf.predict(X_train)
accuracy_score(y_train, y_train_pred)

In [None]:
y_test_pred = model_tfidf.predict(X_test)
accuracy_score(y_test, y_test_pred)

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
confusion_matrix(y_train, y_train_pred)

### Strategy 3: doc2vec

In [None]:
# Expand the per-tweet vectors in the "TwVec" column into one DataFrame column
# per component (assumes every entry is a sequence of the same length), and
# take the "Event" column as the target.
vecs = pd.DataFrame(tweet_d2v["TwVec"].values.tolist())
labels = tweet_d2v["Event"]

# NOTE(review): train_test_datasets is a project helper imported from another
# notebook; presumably a stratified train/test split with a fixed seed -- confirm.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [None]:
# Inspect the training matrix dimensions; the feature count should match the
# input_shape=[300] used by the model defined in the next cell.
X_train.shape

In [None]:
model_d2v = tf.keras.Sequential([keras.layers.Dense(units=5, input_shape=[300], activation=tf.nn.relu),
                                 keras.layers.Dense(units=5, input_shape=[5], activation=tf.nn.relu),
                                 keras.layers.Dense(units=5, input_shape=[5], activation=tf.nn.relu),
                                 keras.layers.Dense(units=1, input_shape=[5], activation=tf.nn.softmax)])
model_d2v.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

In [None]:
model_d2v.fit(X_train, y_train, epochs=10)

In [None]:
model_d2v.evaluate(X_train, y_train),model_d2v.evaluate(X_test, y_test)

In [None]:
y_train_pred = model_d2v.predict(X_train)
accuracy_score(y_train, y_train_pred)

In [None]:
y_test_pred = model_d2v.predict(X_test)
accuracy_score(y_test, y_test_pred)

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
confusion_matrix(y_train, y_train_pred)

## FastText corpus

In [None]:
# Expand the per-tweet vectors in the "TwVec" column into one DataFrame column
# per component (assumes every entry is a sequence of the same length), and
# take the "Event" column as the target.
vecs = pd.DataFrame(tweet_avg_ft["TwVec"].values.tolist())
labels = tweet_avg_ft["Event"]

# NOTE(review): train_test_datasets is a project helper imported from another
# notebook; presumably a stratified train/test split with a fixed seed -- confirm.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [None]:
model_ft = tf.keras.Sequential([keras.layers.Dense(units=10, input_shape=[300], activation=tf.nn.relu),
                                keras.layers.Dense(units=1, input_shape=[10], activation=tf.nn.softmax)])
model_ft.compile(optimizer='sgd', loss=losses.categorical_crossentropy)

In [None]:
model_ft.fit(X_train, y_train, epochs=10)

In [None]:
model_ft.evaluate(X_train, y_train),model_ft.evaluate(X_test, y_test)

In [None]:
y_train_pred = model_ft.predict(X_train)
accuracy_score(y_train, y_train_pred)

In [None]:
y_test_pred = model_ft.predict(X_test)
accuracy_score(y_test, y_test_pred)

In [None]:
confusion_matrix(y_test, y_test_pred)

In [None]:
confusion_matrix(y_train, y_train_pred)