# Classification of Twitter water events
### - using classical Machine Learning algorithms

Author: Fadoua Ghourabi (fadouaghourabi@gmail.com)

Date: version @ July 17, 2019 

In [1]:
import os
import time
import pandas as pd
import numpy as np
from ipynb.fs.full.fr_twitter_water_datasets import tweet_avg_w2v, tweet_avg_w2v_tfidf, tweet_d2v, tweet_avg_ft
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score # metric to evaluate the accuracy of the model

=== Datasets for water-related tweets ===
Language: French
Collection date: February 10, 2019 ~ July 20, 2019
Location: Sfax (center), 400km (radius)
Data size: (748, 10)
Features: 

              - Timestamp: date and time of collection. 

              - TwDate: date and time of tweet publication. 

              - TwLoc: location of the user

              - TwUserName: user name

              - TwUserID: user's unique ID

              - TwContent: tweet message

              - ContentLoc: list of locations that are included in the tweet

              - urls: list of urls that are included in the tweet

              - Event: label --> water shortage (1) or not (0)





In [2]:
# Baseline classifiers, keyed by a short name. Hyperparameters are the
# library defaults; random_state=10 is fixed wherever it is passed so that
# repeated runs are comparable.
model_dic = {
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(random_state=10),
    "LR": LogisticRegression(random_state=10),
    "GNB": GaussianNB(),
    "RF": RandomForestClassifier(random_state=10),
    "GB": GradientBoostingClassifier(random_state=10),
}
# Keep the individual module-level names bound to the same instances
# (dict values iterate in insertion order).
KNN, SVM, LR, GNB, RF, GB = model_dic.values()

In [3]:
def train_test_datasets(vecs,labels,stratify=True,random=10):
    '''
    - Description:
    train_test_datasets generates train and test datasets for ML algorithms.
    The split is delegated to sklearn's train_test_split; the train/test
    proportions are left at that function's defaults.
    - Input:
    vecs: word vector representation of tweets
    labels: labels of tweets (1: water shortage, 0: not water shortage)
    stratify: if true, the split is stratified on labels; otherwise it is not stratified
    random: seed forwarded to train_test_split as random_state
    - Output:
    6 objects, in this order: X (vectors), y (labels), X_train, X_test, y_train, y_test
    - History:
    July 17, 2019 --> implementation, to fix: any datatype of vecs and labels? (check function train_test_split)
    '''
    X, y = vecs, labels
    # stratify=None is train_test_split's "no stratification" value, so a
    # single call covers both the stratified and the plain split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y if stratify else None, random_state=random
    )

    return X, y, X_train, X_test, y_train, y_test

In [4]:
def ML_models(dic, X_train, y_train, X_test, y_test):
    '''
    - Description:
    ML_models fits every classification algorithm found in dic on the training set
    and evaluates it. With model_dic these are K-nearest neighbors, support vector
    machines, logistic regression, gradient boosting trees, random forest and
    Gaussian Naive Bayes.
    It also computes, for each model, the accuracy and the confusion matrix for
    further evaluation.
    - Input:
    dic: dictionary of ML algos (name -> estimator), e.g. model_dic
    X_train, y_train, X_test, y_test: train and test datasets
    - Output:
    models is a dictionary where keys are the names of the ML algos and values are tuples
    (fitted estimator, train accuracy, test accuracy, confusion matrix on the test set)
    - History:
    July 17, 2019 --> implementation, to do: default hyperparameters are used, further experiments are needed
    '''
    models = {}

    for name, algo in dic.items():
        # NOTE: fit() trains the estimator object held in dic itself, so the
        # estimator stored in the result is the very same object; calling
        # ML_models again with the same dic refits it on the new data.
        algo.fit(X_train, y_train)
        train_accuracy = algo.score(X_train, y_train)

        # Predict the test set once and derive both test metrics from it.
        # For sklearn classifiers score() is the accuracy of predict(), so
        # accuracy_score on these predictions yields the same value.
        algo_pred = algo.predict(X_test)
        test_accuracy = accuracy_score(y_test, algo_pred)
        algo_conf = confusion_matrix(y_test, algo_pred)

        models[name] = (algo, train_accuracy, test_accuracy, algo_conf)

    return models

## Water corpus

### Strategy 1: Averaging word2vectors

The vector representation of a tweet is computed as follows: 
Let $\mathcal{V}(M)$ be the vocabulary of a model $M$ and $t_i = (w_{i1}, \cdots, w_{in})$ be a tweet of length $n > 0$.
- we extract words $\mathcal{V}(M)\cap t_i$ that are in the vocabulary of the w2v model
- we convert the words to their w2v representation $M(w_{ij})$
- we deduce the tweet representation  $M(t_i) = \frac{\sum_{w_{ij}\in\mathcal{V}(M)\cap t_i} M(w_{ij})}{|\mathcal{V}(M)\cap t_i|}$

In [5]:
# Expand the TwVec column (assumed to hold one fixed-length vector per tweet)
# into a DataFrame of features; the Event column provides the labels.
vecs = pd.DataFrame(tweet_avg_w2v["TwVec"].values.tolist())
labels = tweet_avg_w2v["Event"]
# Stratified split with a fixed seed.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [6]:
# Fit and evaluate every classifier of model_dic on the current split.
results = ML_models(
    dic=model_dic,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



In [7]:
# Per model: (fitted estimator, train accuracy, test accuracy, test confusion matrix)
results

{'KNN': (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform'),
  0.8716577540106952,
  0.8074866310160428,
  array([[131,  14],
         [ 22,  20]])),
 'SVM': (SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, probability=False, random_state=10,
      shrinking=True, tol=0.001, verbose=False),
  0.7754010695187166,
  0.7754010695187166,
  array([[145,   0],
         [ 42,   0]])),
 'LR': (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=10, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  0.7754010695187166,
  0

### Strategy 2: Averaging word2vectors with TF-IDF

The vector representation of a tweet is computed as follows: 
Let $\mathcal{V}(M)$ be the vocabulary of a model $M$, $t_i = (w_{i1}, \cdots, w_{in})$ be a tweet of length $n > 0$ and TFIDF be a TF-IDF function.
- we extract words $\mathcal{V}(M)\cap t_i$ that are in the vocabulary of the w2v model
- we convert the words to their TF-IDF-weighted w2v representation $M(w_{ij})\times \text{TFIDF}(w_{ij})$
- we deduce the tweet representation  $M(t_i) = \frac{\sum_{w_{ij}\in\mathcal{V}(M)\cap t_i} M(w_{ij})\times \text{TFIDF}(w_{ij})}{|\mathcal{V}(M)\cap t_i|}$

In [8]:
# Expand the TwVec column (assumed to hold one fixed-length vector per tweet)
# into a DataFrame of features; the Event column provides the labels.
vecs = pd.DataFrame(tweet_avg_w2v_tfidf["TwVec"].values.tolist())
labels = tweet_avg_w2v_tfidf["Event"]
# Stratified split with a fixed seed.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [9]:
# Fit and evaluate every classifier of model_dic on the current split.
results = ML_models(
    dic=model_dic,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



In [10]:
# Per model: (fitted estimator, train accuracy, test accuracy, test confusion matrix)
results

{'KNN': (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform'),
  0.8948306595365418,
  0.8181818181818182,
  array([[132,  13],
         [ 21,  21]])),
 'SVM': (SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, probability=False, random_state=10,
      shrinking=True, tol=0.001, verbose=False),
  0.7754010695187166,
  0.7754010695187166,
  array([[145,   0],
         [ 42,   0]])),
 'LR': (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=10, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  0.7754010695187166,
  0

### Strategy 3: doc2vec

In [11]:
# Expand the TwVec column (assumed to hold one fixed-length vector per tweet)
# into a DataFrame of features; the Event column provides the labels.
vecs = pd.DataFrame(tweet_d2v["TwVec"].values.tolist())
labels = tweet_d2v["Event"]
# Stratified split with a fixed seed.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [12]:
# Fit and evaluate every classifier of model_dic on the current split.
results = ML_models(
    dic=model_dic,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



In [13]:
# Per model: (fitted estimator, train accuracy, test accuracy, test confusion matrix)
results

{'KNN': (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform'),
  0.8057040998217468,
  0.7540106951871658,
  array([[138,   7],
         [ 39,   3]])),
 'SVM': (SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, probability=False, random_state=10,
      shrinking=True, tol=0.001, verbose=False),
  0.7754010695187166,
  0.7754010695187166,
  array([[145,   0],
         [ 42,   0]])),
 'LR': (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=10, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  0.7754010695187166,
  0

## FastText corpus

In [14]:
# Expand the TwVec column (assumed to hold one fixed-length vector per tweet)
# into a DataFrame of features; the Event column provides the labels.
vecs = pd.DataFrame(tweet_avg_ft["TwVec"].values.tolist())
labels = tweet_avg_ft["Event"]
# Stratified split with a fixed seed.
X, y, X_train, X_test, y_train, y_test = train_test_datasets(
    vecs, labels, stratify=True, random=43
)

In [15]:
# Fit and evaluate every classifier of model_dic on the current split.
results = ML_models(
    dic=model_dic,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)



In [16]:
# Per model: (fitted estimator, train accuracy, test accuracy, test confusion matrix)
results

{'KNN': (KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                       weights='uniform'),
  0.9251336898395722,
  0.8663101604278075,
  array([[131,  14],
         [ 11,  31]])),
 'SVM': (SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, probability=False, random_state=10,
      shrinking=True, tol=0.001, verbose=False),
  0.7754010695187166,
  0.7754010695187166,
  array([[145,   0],
         [ 42,   0]])),
 'LR': (LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=10, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False),
  0.8413547237076648,
  0