In [43]:
# Import Functions
import pandas as pd
import numpy as np
import preprocessor as p
import re
import demoji 
import nltk
from autocorrect import Speller

from matplotlib import pyplot as plt

from sklearn import naive_bayes, svm, metrics, decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Local modules
from pipelines import TwitterPipeline
from datasets import load_tweets, load_annotated_tweets

# Download stopwords package if necessary
#nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [2]:
%matplotlib inline
spell = Speller()

In [3]:
# Load the Twitter data
tweets = load_tweets()
# Filter the tweets from Wales and format the text
tweets = TwitterPipeline().apply(tweets.data, verbosity=2)

Filter Tweets from Wales.....completed in 0:00:00.291668
Combine Text fields.....completed in 0:00:00.085749
Convert Geo Coordinates (in Floating Point).....completed in 0:00:00.433206
Collect BoundingBox Coordinates (GeoJSON).....completed in 0:00:01.478479
Collect BoundingBox Coordinates (Tuples).....completed in 0:00:00.245414
Match Local Authorities.....completed in 0:00:31.975333
Set DateTime Index.....completed in 0:00:22.116583


In [4]:
# Load the annotated tweets, combine them with the tweets
annotated = load_annotated_tweets()

In [5]:
# Get the annotated dataset
df = pd.merge(annotated, tweets, on="id_str", how="left")

In [6]:
df = df[["id_str", "text", "support_LH", "support_ND"]]

In [7]:
df.head()

Unnamed: 0,id_str,text,support_LH,support_ND
0,1242070780839636998,Will I [A or B or C] while in isolation?,0,0
1,1242070977321865217,@bbclaurak an someone ask @BorisJohnson as a ...,0,0
2,1242070982531088384,"Exotic Ebony,Torres style,taking shape.\n#luth...",0,0
3,1242071279538188288,@miFutureApp Aww. At least on day one of self ...,0,0
4,1242071760369078278,For anyone struggling to connect with nature i...,2,0


In [8]:
df.shape

(3232, 4)

In [9]:
df.dropna(subset=["text"],inplace=True)

In [10]:
df.shape

(2856, 4)

# Dataset cleaning

In [11]:
def normalise_tweets(text_col: pd.Series):    
    
    # Make sure every item is a string
    text_col = text_col.astype(str)
    
    # Remove hashtags
    text_norm = text_col.apply(lambda x: re.sub("#", "", x))
    
    # Use the tweet preprocesser to remove tweet-specific features from the text
    # It also replaces mentions with the word mention, and hashtags with the word hashtag
    text_norm = text_norm.apply(lambda tweet: p.tokenize(tweet))
    
    # Lower case all words
    text_norm = text_norm.str.lower()
    
    # Remove special characters (make sure to run this AFTER lowercasing)
    text_norm = text_norm.apply(lambda x: re.sub("[^a-z\s]", "", x))
    
    # Removing stopwords (using stopword list from NLTK)
    stop_words = set(stopwords.words('english'))
    text_norm = text_norm.apply(
        lambda x: " ".join(word for word in x.split() if word not in stop_words)
    )
    
#     # Apply spell correcter - Removed due to huge runtime
#     text_norm = text_norm.apply(
#         lambda x: " ".join(spell(word) for word in x.split()))
    
    # Lemmatise the text
    lemmatizer = WordNetLemmatizer() 
    text_norm = text_norm.apply(
        lambda x: " ".join(lemmatizer.lemmatize(word, pos="v") for word in x.split())
    )
    
    return text_norm

In [12]:
df["text_norm"] = normalise_tweets(df["text"])

In [13]:
df.head()

Unnamed: 0,id_str,text,support_LH,support_ND,text_norm
0,1242070780839636998,Will I [A or B or C] while in isolation?,0,0,b c isolation
1,1242070977321865217,@bbclaurak an someone ask @BorisJohnson as a ...,0,0,mention someone ask mention second case covid ...
2,1242070982531088384,"Exotic Ebony,Torres style,taking shape.\n#luth...",0,0,esmileytic ebonytorres styletaking shape luthi...
3,1242071279538188288,@miFutureApp Aww. At least on day one of self ...,0,0,mention aww least day one self isolation work ...
4,1242071760369078278,For anyone struggling to connect with nature i...,2,0,anyone struggle connect nature current selfiso...


## Decide on labels

In [14]:
df.loc[(df['support_LH']!=0) & (df['support_ND']!=0)].shape

(177, 5)

In [15]:
df.loc[df['support_LH']==1].shape

(432, 5)

In [16]:
df.loc[df['support_ND']==1].shape

(221, 5)

In [17]:
df.loc[(df['support_LH']==1) | (df['support_ND']==1)].shape

(492, 5)

In [18]:
# Initiate new column called label
df["label"] = 0

In [19]:
# Make label based on those entries that LH and ND both agreed might be support related
#df["label"].loc[(df['support_LH']!=0) & (df['support_ND']!=0)] = 1

In [20]:
df["label"].value_counts()

0    2856
Name: label, dtype: int64

In [22]:
# Make label based on those entries that LH and ND both agreed might be support related
df["label"].loc[df['support_LH']==1] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [25]:
df["label"].value_counts()

0    2424
1     432
Name: label, dtype: int64

In [23]:
# Set up new dataframe
df = df[["id_str", "text_norm", "label"]]

In [24]:
df.head()

Unnamed: 0,id_str,text_norm,label
0,1242070780839636998,b c isolation,0
1,1242070977321865217,mention someone ask mention second case covid ...,0
2,1242070982531088384,esmileytic ebonytorres styletaking shape luthi...,0
3,1242071279538188288,mention aww least day one self isolation work ...,0
4,1242071760369078278,anyone struggle connect nature current selfiso...,0


## Now lets start getting ready to train a model...

### Test/Train Split

In [26]:
df.set_index('id_str', inplace=True)

In [27]:
X = df["text_norm"]
y = df["label"]

In [28]:
# Create train (80%), test (20%) split 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

### Generate features

In [29]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1000)
tfidf_vect.fit(df['text_norm'])
xtrain_tfidf =  tfidf_vect.transform(X_train)
xtest_tfidf =  tfidf_vect.transform(X_test)

In [30]:
# create a count vectorizer object for LDA
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['text_norm'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(X_train)
xtest_count =  count_vect.transform(X_test)

In [31]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
xtrain_lda = lda_model.fit_transform(xtrain_count)
xtest_lda = lda_model.fit_transform(xtest_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

In [37]:
# Create feature matrix
xtest_features = np.concatenate([xtest_tfidf.toarray(), xtest_lda], axis=1)
xtrain_features = np.concatenate([xtrain_tfidf.toarray(), xtrain_lda], axis=1)

In [41]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.accuracy_score(predictions, y_test)

In [44]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_features, y_train, xtest_features)
print ("NB: ", accuracy)

NB:  0.8513986013986014


In [45]:
accuracy = train_model(svm.SVC(), xtrain_features, y_train, xtest_features)
print ("SVM: ", accuracy)

SVM:  0.8479020979020979


In [46]:
accuracy = train_model(LogisticRegression(), xtrain_features, y_train, xtest_features)
print ("Logistic Regression: ", accuracy)

Logistic Regression:  0.8479020979020979


In [49]:
model = naive_bayes.MultinomialNB()
model.fit(xtrain_features, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [58]:
import pickle
import os
file_out = "tweet_classifer.pkl"
pickle.dump(model, open(file_out, 'wb'))