In [1]:
import nltk
import re
import numpy as np
import math
import pandas as pd
from nltk.corpus import stopwords as StopwordsLoader
from nltk.tokenize.casual import TweetTokenizer
from textblob import TextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
np.set_printoptions(threshold=sys.maxsize)
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import evaluation

In [2]:
#To evaluate my code on additional data, the file locations should be entered into 
#the list “filenames”. 
#Then, the whole notebook should be run, with the results being output in the final two cells. 

filenames = ["semeval-tweets/twitter-test1.txt", "semeval-tweets/twitter-test2.txt", "semeval-tweets/twitter-test3.txt"] 

In [3]:
nltk.download("wordnet")


[nltk_data] Downloading package wordnet to /Users/a226/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
train = pd.read_csv('semeval-tweets/twitter-training-data.txt', sep="\t", names=["id", "sentiment", "tweet"])


In [5]:
def load_set(filename):
    list_of_words = []
    with open(filename,"r", encoding='latin-1') as f:
        for line in f.readlines():
            tmp = line.rstrip("\n")
            list_of_words.append(tmp)
    return set(list_of_words)

In [6]:
positivewords = load_set("positive-words.txt")
negativewords = load_set("negative-words.txt")
badwords = load_set("bad-words.txt")
negationwords = {"no", "not","none","nobody","nothing","neither","nowhere",
                    "never","hardly","scarcely","barely","doesnt","isnt","wasnt",
                    "shouldnt","wouldnt","couldnt","wont","cant","dont","arent","amnt"}


# Preprocessing 


In [7]:
def regexp(data):    
    url_pattern = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    an_pattern = re.compile(r"[.,\!$%\^&\*\(\)=\-_`~\+\{\}<>\?:;\'\"\[\]\|]")
    sl_pattern = re.compile(r"(\b\w\b)")
    pos_em_pattern = re.compile(r':\)|:]|:3|:>|8\)|\(:|=\)|=]|:\'\)|:-\)')
    neg_em_pattern = re.compile(r':\(|:\[|:<|8\(|\):|=\(|=\[|:\'\(|:-\(')
    usermention_pattern = re.compile(r'^(?!.*\bRT\b)(?:.+\s)?@\w+')
    laugh_pattern = re.compile(r"\b(?:a*(?:ha)+h?|h*ha+h[ha]*|(?:l+o+)+l+|o?l+o+l+[ol]*)\b")
    pattern_elonganted = re.compile(r'(.)\1+')
    number_pattern = re.compile(r"\w*\d\w*")
    for i in range(len(data)):
        st = pos_em_pattern.sub('POSEMOJY', data['tweet'][i])
        st = neg_em_pattern.sub('NEGEMOJY', st)
        st = usermention_pattern.sub('USERMENTION', st)
        st = laugh_pattern.sub('LAUGHINTWEET', st)
        st = url_pattern.sub("URLINTWEET", st)
        st = number_pattern.sub("", st )
        st = pattern_elonganted.sub(r'\1\1',st)
        st = an_pattern.sub(" ", st)
        st = sl_pattern.sub("", st)
        st = st.replace("/", " ")
        st = st.replace("\\", " ")
        data['tweet'] = data['tweet'].replace(data['tweet'][i], st)       
    return data



 # Tokenization

In [8]:
def tokenization(data):
    text_tokens = []
    tt = TweetTokenizer()
    for text in data['tweet']:
        text = text.lower()
        text = text.encode("ascii", "ignore")
        text_tokens.append(tt.tokenize(text))   
    return text_tokens



# Lemmatization

In [9]:
def lemmatization(data):
    sentences = []
    for tokens in data:
        sentences.append(lemsentence(tokens))   
    return sentences

def lemsentence(tokens):
    lemm = nltk.stem.WordNetLemmatizer()
    stop_words = StopwordsLoader.words("English")
    sentence = []
    for word, tag in nltk.tag.pos_tag(tokens):
        if word.lower() in stop_words:
            continue
        if word.lower() in negationwords:
            word = "negationintweet"
        if tag.startswith('N'):
            pos = nltk.corpus.wordnet.NOUN
        elif tag.startswith('V'):
            pos = nltk.corpus.wordnet.VERB
        else:
            pos = nltk.corpus.wordnet.ADJ
        
        st = lemm.lemmatize(word, pos).lower()
        sentence.append(st)
    return sentence



In [10]:
def counter(tweet, words_set):
    count = 0
    for word in tweet:
        if word in words_set:
            count += 1
    return count

In [11]:
def create_feature_vector(tweet):
    res = []
    res.append(counter(tweet,positivewords))
    res.append(counter(tweet,negativewords))
    res.append(counter(tweet,badwords))
    return res


In [12]:
def handle_dataset(dataset):
    dataset = regexp(dataset)
    dataset["tokens"] = tokenization(dataset)
    dataset["tokens"] = lemmatization(dataset["tokens"])
    corpus = []
    counts = []
    for tweet in dataset["tokens"]:
        counts.append(create_feature_vector(tweet))
        corpus.append(' '.join(tweet))
    df = pd.DataFrame(counts, columns = ['PositiveWords', 'NegativeWords', 'BadWords']) 
    return corpus, df







In [13]:
print(len(train['sentiment']))

45026


In [14]:
traindata, traincount_df = handle_dataset(train)
train_y = train['sentiment']

In [15]:
# build CountVectorizer on train 
cv = CountVectorizer()
cv_train_features = cv.fit_transform(traindata)
traincv_df = pd.DataFrame(cv_train_features.toarray(),columns=list(cv.get_feature_names()))
ch2cv = SelectKBest(chi2, k=500)
X_traincv = ch2cv.fit_transform(traincv_df, train_y)
cols = ch2cv.get_support(indices=True)
featurescv = traincv_df.columns
featurescv_k = []
for i in cols:
    featurescv_k.append(featurescv[i])
X_traincv = pd.DataFrame(X_traincv,columns = featurescv_k)
X_traincv = pd.concat([X_traincv, traincount_df],  axis=1)
print(X_traincv.columns)
# build TFIDF features on train 
tv = TfidfVectorizer()
tv_train_features = tv.fit_transform(traindata)
traintv_df = pd.DataFrame(tv_train_features.toarray(),columns=list(tv.get_feature_names()))
ch2tv = SelectKBest(chi2, k=500)
X_traintv = ch2tv.fit_transform(traintv_df, train_y)
cols = ch2tv.get_support(indices=True)
featurestv = traintv_df.columns
featurestv_k = []
for i in cols:
    featurestv_k.append(featurestv[i])
X_traintv = pd.DataFrame(X_traintv,columns = featurestv_k)
X_traintv = pd.concat([X_traintv, traincount_df],  axis=1)
print(X_traintv.columns)


Index(['abortion', 'abuse', 'ac', 'aleppo', 'allow', 'alt', 'amaze', 'amazing',
       'amazon', 'amendment',
       ...
       'xx', 'xxl', 'yakub', 'yay', 'yoga', 'york', 'zac', 'PositiveWords',
       'NegativeWords', 'BadWords'],
      dtype='object', length=503)
Index(['abortion', 'ac', 'aleppo', 'allow', 'alt', 'amaze', 'amazing',
       'amazon', 'amendment', 'america',
       ...
       'wtf', 'xx', 'xxl', 'yakub', 'yay', 'yoga', 'zac', 'PositiveWords',
       'NegativeWords', 'BadWords'],
      dtype='object', length=503)


In [16]:
print('CV model features shape:', X_traincv.shape)
print('TFIDF model features shape:', X_traintv.shape)

CV model features shape: (45026, 503)
TFIDF model features shape: (45026, 503)


In [17]:
traindata_tv = X_traintv
traindata_cv = X_traincv

In [18]:
mnb = MultinomialNB()
gnb = GaussianNB()
pac = PassiveAggressiveClassifier(C=0.0001, loss='squared_hinge')
classifiers = [mnb, gnb, pac]
classifiers_names = ["MultinomialNB","GaussianNB","PassiveAggressiveClassifier"]

In [19]:
def testPpeparation(FILENAME, flag = 1):
    test = pd.read_csv(FILENAME, sep="\t", names=["id", "sentiment", "tweet"]
           )
    test_id = test['id']
    test1_y = test['sentiment']
    test, testcount_df  = handle_dataset(test)   
    if flag == 2:
        return test, test1_y, test_id
    cv = CountVectorizer(vocabulary = featurescv_k)
    tv = TfidfVectorizer(vocabulary = featurestv_k)
    cvtest_features = (cv.fit_transform(test)).toarray()
    tvtest_features = (tv.fit_transform(test)).toarray()
    test_tv = pd.DataFrame(tvtest_features, columns=list(tv.get_feature_names()))
    test_tv = pd.concat([test_tv, testcount_df], axis = 1)
    test_cv = pd.DataFrame(cvtest_features, columns=list(cv.get_feature_names()))
    test_cv = pd.concat([test_cv, testcount_df], axis = 1)
    if flag == 1:
        return test_tv, test_cv, test1_y
    else: 
        return test_tv, test_cv, test1_y, test_id



In [20]:

def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', np.round(
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        4))
    print('F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels,
                                               average='weighted'),
                        4))
                        

In [21]:
def train_predict_model(classifier, 
                        train_features, train_labels, 
                        test_features, test_labels):
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features) 
    return predictions    


In [None]:
for filee in filenames:
    print(filee)
    test_tv, test_cv, test_y = testPpeparation(filee)
    for i in range(len(classifiers)):
        print("CV" + classifiers_names[i])
        get_metrics(test_y, train_predict_model(classifiers[i], traindata_cv, train_y, test_cv, test_y ) )
        print("TV" + classifiers_names[i])
        get_metrics(test_y, train_predict_model(classifiers[i], traindata_tv, train_y, test_tv, test_y ) )

semeval-tweets/twitter-test1.txt
CVMultinomialNB
Accuracy: 0.629
F1 Score: 0.6194
TVMultinomialNB
Accuracy: 0.6182
F1 Score: 0.5995
CVGaussianNB
Accuracy: 0.4594
F1 Score: 0.3519
TVGaussianNB
Accuracy: 0.4721
F1 Score: 0.3817
CVPassiveAggressiveClassifier
Accuracy: 0.6497
F1 Score: 0.6298
TVPassiveAggressiveClassifier
Accuracy: 0.5811
F1 Score: 0.5602
semeval-tweets/twitter-test2.txt
CVMultinomialNB
Accuracy: 0.6357
F1 Score: 0.6355
TVMultinomialNB
Accuracy: 0.6141
F1 Score: 0.6077
CVGaussianNB
Accuracy: 0.5364
F1 Score: 0.4183
TVGaussianNB
Accuracy: 0.5532
F1 Score: 0.4652
CVPassiveAggressiveClassifier
Accuracy: 0.6336
F1 Score: 0.6296
TVPassiveAggressiveClassifier
Accuracy: 0.6125
F1 Score: 0.6026
semeval-tweets/twitter-test3.txt
CVMultinomialNB
Accuracy: 0.5956
F1 Score: 0.5914
TVMultinomialNB
Accuracy: 0.5851
F1 Score: 0.5708
CVGaussianNB
Accuracy: 0.4704
F1 Score: 0.3742
TVGaussianNB
Accuracy: 0.4834
F1 Score: 0.4071
CVPassiveAggressiveClassifier


# Neural Model

In [None]:
seed = 101 
np.random.seed(seed)

def change(x):
    arr = []
    for elem in x:
        if elem == "positive":
            arr.append(2)
        if elem == "negative":
            arr.append(1)
        if elem == "neutral":
            arr.append(0)
    return np.asarray(arr)
            

In [25]:
X_train = traindata
max_features = 5000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
Y_train = change(train_y)
max_words = 50 
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
Y_train = to_categorical(Y_train, 3)
batch_size = 128
epochs = 16
def get_model(max_features, embed_dim, embedding_matrix):
    np.random.seed(seed)
    K.clear_session()
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1],
                       weights=[embedding_matrix],trainable=False))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    print(model.summary())
    return model

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
    
def get_embed_mat(EMBEDDING_FILE, max_features=5000):
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding='utf8'))
    print('Found %s word vectors.' % len(embeddings_index))
    word_index = tokenizer.word_index
    num_words = min(max_features, len(word_index) + 1)
    all_embs = np.stack(embeddings_index.values())
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), 
                                        (num_words, embed_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    max_features = embedding_matrix.shape[0]
    
    return max_features, embedding_matrix
    
EMBEDDING_FILE = 'glove.6b/glove.6B.100d.txt'
embed_dim = 100 
max_features, embedding_matrix = get_embed_mat(EMBEDDING_FILE)


Found 400000 word vectors.


In [348]:

model = get_model(max_features, embed_dim, embedding_matrix)
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=1)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           500000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 3)                 303       
Total params: 580,703
Trainable params: 80,703
Non-trainable params: 500,000
_________________________________________________________________
None
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [353]:
for filename in filenames:
    X_test, y_test, y_id = testPpeparation(filename, 2)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test = sequence.pad_sequences(X_test, maxlen=max_words)
    y_test = change(y_test)
    y_pred = model.predict(X_test, verbose=1)
    y_pred = np.argmax(y_pred, axis=1)
    print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.60      0.79      0.68      1504
           1       0.70      0.32      0.44       557
           2       0.72      0.64      0.68      1470

    accuracy                           0.65      3531
   macro avg       0.67      0.58      0.60      3531
weighted avg       0.67      0.65      0.64      3531

              precision    recall  f1-score   support

           0       0.54      0.75      0.62       669
           1       0.71      0.30      0.42       202
           2       0.76      0.65      0.70       982

    accuracy                           0.65      1853
   macro avg       0.67      0.57      0.58      1853
weighted avg       0.68      0.65      0.64      1853

              precision    recall  f1-score   support

           0       0.57      0.80      0.66       983
           1       0.64      0.34      0.44       363
           2       0.71      0.55      0.62      1033

    accuracy        

In [350]:

for filename in filenames:
    test_tv, test_cv, test_y, test_id = testPpeparation(filename, flag = 3)
    for i in range(len(classifiers)):
        clf = classifiers[i]
        print("TV")
        clf.fit(traindata_tv, train_y)
        pred_tv = clf.predict(test_tv)
        pred_dict = {}
        for j in range(len(pred_tv)):
            id_ = test_id[j]
            pred_dict[ str(id_) ] = pred_tv[j]
        #print(pred_dict)   
        evaluation.evaluate(pred_dict, filename, classifiers_names[i])
        evaluation.confusion(pred_dict, filename, classifiers_names[i])
        print("CV")

        clf.fit(traindata_cv, train_y)
        pred_cv = clf.predict(test_cv)
        pred_dict = {}
        for j in range(len(pred_cv)):
            id_ = test_id[j]
            pred_dict[str(id_)] = pred_cv[j]

        evaluation.evaluate(pred_dict, filename, classifiers_names[i])
        evaluation.confusion(pred_dict, filename, classifiers_names[i])


TV
semeval-tweets/twitter-test1.txt (MultinomialNB): 0.457
            positive  negative  neutral
positive    0.694     0.058     0.248     
negative    0.121     0.686     0.193     
neutral     0.273     0.182     0.546     

CV
semeval-tweets/twitter-test1.txt (MultinomialNB): 0.513
            positive  negative  neutral
positive    0.682     0.060     0.258     
negative    0.137     0.641     0.222     
neutral     0.272     0.166     0.563     

TV
semeval-tweets/twitter-test1.txt (GaussianNB): 0.507
            positive  negative  neutral
positive    0.477     0.100     0.423     
negative    0.175     0.450     0.375     
neutral     0.336     0.178     0.486     

CV
semeval-tweets/twitter-test1.txt (GaussianNB): 0.433
            positive  negative  neutral
positive    0.451     0.125     0.423     
negative    0.176     0.562     0.261     
neutral     0.304     0.199     0.497     

TV
semeval-tweets/twitter-test1.txt (PassiveAggressiveClassifier): 0.451
            posit

In [354]:
for filename in filenames:
    X_test, y_test, test_id = testPpeparation(filename, 2)
    X_test = tokenizer.texts_to_sequences(X_test)
    X_test = sequence.pad_sequences(X_test, maxlen=max_words)
    y_test = change(y_test)
    y_pred = model.predict(X_test, verbose=1)
    y_pred = np.argmax(y_pred, axis=1)
    pred_dict = {}
    for j in range(len(y_pred)):
        id_ = test_id[j]
        if y_pred[j] == 1:
            pred_dict[ str(id_) ] = 'negative'
        if y_pred[j] == 2:
            pred_dict[ str(id_) ] = 'positive'
        if y_pred[j] == 0:
            pred_dict[ str(id_) ] = 'neutral'

    evaluation.evaluate(pred_dict, filename, "model")
    evaluation.confusion(pred_dict, filename, "model")



semeval-tweets/twitter-test1.txt (model): 0.521
            positive  negative  neutral
positive    0.722     0.071     0.208     
negative    0.142     0.689     0.169     
neutral     0.274     0.150     0.576     

semeval-tweets/twitter-test2.txt (model): 0.527
            positive  negative  neutral
positive    0.755     0.053     0.193     
negative    0.133     0.733     0.133     
neutral     0.392     0.105     0.503     

semeval-tweets/twitter-test3.txt (model): 0.498
            positive  negative  neutral
positive    0.710     0.091     0.199     
negative    0.165     0.636     0.199     
neutral     0.333     0.125     0.542     

