# Notebook for simple text classification

This notebooks contains code for simple classification of tweets into 'offensive' ('OFF') and non-offensive ('NOT'). 

You can use **simple vocabulary-counts as features** by calling:


`path_to_data = 'path/to/your/offenseval_data/` (e.g. '../../../../../Data/offenseval/offenseval2017/')

`classify_count(path_to_data)`

Or you can use **embeddings as features** by calling:

`path_to_model = path/to/embedding_model.bin' (e.g. ../../../../../Data/dsm/word2vec/GoogleNews-vectors-negative300.bin')`

`path_to_data = 'path/to/your/offenseval_data/` (e.g. '../../../../../Data/offenseval/offenseval2017/')

`model_name = 'google_news'` (give your model a name)

`classify_embeddings(path_to_data, path_to_model, model_name)`

The resulting predictions will be stored in the directory ./predictions/ and the performance (f1, recall, precision) will be printed to the screen. 

Note that you have to run all the cells in the notebook before you can call the functions as shown above. 


## Function definitions

Run these cells.

(Feel free to modify the code.)

### Load and preprocess data

In [1]:
# load data

import pandas as pd

def load_data(data_dir, setname):
    test_path = 'offenseval-trial.txt'
    train_path = 'offenseval-training-v1.tsv'
    if setname == 'test':
        filepath = data_dir+test_path
        data = pd.read_csv(filepath, 
                       delimiter = '\t', 
                       header = None,  
                       names=["tweet", "subtask_a", "subtask_b", "subtask_c"])
    elif setname == 'train':
        filepath = data_dir+train_path
        data = pd.read_csv(filepath, delimiter="\t")  
    return data

def split_train(train_data):
    # split 90%, 10%
    total = len(train_data)
    total_90 = round(total * 0.9)
    train_data_split = train_data[:total_90]
    validation_data = train_data[total_90:]
    return train_data_split, validation_data


In [2]:
# Preprocess
from nltk import TweetTokenizer
import string 
# tokenize, remove stop-words

def tokenize(data, remove_stop_words = True):
    tokenized_tweets = []
    tokenizer = TweetTokenizer()
    to_remove = list(string.punctuation)
    to_remove.extend(['@USER', 'URL'])
    for tweet in data['tweet']:
        tokenized_tweet = ' '.join(tokenizer.tokenize(tweet))
        if remove_stop_words == True:
            for char in to_remove:
                tokenized_tweet = tokenized_tweet.replace(char.strip(), '').lower()
        tokenized_tweets.append(tokenized_tweet)
    data['tweet_tok'] = tokenized_tweets
    

### Transform preprocessed tweets to vector representations

In [3]:
# trainsform tweets to vocab count vectors 
from sklearn.feature_extraction.text import CountVectorizer

def tweets_to_count_vec(tweets_train, tweets_test):
    vectorizer = CountVectorizer()
    train_X = vectorizer.fit_transform(tweets_train)
    test_X = vectorizer.transform(tweets_test)
    return train_X, test_X


In [4]:
# transform to embedding vecs (assuming gensim compatible model)
from gensim.models import KeyedVectors
import numpy as np

def tweets_to_embedding(tweets, model_path):
    model = KeyedVectors.load_word2vec_format(model_path, binary = True)
    data_X = []
    for tweet in tweets:
        #tweet was tokenized and joined by ' ' in the previous step
        tokens = tweet.split(' ')
        tweet_vecs = np.array([model[t] for t in tokens if t in model.vocab])
        if len(tweet_vecs) > 1:
            average_embedding = np.mean(tweet_vecs, axis = 0)
        else:
            n_d = len(model['the'])
            average_embedding = np.zeros(n_d)
        data_X.append(average_embedding)
    return np.array(data_X)


### Classify tweets using an SVM binary classifier

In [5]:
# classify
from sklearn import svm

def train(train_X, train_y):
    clf = svm.SVC(gamma='scale')
    clf.fit(train_X, train_y)  
    return clf

def predict(clf, test_X):
    predictions = clf.predict(test_X)
    return predictions


### Write predictions to a file 

In [6]:
# write output to file for further analysis
import os 
import csv

def predictions_to_file(tweets, gold, predictions, name):
    
    if not os.path.isdir('predictions/'):
        os.mkdir('predictions/')
    results_dict_list = []
    
    for tweet, gl, pl in zip(tweets, gold, predictions):
        results_dict = dict()
        results_dict['tweet'] = tweet
        results_dict['gold_label'] = gl
        results_dict['predicted_label'] = pl
        results_dict_list.append(results_dict)
    
    with open(f'predictions/{name}.csv', 'w') as outfile:
        writer = csv.DictWriter(outfile, fieldnames = results_dict_list[0].keys())
        writer.writeheader()
        for d in results_dict_list:
            writer.writerow(d)

### Evaluate using precision, recall and f1

In [7]:
# evaluate

from sklearn.metrics import classification_report

def evaluate(gold, predictions):
    class_report = classification_report(gold, predictions, labels = ['OFF', 'NOT'])
    print(class_report)

### Combine the functions 

In [8]:
def classify_count(path_to_data):
    
    train_data = load_data(data_dir, 'train')
    test_data = load_data(data_dir, 'test')
    train_data, val_data = split_train(train_data)
    
    tokenize(train_data, remove_stop_words = True)
    tokenize(test_data, remove_stop_words = True)
    tokenize(val_data, remove_stop_words = True)
    
    train_X, val_X = tweets_to_count_vec(train_data['tweet_tok'], val_data['tweet_tok']) 
    train_X, test_X = tweets_to_count_vec(train_data['tweet_tok'], test_data['tweet_tok'])
    
    train_y = train_data['subtask_a']
    
    clf = train(train_X, train_y)
    predictions_val = predict(clf, val_X)  
    predictions_test = predict(clf, test_X)
    
    name_val = 'count_svm_val'
    predictions_to_file(val_data['tweet'], val_data['subtask_a'], predictions_val, name_val)
    name_test = 'count_svm_test'
    predictions_to_file(test_data['tweet'], test_data['subtask_a'], predictions_test, name_test)
    
    print('--- performance on the validation set')
    evaluate(val_data['subtask_a'], predictions_val)
    print('--- performance on the test set')
    evaluate(test_data['subtask_a'], predictions_test)
    
    print(f'valdidation predictions written to: predictions/{name_val}.csv')
    print(f'test predictions written to: predictions/{name_test}.csv')


In [9]:
def classify_embeddings(path_to_data, path_to_model, model_name):
    
    train_data = load_data(data_dir, 'train')
    test_data = load_data(data_dir, 'test')
    train_data, val_data = split_train(train_data)
    
    tokenize(train_data, remove_stop_words = True)
    tokenize(test_data, remove_stop_words = True)
    tokenize(val_data, remove_stop_words = True)
    
    train_X = tweets_to_embedding(train_data['tweet_tok'], path_to_model) 
    val_X = tweets_to_embedding(val_data['tweet_tok'], path_to_model)   
    test_X = tweets_to_embedding(test_data['tweet_tok'], path_to_model) 
    
    train_y = train_data['subtask_a']
    
    clf = train(train_X, train_y)
    predictions_val = predict(clf, val_X)
    predictions_test = predict(clf, test_X)
    
    name_val = f'embeddings-{model_name}_svm_val'
    predictions_to_file(val_data['tweet'], val_data['subtask_a'], predictions_val, name_val)
    name_test = f'embeddings-{model_name}_svm_test'
    predictions_to_file(test_data['tweet'], test_data['subtask_a'], predictions_test, name_test)
    
    print('--- performance on the validation set')
    evaluate(val_data['subtask_a'], predictions_val)
    print('--- performance on the test set')
    evaluate(test_data['subtask_a'], predictions_test)
    
    print(f'valdidation predictions written to: predictions/{name_val}.csv')
    print(f'test predictions written to: predictions/{name_test}.csv')
    


## Run your experiments here

Examples are given below. Note that you have to make sure the data are stored on your computer and that you have to modify the filepaths. 

In [10]:

data_dir = '../../../../../Data/offenseval/offenseval2017/'
classify_count(data_dir)

FileNotFoundError: [Errno 2] No such file or directory: '../../../../../Data/offenseval/offenseval2017/offenseval-training-v1.tsv'

In [None]:
path_to_model = '../../../../../Data/dsm/word2vec/GoogleNews-vectors-negative300.bin' 
data_dir = '../../../../../Data/offenseval/offenseval2017/'
model_name = 'google_news'

classify_embeddings(data_dir, path_to_model, model_name)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


--- performance on the validation set
              precision    recall  f1-score   support

         OFF       0.76      0.51      0.61       440
         NOT       0.79      0.92      0.85       884

    accuracy                           0.78      1324
   macro avg       0.77      0.72      0.73      1324
weighted avg       0.78      0.78      0.77      1324

--- performance on the test set
              precision    recall  f1-score   support

         OFF       0.65      0.66      0.66        77
         NOT       0.89      0.89      0.89       243

    accuracy                           0.83       320
   macro avg       0.77      0.78      0.77       320
weighted avg       0.84      0.83      0.83       320

valdidation predictions written to: predictions/embeddings-google_news_svm_val.csv
test predictions written to: predictions/embeddings-google_news_svm_test.csv
