In [1]:
from abc import ABC
import glob
import csv

from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, make_scorer

class Data(ABC):
    """Holds a train/test split of labelled comments and runs grid searches on it.

    Attributes:
        x_train, y_train: training samples and their boolean labels.
        x_test, y_test: testing samples and their boolean labels.
        split_important: when true, the grid search validates on exactly this
            train/test partition (via ``VariationGenerator``) instead of the
            default cross-validation of ``GridSearchCV``.
    """

    def __init__(self, x_train, y_train, x_test, y_test, split_important=True):
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.split_important = split_important

    def training_size(self):
        """Return the number of training samples."""
        return len(self.x_train)

    def testing_size(self):
        """Return the number of testing samples."""
        return len(self.x_test)

    def find_optimum(self, pipeline, parameters):
        """Grid-search ``parameters`` for ``pipeline`` and print the three best results.

        The search is fitted on the training data followed by the testing data;
        candidates are ranked by mean test F1 (positive label ``True``).

        Args:
            pipeline: estimator handed to ``GridSearchCV``.
            parameters: parameter grid handed to ``GridSearchCV``.
        """
        scoring = {
            'f1': make_scorer(f1_score, pos_label=True),
            'precision': 'precision_macro',
            'recall': 'recall_macro'
        }
        # Either keep the predefined partition or let GridSearchCV pick its default CV.
        if self.split_important:
            cross_validation = VariationGenerator(self, 3)
        else:
            cross_validation = None

        grid = GridSearchCV(
            pipeline,
            param_grid=parameters,
            scoring=scoring,
            refit='f1',
            cv=cross_validation,
            n_jobs=1,
            verbose=1
        )

        header = "Training: {} (Training size: {}, Testing size: {})"
        print(header.format(type(self).__name__, self.training_size(), self.testing_size()))

        # Training samples come first so that VariationGenerator's index ranges line up.
        samples = list(self.x_train + self.x_test)
        labels = list(self.y_train + self.y_test)
        grid.fit(samples, labels)

        results = grid.cv_results_
        ranking = sorted(
            zip(
                results['mean_test_f1'],
                results['std_test_f1'],
                results['mean_test_precision'],
                results['mean_test_recall'],
                results['params'],
            ),
            key=lambda entry: entry[0],
            reverse=True
        )
        line = "{0:1.3f} (+/-{1:1.3f}; Precision: {2:1.3f}, Recall: {3:1.3f}) for {4}"
        for mean, std, prec, recall, params in ranking[:3]:
            print(line.format(mean, std * 2, prec, recall, params))

    @staticmethod
    def _load_file(file):
        """Yield ``[content, is_positive]`` for every row of the CSV file ``file``.

        The file is read as ASCII with undecodable bytes dropped. ``content`` is
        the stripped ``CONTENT`` column; ``is_positive`` is true when the
        ``CLASS`` column equals ``'1'``.
        """
        with open(file, encoding='ascii', errors='ignore') as handle:
            reader = csv.DictReader(
                handle,
                delimiter=',',
                quotechar='"',
                skipinitialspace=True,
                strict=True
            )
            for record in reader:
                yield [record['CONTENT'].strip(), record['CLASS'] == '1']
                    
class VariationGenerator:
    """Cross-validation splitter that always keeps a fixed train/test partition.

    Every split uses the indices ``0 .. training_size - 1`` for training and the
    following ``testing_size`` indices for testing; only the order inside each
    index list is shuffled between rounds.
    """

    def __init__(self, data, n):
        """Remember the partition sizes of ``data`` and the number of rounds ``n``."""
        self.n = n
        self.training_size = data.training_size()
        self.testing_size = data.testing_size()

    def split(self, *_):
        """Yield ``n`` pairs of shuffled ``(train_indices, test_indices)``.

        Positional arguments are accepted and ignored.
        """
        for _round in range(self.n):
            boundary = self.training_size
            train_indices = shuffle(list(range(boundary)))
            test_indices = shuffle(list(range(boundary, boundary + self.testing_size)))
            yield train_indices, test_indices

    def get_n_splits(self, *_):
        """Return the number of rounds produced by ``split``; arguments are ignored."""
        return self.n
        
class SingleFile(Data):
    """Dataset built from one CSV file that is randomly split into train and test."""

    def __init__(self, path, test_ratio=0.3):
        """Load ``path`` and hold out a ``test_ratio`` share of its rows for testing."""
        comments = list(Data._load_file(path))
        texts = [text for text, _ in comments]
        labels = [label for _, label in comments]
        x_train, x_test, y_train, y_test = train_test_split(
            texts,
            labels,
            test_size=test_ratio
        )
        # The split is random anyway, so the grid search may use its default CV.
        super().__init__(x_train, y_train, x_test, y_test, False)
        
    
class SplittedFile(Data):
    """Dataset that trains on one CSV file and tests on another."""

    def __init__(self, training, testing, test_ratio=0.3):
        """Load both files; keep the whole training file and a prefix of the testing file.

        The testing file is truncated to ``int(len(training rows) * test_ratio)`` rows.
        """
        train_rows = list(Data._load_file(training))
        test_rows = list(Data._load_file(testing))

        # The test size is derived from the size of the *training* file.
        test_limit = int(len(train_rows) * test_ratio)
        held_out = test_rows[:test_limit]

        super().__init__(
            [text for text, _ in train_rows],
            [label for _, label in train_rows],
            [text for text, _ in held_out],
            [label for _, label in held_out],
            True
        )

class MixedFiles(Data):
    """Dataset that pools all CSV files matching a glob pattern and splits them randomly."""

    def __init__(self, paths, test_ratio=0.3):
        """Load every file matching ``paths`` and hold out a ``test_ratio`` share for testing."""
        comments = []
        for file_path in glob.iglob(paths):
            comments.extend(Data._load_file(file_path))

        texts = [text for text, _ in comments]
        labels = [label for _, label in comments]
        x_train, x_test, y_train, y_test = train_test_split(
            texts,
            labels,
            test_size=test_ratio
        )
        # The split is random anyway, so the grid search may use its default CV.
        super().__init__(x_train, y_train, x_test, y_test, False)
        
class SplittedMixedFiles(Data):
    """Dataset that tests on one randomly chosen file and trains on all the others."""

    def __init__(self, paths, test_ratio=0.3):
        """Load every file matching the glob pattern ``paths``.

        One file, picked at random, becomes the testing set; the remaining
        files are concatenated into the training set.

        Args:
            paths: glob pattern selecting the CSV files.
            test_ratio: accepted for signature parity with the sibling dataset
                classes; it is not used here.

        Raises:
            IndexError: if no file matches ``paths``.
        """
        files = glob.glob(paths)
        if not files:
            raise IndexError("No files match pattern: {}".format(paths))

        # sklearn.utils.shuffle returns a shuffled copy and leaves its argument
        # untouched, so the result has to be kept; discarding it would always
        # select the first glob hit as the testing file.
        files = shuffle(files)

        testing = list(Data._load_file(files[0]))
        training = []
        for file in files[1:]:
            training.extend(Data._load_file(file))

        super().__init__(
            [comment[0] for comment in training],
            [comment[1] for comment in training],
            [comment[0] for comment in testing],
            [comment[1] for comment in testing],
            True
        )

In [2]:
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator, TransformerMixin

from collections import OrderedDict
import re
import inspect

from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import wordnet, stopwords
from nltk.stem.porter import PorterStemmer

class Preprocessor(ABC):
    """Base class for steps that rewrite a tokenized comment."""

    @abstractmethod
    def optimize(self, tokenized_comment):
        """Return the processed version of ``tokenized_comment`` (a list of tokens)."""
        raise NotImplementedError()

    @staticmethod
    def preprocess(comments, preprocessors):
        """Lazily tokenize ``comments`` and run them through ``preprocessors``.

        Each comment has ``<...>`` markup removed, is tokenized with
        ``TweetTokenizer`` and is then passed through every preprocessor in the
        given order. Yields one token list per comment.
        """
        tokenize = TweetTokenizer().tokenize
        strip_markup = re.compile('<.+?>').sub
        for raw_comment in comments:
            tokens = tokenize(strip_markup('', raw_comment))
            for step in preprocessors:
                tokens = step.optimize(tokens)
            yield tokens

class StandardizePreprocessor(Preprocessor):
    """Replaces URLs, numbers and emoticon-like tokens with placeholder words."""

    def __init__(self):
        # Pattern noted by the original author (not used): \/?watch \? v = \S+
        self.regex_url = re.compile(r'[^\s|^\.]+\.[a-z]{2,3}[^\s]*')
        self.regex_number = re.compile(r'\b[0-9]+\b')
        self.regex_emoji = re.compile(r'[\S]{0,3}:[\S]{1,3}')
        self.regex_special = re.compile(r'&[a-z]+;')

    def optimize(self, tokenized_comment):
        """Return the tokens with entities removed and URLs/numbers/emoticons replaced.

        The substitutions are applied per token in a fixed order: ``&name;``
        entities are dropped, then URLs become ``URL``, numbers become ``NUM``
        and emoticon-like sequences become ``EMOJII``.
        """
        standardized = []
        for token in tokenized_comment:
            token = self.regex_special.sub('', token)
            token = self.regex_url.sub('URL', token)
            token = self.regex_number.sub('NUM', token)
            token = self.regex_emoji.sub('EMOJII', token)
            standardized.append(token)
        return standardized

class SlangPreprocessor(Preprocessor):
    """Normalizes stretched words and slang, then removes repeated tokens."""

    def __init__(self, normalisation_dictionary):
        """Load the tab-separated ``slang<TAB>replacement`` file ``normalisation_dictionary``."""
        self.double_character = re.compile(r'(.)\1{2,}')

        self.dictionary = {}
        with open(normalisation_dictionary, encoding='ascii', errors='ignore') as source:
            for entry in source:
                slang, replacement = entry.strip().split("\t")
                self.dictionary[slang] = replacement

    def optimize(self, tokenized_comment):
        """Return the normalized tokens, each distinct token kept once in first-seen order.

        Runs of three or more identical characters are shortened to two, and a
        token whose lowercase form is a dictionary key is replaced by its entry.
        """
        unique_tokens = OrderedDict()
        for token in tokenized_comment:
            token = self.double_character.sub(r'\1\1', token)
            token = self.dictionary.get(token.lower(), token)
            # Re-inserting an existing key keeps its original position.
            unique_tokens[token] = None
        return list(unique_tokens)
    
class PosLemmatizationPreprocessor(Preprocessor):
    """Lemmatizes tokens with WordNet, guided by their part-of-speech tag."""

    def __init__(self):
        self.regex_non_word = re.compile(r"[^a-zA-Z\.!?']")
        self.lemmatizer = WordNetLemmatizer()

    @staticmethod
    def _tag_to_wordnet(tag):
        """Map a POS tag to the matching ``wordnet`` constant, or ``None`` if there is none.

        The decision is made on the tag's first letter: J -> ADJ, V -> VERB,
        N -> NOUN, R -> ADV.
        """
        prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
        for prefix, constant in prefix_to_constant:
            if tag.startswith(prefix):
                return getattr(wordnet, constant)
        return None

    def optimize(self, tokenized_comment):
        """Return the lemmatized tokens.

        Characters other than letters, ``.``, ``!``, ``?`` and ``'`` are removed
        first and tokens that end up empty are dropped. Tokens whose tag has no
        WordNet counterpart are kept unchanged.
        """
        stripped = (self.regex_non_word.sub('', token).strip() for token in tokenized_comment)
        words = [token for token in stripped if len(token) > 0]

        lemmas = []
        for word, tag in pos_tag(words):
            pos_type = self._tag_to_wordnet(tag)
            if pos_type is None:
                lemmas.append(word)
            else:
                lemmas.append(self.lemmatizer.lemmatize(word, pos=pos_type))
        return lemmas

class StemmerPreprocessor(Preprocessor):
    """Reduces every token to its Porter stem."""

    def __init__(self):
        self.porter = PorterStemmer()

    def optimize(self, tokenized_comment):
        """Return a list with the stem of each token, in the original order."""
        stems = []
        for token in tokenized_comment:
            stems.append(self.porter.stem(token))
        return stems
    
class StopwordPreprocessor(Preprocessor):
    """Removes English stop words (NLTK list) from a comment."""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def optimize(self, tokenized_comment):
        """Return the tokens whose lowercase form is not a stop word."""
        kept = []
        for token in tokenized_comment:
            if token.lower() in self.stop_words:
                continue
            kept.append(token)
        return kept
    
class LowercasePreprocessor(Preprocessor):
    """Converts every token to lowercase."""

    def optimize(self, tokenized_comment):
        """Return a list with the lowercase form of each token."""
        lowered = []
        for token in tokenized_comment:
            lowered.append(token.lower())
        return lowered

class PunctationRemover(Preprocessor):
    """Strips everything except ASCII letters from the tokens."""

    def __init__(self):
        self.char_only = re.compile(r'[^a-zA-Z]')

    def optimize(self, tokenized_comment):
        """Return the letter-only tokens, dropping those that become empty."""
        letters_only = (self.char_only.sub('', token) for token in tokenized_comment)
        return [token for token in letters_only if len(token) > 0]
    
class PreprocessorTransformer(BaseEstimator, TransformerMixin):
    """Transformer that tokenizes comments and applies the enabled preprocessors.

    Each ``use_*`` flag switches one preprocessing step on or off, which makes
    the steps searchable as ordinary pipeline parameters. The transformer is
    stateless: nothing is learned in ``fit``.
    """

    def __init__(self,
                use_standartize=True,
                use_slang=True,
                use_stopword=True,
                use_lemmatization=True,
                use_stemmer=True,
                use_lowercase=True,
                use_punctation=True):
        # Parameters are stored with explicit assignments under their own names
        # rather than by introspecting and mutating the frame's locals, whose
        # behaviour is interpreter-specific.
        self.use_standartize = use_standartize
        self.use_slang = use_slang
        self.use_stopword = use_stopword
        self.use_lemmatization = use_lemmatization
        self.use_stemmer = use_stemmer
        self.use_lowercase = use_lowercase
        self.use_punctation = use_punctation

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so that calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return one list of tokens per comment in ``X``.

        The enabled steps run in a fixed order: standardize, slang, stop words,
        lemmatization, stemming, lowercase, punctuation removal. The
        preprocessors are created anew on every call.
        """
        preprocessors = []
        if self.use_standartize:
            preprocessors.append(StandardizePreprocessor())
        if self.use_slang:
            preprocessors.append(SlangPreprocessor('dictionaries/slang.txt'))
        if self.use_stopword:
            preprocessors.append(StopwordPreprocessor())
        if self.use_lemmatization:
            preprocessors.append(PosLemmatizationPreprocessor())
        if self.use_stemmer:
            preprocessors.append(StemmerPreprocessor())
        if self.use_lowercase:
            preprocessors.append(LowercasePreprocessor())
        if self.use_punctation:
            preprocessors.append(PunctationRemover())
        # Materialize the lazy generator so the result can be iterated repeatedly.
        return list(Preprocessor.preprocess(X, preprocessors))

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform``; fitting has no effect."""
        return self.fit(X, y).transform(X, y)

In [3]:
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

class BagOfWords(BaseEstimator, TransformerMixin):
    """Maps tokens to integer indices based on how often they occur.

    Known words receive indices starting at 2, assigned from the most to the
    least frequent word; unknown words are encoded as 1. Index 0 is never
    produced (presumably left free for sequence padding).

    Args:
        min_occurrences: minimum count a word needs to enter the vocabulary.
        max_features: only the ``max_features`` most common words are
            considered; ``None`` considers all words.
    """

    def __init__(self, min_occurrences=2, max_features=None):
        self.counter = Counter()
        self.min_occurrences = min_occurrences
        self.max_features = max_features
        self.bow = None

    def fit(self, X, y = None):
        """Build the vocabulary from the token lists in ``X`` and return ``self``."""
        # Start from a fresh counter so that fitting the same instance again
        # does not add to the counts of an earlier fit.
        self.counter = Counter()
        for tokens in X:
            self.counter.update(tokens)

        self.bow = {}
        next_index = 2
        for word, occurrences in self.counter.most_common(self.max_features):
            if occurrences < self.min_occurrences:
                # most_common() is sorted by descending count: nothing further qualifies.
                break
            self.bow[word] = next_index
            next_index += 1
        return self

    def transform(self, X, y = None):
        """Return, for each token list in ``X``, the list of vocabulary indices.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.bow is None:
            raise RuntimeError("Fitting required before transform!")

        return [[self.bow.get(word, 1) for word in tokens] for tokens in X]

    def fit_transform(self, X, y=None):
        """Fit the vocabulary on ``X`` and return the encoded ``X``."""
        return self.fit(X).transform(X)

    def size(self):
        """Return the number of words in the vocabulary."""
        return len(self.bow)

In [4]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import f1_score, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, SimpleRNN, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

import numpy as np

class RnnClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: embedding -> recurrent layer -> sigmoid output (Keras).

    Args:
        input_length: length every index sequence is padded/truncated to.
        embedding_dimension: size of the embedding vectors.
        batch_size: batch size used for training and prediction.
        epochs: number of training epochs.
        num_hidden_neurons: number of units of the recurrent layer.
        dropout: dropout rate applied before and after the recurrent layer;
            no dropout layers are added when it is 0.
        rnn_type: ``'gru'``, ``'lstm'`` or ``'simple'``.
        num_words: size of the embedding vocabulary; larger indices are clipped.
    """

    def __init__(self, input_length=32, 
                 embedding_dimension=32, 
                 batch_size=32, 
                 epochs=3, 
                 num_hidden_neurons=100,
                 dropout=0,
                 rnn_type='gru',
                 num_words=2000):
        
        self.input_length = input_length
        self.embedding_dimension = embedding_dimension
        self.batch_size = batch_size
        self.epochs = epochs
        self.num_hidden_neurons = num_hidden_neurons
        self.dropout = dropout
        self.rnn_type = rnn_type
        self.num_words = num_words
        self._rnn = None

    def _encode(self, X):
        """Pad the sequences to ``input_length`` and clip indices into the embedding range."""
        X = pad_sequences(X, self.input_length)
        return np.clip(X, 0, self.num_words - 1)
        
    def fit(self, X, y=None):
        """Build and train the network on index sequences ``X`` with labels ``y``.

        Returns:
            self
        """
        assert (y is not None), "Y is required"
        assert (self.rnn_type in ['gru', 'lstm', 'simple']), "Invalid RNN type"
        
        X = self._encode(X)
        
        self._rnn = Sequential()
        self._rnn.add(Embedding(self.num_words, self.embedding_dimension, input_length=self.input_length))
        if self.dropout > 0:
            self._rnn.add(Dropout(self.dropout))
        
        # Compare by value: `is` tests object identity and only matches string
        # literals by an interning accident.
        if self.rnn_type == 'gru':
            self._rnn.add(GRU(self.num_hidden_neurons))
        elif self.rnn_type == 'lstm':
            self._rnn.add(LSTM(self.num_hidden_neurons))
        else:
            self._rnn.add(SimpleRNN(self.num_hidden_neurons))
            
        if self.dropout > 0:
            self._rnn.add(Dropout(self.dropout))
        self._rnn.add(Dense(1))
        self._rnn.add(Activation('sigmoid'))
        self._rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self._rnn.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X, y=None):
        """Return a list of booleans: true where the predicted probability is >= 0.5.

        Raises:
            RuntimeError: if the classifier has not been fitted.
        """
        if self._rnn is None:
            raise RuntimeError("Fitting required before prediction!")

        # Apply the same padding AND clipping as in fit(); without the clipping,
        # indices >= num_words would fall outside the embedding table.
        X = self._encode(X)
        return [prob[0] >= 0.5 for prob in self._rnn.predict(X, batch_size=self.batch_size)]

    def score(self, X, y=None):
        """Return the F1 score of the predictions for ``X`` against ``y``."""
        assert (y is not None), "Y is required"
        
        prediction = self.predict(X)
        return f1_score(y, prediction)

Using TensorFlow backend.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

def dummy(x):
    """Identity function: return ``x`` unchanged.

    Given to ``CountVectorizer`` as tokenizer and preprocessor so that the
    already tokenized comments are passed through without further processing.
    """
    return x
            
# Every optional preprocessing step is switched off: only tokenization remains.
preprocessor = PreprocessorTransformer(**{
    flag: False
    for flag in (
        'use_standartize',
        'use_slang',
        'use_stopword',
        'use_lemmatization',
        'use_stemmer',
        'use_lowercase',
        'use_punctation',
    )
})

# The comments arrive tokenized, so tokenizer and preprocessor are identity functions.
vectorizer = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
    max_features=2000
)

rnn_pipeline = Pipeline(
    [
        ("pre", preprocessor),
        ("bow", BagOfWords(max_features=2000)),
        ("rnn", RnnClassifier()),
    ],
    memory='cache'
)

# Despite the name, this pipeline uses a multinomial naive Bayes classifier.
gaussian_pipeline = Pipeline(
    [
        ("pre", preprocessor),
        ("vectorizer", vectorizer),
        ("gaussian", MultinomialNB()),
    ],
    memory='cache'
)

forest_pipeline = Pipeline(
    [
        ("pre", preprocessor),
        ("vectorizer", vectorizer),
        ("forest", RandomForestClassifier()),
    ],
    memory='cache'
)

# Four ways of deriving a train/test split from the same comment files.
datasets = [
    SingleFile('data/Youtube01-Psy.csv'),
    SplittedFile('data/Youtube01-Psy.csv', 'data/Youtube02-KatyPerry.csv'),
    MixedFiles('data/*.csv'),
    SplittedMixedFiles('data/*.csv')
]

print("RNN:")
for data in datasets:
    data.find_optimum(rnn_pipeline, dict(
        rnn__epochs=[3, 10, 20],
        rnn__num_hidden_neurons=[50, 100, 200],
        rnn__dropout=[0, 0.1, 0.2],
        rnn__rnn_type=['gru', 'lstm', 'simple'],
    ))

print("\nGaussian:")
for data in datasets:
    data.find_optimum(gaussian_pipeline, dict(
        gaussian__alpha=[0.5, 0.75, 1.0, 1.25, 1.5],
        gaussian__fit_prior=[True, False],
    ))

print("\nRandom Forest:")
for data in datasets:
    data.find_optimum(forest_pipeline, dict(
        forest__n_estimators=[10, 100, 500, 800],
        forest__max_features=['sqrt', 'log2', None],
    ))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

def dummy(x):
    """Identity function: return ``x`` unchanged.

    Given to ``CountVectorizer`` as tokenizer and preprocessor so that the
    already tokenized comments are passed through without further processing.
    """
    return x

# Random forest with a fixed size; here the preprocessing flags are what gets searched.
forest_pipeline = Pipeline(
    [
        ("pre", PreprocessorTransformer()),
        ("vectorizer", CountVectorizer(tokenizer=dummy, preprocessor=dummy)),
        ("forest", RandomForestClassifier(n_estimators=100)),
    ],
    memory='cache'
)

# Four ways of deriving a train/test split from the same comment files.
datasets = [
    SingleFile('data/Youtube01-Psy.csv'),
    SplittedFile('data/Youtube01-Psy.csv', 'data/Youtube02-KatyPerry.csv'),
    MixedFiles('data/*.csv'),
    SplittedMixedFiles('data/*.csv')
]

# Try every on/off combination of the seven preprocessing steps on each dataset.
for data in datasets:
    data.find_optimum(forest_pipeline, {
        parameter: [True, False]
        for parameter in (
            "pre__use_standartize",
            "pre__use_slang",
            "pre__use_stopword",
            "pre__use_lemmatization",
            "pre__use_stemmer",
            "pre__use_lowercase",
            "pre__use_punctation",
        )
    })

Training: SingleFile (Training size: 245, Testing size: 105)
Fitting 3 folds for each of 128 candidates, totalling 384 fits
