In [16]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import ttest_ind
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import PorterStemmer


def process(stopwordsOrNot, stemmingOrNot, balanceOrNot):
    ## read data from file
    df = pd.read_excel("SFUcorpus.xlsx")
    df['toxicity_level'] = df['toxicity_level'].apply(lambda x: str(x)[:1])
    X = df['comment_text']
    Y = df['toxicity_level']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


    ## pre-processing
    stopwords = set(nltk.corpus.stopwords.words('english'))

    ## get rid of noise and stop words
    def preprocess(x,stopwordsOrNot,stemmingOrNot):
        x = re.sub('[^a-z0-9\s]', '', x.lower())
        tmpList = x.split(' ')
        if stemmingOrNot:
            ps = PorterStemmer()
            for i in range(len(tmpList)):
                tmpList[i] = ps.stem(tmpList[i])
        if stopwordsOrNot:
            tmpList = [w for w in tmpList if w not in set(stopwords)]
        return ' '.join(tmpList)

    ## do the same preprocess to train and test set
    X_train = X_train.apply(lambda x: preprocess(x,stopwordsOrNot,stemmingOrNot))
    X_test = X_test.apply(lambda x: preprocess(x,stopwordsOrNot,stemmingOrNot))

    ## remove nan rows
    X_train.dropna(axis=0, how='any', inplace=True)
    X_test.dropna(axis=0, how='any', inplace=True)

    ## bag of words
    vectorizer1 = CountVectorizer()
    X_BOW_train = vectorizer1.fit_transform(X_train)

    ## balance the training set
    oversamples = RandomOverSampler(random_state=6)
    X_BOW_res, y_BOW_res = oversamples.fit_sample(X_BOW_train, y_train)

    ## tf_idf
    vectorizer = TfidfVectorizer()
    X_TFIDF_train = vectorizer.fit_transform(X_train)
    X_TFIDF_res, y_TFIDF_res = oversamples.fit_sample(X_TFIDF_train, y_train)

    ## Dense vector
    map = dict()
    with open('glove.6B.300d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            value = np.asarray(values[1:], dtype='float32')
            map[word] = value

    ## transform df to map
    vectors = []
    for i in X_train:
        vector = np.zeros(300)
        count = 0
        tokens = word_tokenize(i)
        for c in tokens:
            if c in map:
                count += 1
                vector += map[c]
        if count == 0:
            vectors.append(vector)
        else:
            vectors.append(vector / count)

    X_W2V_train = pd.DataFrame(vectors)

    X_W2V_res, y_W2V_res = oversamples.fit_sample(X_W2V_train, y_train)

    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')

    ## cross_validation
    ## bag of word result
    if balanceOrNot:
        X_BOW = X_BOW_res
        y_BOW = y_BOW_res
    else:
        X_BOW = X_BOW_train
        y_BOW = y_train
    scores_majority = cross_val_score(majority, X_BOW_train, y_train, cv=5)
    scores_lgm_BOW = cross_val_score(lgm, X_BOW, y_BOW, cv=5)
    print("bag of word input")
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_BOW))

    ## tf_idf result
    if balanceOrNot:
        X_TFIDF = X_TFIDF_res
        y_TFIDF = y_TFIDF_res
    else:
        X_TFIDF = X_TFIDF_train
        y_TFIDF = y_train
    scores_majority = cross_val_score(majority, X_TFIDF_train, y_train, cv=5)
    scores_lgm_TFIDF = cross_val_score(lgm, X_TFIDF, y_TFIDF, cv=5)
    print("tf_idf input")
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_TFIDF))

    ## Dense vector
    if balanceOrNot:
        X_Dense = X_W2V_res
        y_Dense = y_W2V_res
    else:
        X_Dense = X_W2V_train
        y_Dense = y_train
    scores_majority = cross_val_score(majority, X_W2V_train, y_train, cv=5)
    scores_lgm_Dense = cross_val_score(lgm, X_Dense, y_Dense, cv=5)
    print("dense vector input")
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_Dense))

    ## statistical tests
    ## BOW t-test
    stat1, p1 = ttest_ind(scores_lgm_BOW, scores_majority)
    print("BOW p value: ",p1)

    ## Dense vector t-test
    stat2, p2 = ttest_ind(scores_lgm_Dense, scores_majority)
    print("dense vector p value: ",p2)

    ## tf_idf t-test
    stat3, p3 = ttest_ind(scores_lgm_TFIDF, scores_majority)
    print("tf_idf p value: ",p3)

    ## predict the test set
    lgm.fit(X_TFIDF, y_TFIDF)
    # X_TFIDF_test = vectorizer.transform(X_test)
    X_TFIDF_test = vectorizer.transform(X_test)
    y_predict = lgm.predict(X_TFIDF_test)
    print(accuracy_score(y_test, y_predict))


if __name__ == '__main__':
    ## do the process with balance and removing stopwords
    print("process 1: do noting")
    process(stemmingOrNot=False, stopwordsOrNot=False, balanceOrNot=False)
    print()
    ## do the process with stemming and removing stopwords
    print("process 2: stemming+stopwords")
    process(stemmingOrNot=True, stopwordsOrNot=True, balanceOrNot=False)
    print()
    ## do the process with stopwords
    print("process 3: stopwords")
    process(stemmingOrNot=False, stopwordsOrNot = True, balanceOrNot = False)
    print()
    ## do the process with stemming
    print("process 4: stemming")
    process(stemmingOrNot=True, stopwordsOrNot = False, balanceOrNot = False)
    print()
    ## do the process with stemming removing stopwords and balance
    print("process 5: stemming+balance+stopwords")
    process(stemmingOrNot=True, stopwordsOrNot = True, balanceOrNot = True)
    
    


process 1: do noting
bag of word input
average accuracy of majority model:  0.790182232936724
average accuracy of logistic regression model:  0.7674488684967727
tf_idf input
average accuracy of majority model:  0.790182232936724
average accuracy of logistic regression model:  0.7949510070767556
dense vector input
average accuracy of majority model:  0.790182232936724
average accuracy of logistic regression model:  0.7289368017212328
BOW p value:  0.005044859961877725
dense vector p value:  0.0001014271690476775
tf_idf p value:  0.2518200012084848
0.8038277511961722

process 2: stemming+stopwords
bag of word input
average accuracy of majority model:  0.790182232936724
average accuracy of logistic regression model:  0.736209010550328
tf_idf input
average accuracy of majority model:  0.790182232936724
average accuracy of logistic regression model:  0.7889774995463619
dense vector input
average accuracy of majority model:  0.790182232936724
average accuracy of logistic regression model:  0