In [3]:
from pathlib import Path
import re
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.corpus import RegexpTokenizer
nltk.download('punkt')
from nltk import sent_tokenize
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

import time

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import warnings
warnings.filterwarnings("ignore")

# Read all files

In [9]:
pathlist = Path('../../data/unlabeled').glob('**/*.txt')

stcs = []

for path in pathlist:
     # because path is object not string
    path_in_str = str(path)
    print(path_in_str)
    f = open(path_in_str, "r") 
    text = f.read() 
    f.close()
    stcs = stcs + sent_tokenize(text)

../data/unlabeled/relationships/r.txt
18M [Moore, Oklahoma]I just turned 18, I was a child in a guardianship and now they want to kick me out, where do I start?
../data/unlabeled/almosthomeless/almosthomeless.txt
18M [Moore, Oklahoma]I just turned 18, I was a child in a guardianship and now they want to kick me out, where do I start?


In [10]:
df = pd.DataFrame(stcs,columns=['text'])

## Preprocessing

In [4]:
path = '../../data/preprocessed/'
df = pd.read_pickle(path+'unlabeled_stcs.pkl')

In [5]:
tokenizer = RegexpTokenizer(r'[a-zA-Z]{2,}') # remove number and words that length = 1
df['processed_text'] = df['text'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [6]:
df.to_pickle(path+"processed_unlabled.pkl")

## Pretrain Word2Vec

In [9]:
import socket as s

path = '../../data/preprocessed/'
df = pd.read_pickle(path+'processed_unlabled.pkl')
train = pd.read_pickle(path + "train.pkl")
test = pd.read_pickle(path + "test.pkl")

train_token = train['processed_text']
test_token = test['processed_text']
unlabeled_token = df['processed_text']

X = train_token.append(test_token)
X = X.reset_index(drop=True)

corpus = X.append(unlabeled_token)
corpus = corpus.reset_index(drop=True)

In [68]:
word2vec = Word2Vec(corpus,
                size=300,
                window=10,
                min_count=2,
                workers=10,
                iter=100)

In [69]:
m_path = '../../models/'
word2vec.save(m_path+'Word2Vec2.bin')

## Pretrain TF-IDF

In [6]:
class TfidfEmbeddingVectorizer(object):

    def __init__(self, word_model):

        self.word_model = word_model
        self.word_idf_weight = None
        self.vector_size = word_model.wv.vector_size

    def fit(self, docs):  # comply with scikit-learn transformer requirement
        """
        Fit in a list of docs, which had been preprocessed and tokenized,
        such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
        Then build up a tfidf model to compute each word's idf as its weight.
        Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
        :param: pre_processed_docs: list of docs, which are tokenized
        :return: self
        """

        text_docs = []
        for doc in docs:
            text_docs.append(" ".join(doc))

        tfidf = TfidfVectorizer() # default 1-gram 
        tfidf.fit(text_docs)  # must be list of text string

        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)  # used as default value for defaultdict
        self.word_idf_weight = defaultdict(lambda: max_idf,
                                           [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
        return tfidf


    def transform(self, docs):  # comply with scikit-learn transformer requirement
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector


    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            mean: float of averaging word vectors
        """

        mean = []
        for word in sent:
            if word in self.word_model.wv.vocab:
                mean.append(self.word_model.wv.get_vector(word) * self.word_idf_weight[word])  # idf weighted

        if not mean:  # empty words
            # If a text is empty, return a vector of zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis=0)
            return mean


    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: list of sentence in list of separated tokens
        :return:
            array of average word vector in shape (len(docs),)
        """
        return np.vstack([self.word_average(sent) for sent in docs])

In [71]:
# word2vec = Word2Vec.load(path+'Word2Vec.bin')
tfidf_vec_tr = TfidfEmbeddingVectorizer(word2vec)
tf_transformer = tfidf_vec_tr.fit(corpus) 

In [None]:
pickle.dump(tf_transformer, open(m_path+"unlabeled_tfidf_transformer.pkl", "wb"))

## Training Unigram TF-IDF

In [10]:
m_path = '../../models/'
tf_transformer = pickle.load(open(m_path+"unlabeled_tfidf_transformer.pkl", 'rb'))

In [14]:
X = X.apply(lambda x: ' '.join(x))

In [16]:
# vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3)) #unigram, bigram, trigram
# X_new = vectorizer.fit_transform(X)
X_new = tf_transformer.transform(X)

In [31]:
X_train = X_new.toarray()[:len(train)]
X_test = X_new.toarray()[len(train):]
y_train = train['label']
y_test = test['label']


In [32]:
clf_dict = {'log reg': LogisticRegression(random_state=42), 
            'naive bayes': GaussianNB(), 
            'linear svc': LinearSVC(random_state=42),
            'sgd classifier': SGDClassifier(random_state=42),
            'ada boost': AdaBoostClassifier(n_estimators=100, random_state=42),
            'gradient boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
            'CART': DecisionTreeClassifier(random_state=42),
            'random forest': RandomForestClassifier(n_estimators=100, random_state=42)}

In [33]:
for name, clf in clf_dict.items():
    model = clf.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy of {}:'.format(name), accuracy_score(y_pred, y_test))

Accuracy of log reg: 0.7440559440559441
Accuracy of naive bayes: 0.5384615384615384
Accuracy of linear svc: 0.7328671328671329
Accuracy of sgd classifier: 0.7328671328671329
Accuracy of ada boost: 0.6867132867132867
Accuracy of gradient boosting: 0.6853146853146853
Accuracy of CART: 0.5720279720279721
Accuracy of random forest: 0.7020979020979021


In [34]:
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)
params = {"objective":'multi:softprob','colsample_bytree': 0.3, 'learning_rate': 0.05,
                'max_depth': 5, 'alpha': 10 , 'num_class': 2, 'random_state': 42}
steps = 500  # The number of training iterations
model = xgb.train(params, D_train, steps)
preds = model.predict(D_test)
y_pred = np.asarray([np.argmax(line) for line in preds])

name = 'XGBoost'
print('Accuracy of {}:'.format(name), accuracy_score(y_pred, y_test))
print('Precision', precision_score(y_test, y_pred))
print('Recall', recall_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of XGBoost: 0.6839160839160839
Precision 0.6838046272493573
Recall 0.7208672086720868
F1-Score 0.7018469656992085


## Training Word2Vec + TF-IDF

In [74]:
tfidf_doc_vec = tfidf_vec_tr.transform(X)

scaler = StandardScaler()
tfidf_doc_vec_scaled = scaler.fit_transform(tfidf_doc_vec)

In [75]:
X_train = tfidf_doc_vec[:len(train)]
y_train = train['label']

X_test = tfidf_doc_vec[len(train):]
y_test = test['label']

In [76]:
clf_dict = {'log reg': LogisticRegression(random_state=42), 
            'naive bayes': GaussianNB(), 
            'linear svc': LinearSVC(random_state=42),
            'sgd classifier': SGDClassifier(random_state=42),
            'ada boost': AdaBoostClassifier(n_estimators=100, random_state=42),
            'gradient boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
            'CART': DecisionTreeClassifier(random_state=42),
            'random forest': RandomForestClassifier(n_estimators=100, random_state=42)}

In [77]:
for name, clf in clf_dict.items():
    model = clf.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy of {}:'.format(name), accuracy_score(y_pred, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy of log reg: 0.7272727272727273
Accuracy of naive bayes: 0.6615384615384615




Accuracy of linear svc: 0.6993006993006993
Accuracy of sgd classifier: 0.6783216783216783
Accuracy of ada boost: 0.6909090909090909
Accuracy of gradient boosting: 0.7146853146853147
Accuracy of CART: 0.5902097902097903
Accuracy of random forest: 0.7286713286713287


### Random Forest

In [103]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
model = clf.fit(X_train, y_train)
y_pred = model.predict(X_test)
name = 'logistic reg'
print('Accuracy of {}:'.format(name), accuracy_score(y_pred, y_test))
print('Precision', precision_score(y_test, y_pred))
print('Recall', recall_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of logistic reg: 0.7286713286713287
Precision 0.7139364303178484
Recall 0.7913279132791328
F1-Score 0.750642673521851


### XGBoost

In [101]:
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)
params = {"objective":'multi:softprob','colsample_bytree': 0.3, 'learning_rate': 0.05,
                'max_depth': 5, 'alpha': 10 , 'num_class': 2, 'random_state': 42}
steps = 500  # The number of training iterations
model = xgb.train(params, D_train, steps)
preds = model.predict(D_test)
y_pred = np.asarray([np.argmax(line) for line in preds])

In [95]:
accs = []
precisions = []
recalls = []
f1s = []

In [97]:
steps = [50, 100, 200, 300, 400, 500]
for step in steps:
    model = xgb.train(params, D_train, step)
    preds = model.predict(D_test)
    y_pred = np.asarray([np.argmax(line) for line in preds])
    accs.append(accuracy_score(y_pred, y_test))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

In [102]:
name = 'XGBoost'
print('Accuracy of {}:'.format(name), accuracy_score(y_pred, y_test))
print('Precision', precision_score(y_test, y_pred))
print('Recall', recall_score(y_test, y_pred))
print('F1-Score', f1_score(y_test, y_pred))

Accuracy of XGBoost: 0.7314685314685314
Precision 0.7153284671532847
Recall 0.7967479674796748
F1-Score 0.7538461538461539
