In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import re
from bs4 import BeautifulSoup
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from xgboost.sklearn import XGBClassifier

In [2]:
def preprocessor(text):
    """Strip HTML from a document and normalize it for tokenization.

    Emoticons such as ``:)``, ``:-P`` or ``;D`` are pulled out before the
    punctuation is removed and appended to the end of the result (with the
    ``-`` nose dropped), so they survive as tokens.

    Args:
        text: Raw HTML markup of one document.

    Returns:
        The lowercased text with every run of non-word characters collapsed
        to one space, followed by a space and the space-separated emoticons
        found in the document.
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    # Raw string: '\)' and '\(' are invalid escape sequences in a plain literal.
    r = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

def tokenizer_stem(text):
    """Split ``text`` on whitespace and Porter-stem every token.

    Args:
        text: Preprocessed document text.

    Returns:
        A list with the stem of each whitespace-separated token. Empty or
        whitespace-only input yields an empty list rather than a single
        empty-string token.
    """
    porter = PorterStemmer()
    # str.split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so no empty tokens are produced.
    return [porter.stem(word) for word in text.split()]

# Download the NLTK 'stopwords' corpus, then load its English word list.
# `stop` is the list tokenizer_stem_nostop() filters tokens against.
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Tokenize ``text``, drop stop words and non-alphabetic tokens, then stem.

    A token is kept only if it is not in the module-level ``stop``
    collection and it starts with an ASCII letter.

    Args:
        text: Preprocessed document text.

    Returns:
        A list with the Porter stem of every kept token.
    """
    porter = PorterStemmer()
    # Hash-based membership test instead of scanning the stop list per token.
    stop_words = frozenset(stop)
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop_words and re.match(r'[a-zA-Z]+', w)]

def get_stream(path, size):
    """Yield a CSV file as successive DataFrame chunks.

    Args:
        path: CSV location, passed straight to ``pd.read_csv``.
        size: Number of rows per chunk (the ``chunksize`` argument).

    Yields:
        pandas.DataFrame: consecutive chunks of at most ``size`` rows.
    """
    reader = pd.read_csv(path, chunksize=size)
    try:
        yield from reader
    finally:
        # Runs on exhaustion and when the generator is closed before the end,
        # so the reader is released even if the caller stops early.
        reader.close()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jinghao/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# def preprocessor_tag(text, tag):
#     ts = BeautifulSoup(text, 'html.parser').find_all(tag)
#     text = ""
#     for t in ts:
#         text += t.get_text()+ " "
#     r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#     emoticons = re.findall(r, text)
#     text = re.sub(r, '', text)
    
#     # convert to lowercase and append all emoticons behind (with space in between)
#     # replace('-','') removes nose of emoticons
#     text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
#     return text

def preprocessor_tag(text):
    """Extract and normalize the text of ``author_name`` elements.

    Only elements whose ``class`` attribute is ``author_name`` contribute
    text; everything else in the document is ignored. Emoticons are pulled
    out before punctuation is removed and appended to the end of the result
    (with the ``-`` nose dropped).

    Args:
        text: Raw HTML markup of one document.

    Returns:
        The lowercased, punctuation-free text of the matching elements,
        followed by a space and the space-separated emoticons found in it.
    """
    ts = BeautifulSoup(text, 'html.parser').find_all(attrs={"class": "author_name"})
    # Every fragment is followed by a single space.
    text = ''.join(t.get_text() + " " for t in ts)

    # Raw string: '\)' and '\(' are invalid escape sequences in a plain literal.
    r = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

In [5]:
# def preprocessor_tag_general(text, tag):
#     ts = BeautifulSoup(text, 'html.parser').find_all(tag)
#     text = ""
#     for t in ts:
#         text += t.get_text()+ " "
#     r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#     emoticons = re.findall(r, text)
#     text = re.sub(r, '', text)
    
#     # convert to lowercase and append all emoticons behind (with space in between)
#     # replace('-','') removes nose of emoticons
#     text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
#     return text
def preprocessor_tag_general(text):
    """Extract and normalize the text of all ``<p>`` elements.

    Only paragraph elements contribute text; everything else in the
    document is ignored. Emoticons are pulled out before punctuation is
    removed and appended to the end of the result (with the ``-`` nose
    dropped).

    Args:
        text: Raw HTML markup of one document.

    Returns:
        The lowercased, punctuation-free paragraph text, followed by a
        space and the space-separated emoticons found in it.
    """
    ts = BeautifulSoup(text, 'html.parser').find_all('p')
    # Every paragraph is followed by a single space.
    text = ''.join(t.get_text() + " " for t in ts)

    # Raw string: '\)' and '\(' are invalid escape sequences in a plain literal.
    r = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

def preprocessor_tag(text):
    ts = BeautifulSoup(text, 'html.parser').find_all( attrs={"class": "author_name"})
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    # print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [7]:
# Three views of every page, one vectorizer each: the author_name elements,
# the full page text, and the <p> paragraphs only.
hashvec_author = HashingVectorizer(n_features=2**20,
                                   preprocessor=preprocessor_tag,
                                   tokenizer=tokenizer_stem_nostop)

hashvec = HashingVectorizer(n_features=2**20,
                            preprocessor=preprocessor,
                            tokenizer=tokenizer_stem_nostop)

hashvec_paragraph = HashingVectorizer(n_features=2**20,
                                      preprocessor=preprocessor_tag_general,
                                      tokenizer=tokenizer_stem_nostop)

# loss='log' gives logistic regression
# NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
# confirm against the installed version before upgrading.
clf_author = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
clf_paragraph = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
clf = SGDClassifier(loss='log', max_iter=100, tol=1e-3)


def _ensemble_vote(X_author, X_paragraph, X_original):
    """Blend the class-1 probabilities of the three classifiers.

    Args:
        X_author: Features produced by ``hashvec_author``.
        X_paragraph: Features produced by ``hashvec_paragraph``.
        X_original: Features produced by ``hashvec``.

    Returns:
        The weighted sum of the three probability vectors: 0.4 for the
        author model, 0.3 each for the paragraph and full-text models.
    """
    author = clf_author.predict_proba(X_author)[:,1]
    paragraph = clf_paragraph.predict_proba(X_paragraph)[:,1]
    original = clf.predict_proba(X_original)[:,1]
    return (0.4*author+0.3*paragraph+0.3*original)


batch_size = 1000
stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=batch_size)
classes = np.array([-1, 1])
train_auc, val_auc = [], []
# we use one batch for training and another for validation in each iteration
iters = int((25000+batch_size-1)/(batch_size*2))
for i in range(iters):
    # next() on an exhausted stream raises StopIteration rather than returning
    # None, so ask for a None sentinel and stop cleanly when the data runs out.
    batch = next(stream, None)
    if batch is None:
        break
    X_train, y_train = batch['Page content'], batch['Popularity']

    X_train_author = hashvec_author.transform(X_train)
    clf_author.partial_fit(X_train_author, y_train, classes=classes)

    X_train_paragraph = hashvec_paragraph.transform(X_train)
    clf_paragraph.partial_fit(X_train_paragraph, y_train, classes=classes)

    X_train_original = hashvec.transform(X_train)
    clf.partial_fit(X_train_original, y_train, classes=classes)

    # Training score: measured on the batch the models were just fitted on.
    vote_train = _ensemble_vote(X_train_author, X_train_paragraph, X_train_original)
    train_auc.append(roc_auc_score(y_train, vote_train))

    # validate
    batch = next(stream, None)
    if batch is None:
        break
    X_val, y = batch['Page content'], batch['Popularity']

    X_val_author = hashvec_author.transform(X_val)
    X_val_paragraph = hashvec_paragraph.transform(X_val)
    X_val_original = hashvec.transform(X_val)

    vote_val = _ensemble_vote(X_val_author, X_val_paragraph, X_val_original)

    score = roc_auc_score(y, vote_val)

    val_auc.append(score)
    print('[{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

[2000/25000] 0.5127499959968616
[4000/25000] 0.5258887806598763
[6000/25000] 0.5201500806003224
[8000/25000] 0.5232613985655739
[10000/25000] 0.5455670934570017
[12000/25000] 0.48864959138528985
[14000/25000] 0.5294155306319247
[16000/25000] 0.5215907772679816
[18000/25000] 0.5280995075034707
[20000/25000] 0.521439661256175
[22000/25000] 0.5373575179303279
[24000/25000] 0.5677330837293397


In [9]:
# The public pickle module already uses the C implementation (_pickle) when
# it is available, so there is no need to import the private module directly.
import pickle as pkl


def _dump_pickle(obj, path):
    """Serialize ``obj`` to ``path``, closing the file when done."""
    with open(path, 'wb') as f:
        pkl.dump(obj, f)


# dump to disk
_dump_pickle(hashvec_author, '/home/jinghao/miniconda3/DL_comp1/output/hashvec_author.pkl')
_dump_pickle(hashvec_paragraph, '/home/jinghao/miniconda3/DL_comp1/output/hashvec_paragraph.pkl')
_dump_pickle(hashvec, '/home/jinghao/miniconda3/DL_comp1/output/hashvec_original.pkl')

_dump_pickle(clf_author, '/home/jinghao/miniconda3/DL_comp1/output/clf_author.pkl')
_dump_pickle(clf_paragraph, '/home/jinghao/miniconda3/DL_comp1/output/clf_paragraph.pkl')
_dump_pickle(clf, '/home/jinghao/miniconda3/DL_comp1/output/clf_original.pkl')

In [10]:
def _load_pickle(path):
    """Deserialize and return the object stored at ``path``.

    Unpickling can execute arbitrary code, so only load files this
    notebook wrote itself.
    """
    with open(path, 'rb') as f:
        return pkl.load(f)


# load from disk
hashvec_author = _load_pickle('/home/jinghao/miniconda3/DL_comp1/output/hashvec_author.pkl')
clf_author = _load_pickle('/home/jinghao/miniconda3/DL_comp1/output/clf_author.pkl')

hashvec_paragraph = _load_pickle('/home/jinghao/miniconda3/DL_comp1/output/hashvec_paragraph.pkl')
clf_paragraph = _load_pickle('/home/jinghao/miniconda3/DL_comp1/output/clf_paragraph.pkl')

hashvec_original = _load_pickle('/home/jinghao/miniconda3/DL_comp1/output/hashvec_original.pkl')
clf_original = _load_pickle('/home/jinghao/miniconda3/DL_comp1/output/clf_original.pkl')

In [None]:
# Read the whole test CSV into memory as a single DataFrame.
df_test = pd.read_csv('/home/jinghao/miniconda3/DL_comp1/test.csv')


In [8]:
# hashvec = HashingVectorizer(n_features=2**20, 
#                             preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)

# # loss='log' gives logistic regression
# clf = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
# batch_size = 1000
# stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=batch_size)
# classes = np.array([-1, 1])
# train_auc, val_auc = [], []
# # we use one batch for training and another for validation in each iteration
# iters = int((25000+batch_size-1)/(batch_size*2))


# img_tag = []
# href_tag = []


# for i in range(iters):
#     paragraph_train =[]
#     paragraph_val =[]
#     batch = next(stream)
#     X_train, y_train = batch['Page content'], batch['Popularity']
#     if X_train is None:
#         break
    
#     # preprocess the X_train as only paragraph part
#     for page in X_train:
#         soup = BeautifulSoup(page)
#         # img_tag.append(len(soup.select('img')))
#         # href_tag.append(len(soup.find_all('a', href=True)))
#         paragraph_text = ''
#         for p in soup.find_all('p'):
#             paragraph_text = paragraph_text + p.get_text()
#         paragraph_train.append(paragraph_text)
    
#     X_train = hashvec.transform(paragraph_train)
#     # X_train = tfidf.fit_transform(paragraph_train)
#     # ch2 = SelectKBest(chi2)
#     # X_train = ch2.fit_transform(X_train, y_train)
#     clf.partial_fit(X_train, y_train, classes=classes)

#     score = roc_auc_score(y_train, clf.predict_proba(X_train)[:,1])
#     train_auc.append(score)
#     print('[{}/{}] train score: {}'.format((i+1)*(batch_size*2), 25000, score))
#     # validate
#     batch = next(stream)
#     X_val, y_val = batch['Page content'], batch['Popularity']
    
#     # preprocess the X_val as only paragraph part
#     for page in X_val:
#         soup = BeautifulSoup(page)
#         # img_tag.append(len(soup.select('img')))
#         # href_tag.append(len(soup.find_all('a', href=True)))
#         paragraph_text = ''
#         for p in soup.find_all('p'):
#             paragraph_text = paragraph_text + p.get_text()
#         paragraph_val.append(paragraph_text)
    
#     score = roc_auc_score(y_val, clf.predict_proba(hashvec.transform(paragraph_val))[:,1])
#     val_auc.append(score)
#     print('[{}/{}] val score: {}\n'.format((i+1)*(batch_size*2), 25000, score))
    