In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import re
from bs4 import BeautifulSoup
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

In [4]:
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jinghao/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
hashvec = HashingVectorizer(n_features=2**20, 
                            preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)

# loss='log' gives logistic regression
clf = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
batch_size = 1000
stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=batch_size)
classes = np.array([-1, 1])
train_auc, val_auc = [], []
# we use one batch for training and another for validation in each iteration
iters = int((25000+batch_size-1)/(batch_size*2))


img_tag = []
href_tag = []


for i in range(iters):
    paragraph_train =[]
    paragraph_val =[]
    batch = next(stream)
    X_train, y_train = batch['Page content'], batch['Popularity']
    if X_train is None:
        break
    
    # preprocess the X_train as only paragraph part
    for page in X_train:
        soup = BeautifulSoup(page)
        # img_tag.append(len(soup.select('img')))
        # href_tag.append(len(soup.find_all('a', href=True)))
        paragraph_text = ''
        for p in soup.find_all('p'):
            paragraph_text = paragraph_text + p.get_text()
        paragraph_train.append(paragraph_text)
    
    X_train = hashvec.transform(paragraph_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    score = roc_auc_score(y_train, clf.predict_proba(X_train)[:,1])
    train_auc.append(score)
    print('[{}/{}] train score: {}'.format((i+1)*(batch_size*2), 25000, score))
    # validate
    batch = next(stream)
    X_val, y_val = batch['Page content'], batch['Popularity']
    
    # preprocess the X_val as only paragraph part
    for page in X_val:
        soup = BeautifulSoup(page)
        # img_tag.append(len(soup.select('img')))
        # href_tag.append(len(soup.find_all('a', href=True)))
        paragraph_text = ''
        for p in soup.find_all('p'):
            paragraph_text = paragraph_text + p.get_text()
        paragraph_val.append(paragraph_text)
    
    score = roc_auc_score(y_val, clf.predict_proba(hashvec.transform(paragraph_val))[:,1])
    val_auc.append(score)
    print('[{}/{}] val score: {}'.format((i+1)*(batch_size*2), 25000, score))

[2000/25000] train score: 0.9422412308628775
[2000/25000] val score: 0.4964292004931867
[4000/25000] train score: 0.959603643278951
[4000/25000] val score: 0.5367517667384185
[6000/25000] train score: 0.9427157708630834
[6000/25000] val score: 0.5075780303121212
[8000/25000] train score: 0.9349405112074244
[8000/25000] val score: 0.520936059170082
[10000/25000] train score: 0.9131110256903068
[10000/25000] val score: 0.5100352858151023
[12000/25000] train score: 0.9127711634649655
[12000/25000] val score: 0.4941377889604026
[14000/25000] train score: 0.9062116248464994
[14000/25000] val score: 0.5282431507808529
[16000/25000] train score: 0.8763761660643752
[16000/25000] val score: 0.5007580272889824
[18000/25000] train score: 0.8887718483154167
[18000/25000] val score: 0.5239026849262455
[20000/25000] train score: 0.8672649938580099
[20000/25000] val score: 0.5025425950196593
[22000/25000] train score: 0.8397494359909761
[22000/25000] val score: 0.5383981173155739
[24000/25000] train 