In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import re
from bs4 import BeautifulSoup
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from xgboost.sklearn import XGBClassifier

In [2]:
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jinghao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preprocessor_all_tag(text, tag):
    ts = BeautifulSoup(text, 'html.parser').find_all(tag)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def preprocessor_tag(text):
    ts = BeautifulSoup(text, 'html.parser').find_all( attrs={"class": "author_name"})
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [4]:
def preprocessor_tag_general(text):
    ts = BeautifulSoup(text, 'html.parser').find_all('p')
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def preprocessor_tag(text):
    ts = BeautifulSoup(text, 'html.parser').find_all( attrs={"class": "author_name"})
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    # print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [5]:
def partial_pipe_fit(pipeline_obj, X, Y, classes):
    pipeline_obj.named_steps['clf'].partial_fit(pipeline_obj.named_steps['vect'].transform(X),Y, classes)
def partial_pipe_predict(pipeline_obj, X):
    return pipeline_obj.named_steps['clf'].predict_proba(pipeline_obj.named_steps['vect'].transform(X))
class stacking_classifier():
    def __init__(self):
        # pipelines for estimators
        self.pipes = []
        self.pipes.append(Pipeline([('vect', HashingVectorizer(n_features=2**20, 
                            preprocessor=preprocessor_tag, tokenizer=tokenizer_stem_nostop)), 
                  ('clf',SGDClassifier(loss='log',max_iter=1000, tol=1e-5))]))
        self.pipes.append( Pipeline([('vect', HashingVectorizer(n_features=2**20, 
                            preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)), 
                  ('clf',SGDClassifier(loss='log',max_iter=1000, tol=1e-5))]))
        
        self.final_est = SGDClassifier(loss='log', max_iter=1000, tol=1e-5)
    def fit(self, X, Y, classes):
        self.y_preds = []
        for p in self.pipes:
            partial_pipe_fit(p, X_train, y_train, classes=classes)
            self.y_preds.append(partial_pipe_predict(p, X))
        
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        
        self.final_est.partial_fit(self.ys, Y, classes=classes)
    def predict(self, X):
        self.y_preds = []
        for p in self.pipes:
            self.y_preds.append(partial_pipe_predict(p, X))
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        return self.final_est.predict(self.ys)
    def predict_proba(self, X):
        self.y_preds = []
        for p in self.pipes:
            self.y_preds.append(partial_pipe_predict(p, X))
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        return self.final_est.predict_proba(self.ys)

In [6]:
batch_size = 1000
stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=batch_size)
classes = np.array([-1, 1])
train_auc, val_auc = [], []
model = stacking_classifier()
# we use one batch for training and another for validation in each iteration
iters = int((25000+batch_size-1)/(batch_size*2))
for i in range(iters):
    batch = next(stream)
    X_train, y_train = batch['Page content'], batch['Popularity']
    if X_train is None:
        break
    
    model.fit(X_train, y_train, classes)
    
    score = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
    train_auc.append(score)
    print('train: [{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

    # validate
    batch = next(stream)
    X_val, y = batch['Page content'], batch['Popularity']

    score = roc_auc_score(y, model.predict_proba(X_val)[:,1])

    val_auc.append(score)
    print('validation: [{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

train: [2000/25000] 0.9259183616260398
validation: [2000/25000] 0.4747562088677524
train: [4000/25000] 0.9479811830647582
validation: [4000/25000] 0.5421632173679506
train: [6000/25000] 0.930063720254881
validation: [6000/25000] 0.5084620338481354
train: [8000/25000] 0.91954105787201
validation: [8000/25000] 0.5292528496413934
train: [10000/25000] 0.923902075725543
validation: [10000/25000] 0.5266579292267365
train: [12000/25000] 0.8989752153489433
validation: [12000/25000] 0.5179426459352536
train: [14000/25000] 0.8984155936623746
validation: [14000/25000] 0.5349773326557805
train: [16000/25000] 0.8770765050284338
validation: [16000/25000] 0.5497917925045301
train: [18000/25000] 0.8899533795799356
validation: [18000/25000] 0.5316842101051806
train: [20000/25000] 0.8737390914656349
validation: [20000/25000] 0.5063978223611251
train: [22000/25000] 0.8391894270308324
validation: [22000/25000] 0.5616915343237705
train: [24000/25000] 0.8667714670858684
validation: [24000/25000] 0.563469015

In [7]:
def pipe_fit(pipeline_obj, X, Y, classes):
    pipeline_obj.named_steps['clf'].fit(pipeline_obj.named_steps['vect'].transform(X),Y, classes)
def pipe_predict(pipeline_obj, X):
    return pipeline_obj.named_steps['clf'].predict_proba(pipeline_obj.named_steps['vect'].transform(X))
class stacking_xgbclassifier():
    def __init__(self):
        # pipelines for estimators
        self.pipes = []
        self.pipes.append(Pipeline([('vect', HashingVectorizer(n_features=2**20, 
                            preprocessor=preprocessor_tag, tokenizer=tokenizer_stem_nostop)), 
                  ('clf',XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error'))]))
        self.pipes.append( Pipeline([('vect', HashingVectorizer(n_features=2**20, 
                            preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)), 
                  ('clf',XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error'))]))
        
        self.final_est = XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error')
    def fit(self, X, Y, classes):
        self.y_preds = []
        for p in self.pipes:
            pipe_fit(p, X_train, y_train, classes=classes)
            self.y_preds.append(pipe_predict(p, X))
        
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        
        self.final_est.fit(self.ys, Y, classes=classes)
    # def predict(self, X):
    #     self.y_preds = []
    #     for p in self.pipes:
    #         self.y_preds.append(partial_pipe_predict(p, X))
    #     self.ys = self.y_preds[0]
    #     for i in range(1, len(self.y_preds)):
    #         self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
    #     return self.final_est.predict(self.ys)
    def predict_proba(self, X):
        self.y_preds = []
        for p in self.pipes:
            self.y_preds.append(pipe_predict(p, X))
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        return self.final_est.predict_proba(self.ys)

In [9]:
# clf = XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error')
stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=9000)
classes = np.array([-1, 1])
batch = next(stream)
X_train, y_train = batch['Page content'], batch['Popularity']
print(X_train)


model.fit(X_train, y_train, classes)

score = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
train_auc.append(score)
print('train: [{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

0       <html><head><div class="article-info"> <span c...
1       <html><head><div class="article-info"><span cl...
2       <html><head><div class="article-info"><span cl...
3       <html><head><div class="article-info"><span cl...
4       <html><head><div class="article-info"><span cl...
                              ...                        
8995    <html><head><div class="article-info"><span cl...
8996    <html><head><div class="article-info"><span cl...
8997    <html><head><div class="article-info"> <span c...
8998    <html><head><div class="article-info"><span cl...
8999    <html><head><div class="article-info"><span cl...
Name: Page content, Length: 9000, dtype: object
train: [24000/25000] 0.7869366588467712


In [19]:
# validate
def get_stream_from_x(path, size, specific_x):
    count = 0
    chunk_list = []
    for chunk in pd.read_csv(path, chunksize=1000, iterator=True):
        if(count < specific_x):
            print(count)
            count+=1
        else:
            chunk_list.append(chunk)
    df = pd.concat(chunk_list)
    return df

batch = get_stream_from_x(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=1000, specific_x=9)
X_val, y = batch['Page content'], batch['Popularity']
print(X_val)

score = roc_auc_score(y, model.predict_proba(X_val)[:,1])

val_auc.append(score)
print('validation: [{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

0
1
2
3
4
5
6
7
8
9000     <html><head><div class="article-info"><span cl...
9001     <html><head><div class="article-info"><span cl...
9002     <html><head><div class="article-info"><span cl...
9003     <html><head><div class="article-info"><span cl...
9004     <html><head><div class="article-info"><span cl...
                               ...                        
27638    <html><head><div class="article-info"><span cl...
27639    <html><head><div class="article-info"><span cl...
27640    <html><head><div class="article-info"><span cl...
27641    <html><head><div class="article-info"><span cl...
27642    <html><head><div class="article-info"><span cl...
Name: Page content, Length: 18643, dtype: object
validation: [24000/25000] 0.5610519446430982


In [21]:
df_test = pd.read_csv('/home/jinghao/miniconda3/DL_comp1/test.csv')
df_test_pred = pd.DataFrame(columns=['Id', 'Popularity'])
# print(df_test)
for index, r in df_test.iterrows():
    pred = model.predict_proba([r['Page content']])[0][1]
    
    print(r['Id'], pred)
    df_test_pred = df_test_pred.append({'Id': int(r['Id']), 'Popularity':pred}, ignore_index=True)
df_test_pred['Id'] = df_test_pred['Id'].astype('int')
df_test_pred.to_csv("./xgb_res.csv", index=False)

27643 0.6703963017357752
27644 0.7920240429670101
27645 0.11347362080329293
27646 0.36380991496659815
27647 0.18174260239052745
27648 0.2025796318763651
27649 0.0895667363489241
27650 0.43639850173487826
27651 0.7553625232373296
27652 0.5497140533718428
27653 0.5193512175798037
27654 0.5298654590077693
27655 0.6621349165684045
27656 0.3523313898992332
27657 0.4191782754243989
27658 0.17144529338889974
27659 0.8386743052283147
27660 0.18264345493924614
27661 0.2725824688865983
27662 0.18325574971871028
27663 0.7825292430012928
27664 0.7074458638851439
27665 0.396745083964826
27666 0.686086201787187
27667 0.8509561389649264
27668 0.14405820722499535
27669 0.6685305202356294
27670 0.9654829233554513
27671 0.06267749606935276
27672 0.129725113770559
27673 0.29659013255580535
27674 0.6864828084251545
27675 0.737346245298781
27676 0.9337802632362899
27677 0.1340025701805405
27678 0.4257506574783633
27679 0.8739088130173547
27680 0.5097262378846693
27681 0.7435119482637861
27682 0.67919667514