In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
import re
from bs4 import BeautifulSoup
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from xgboost.sklearn import XGBClassifier

In [2]:
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jinghao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# def preprocessor_all_tag(text, tag):
#     ts = BeautifulSoup(text, 'html.parser').find_all(tag)
#     text = ""
#     for t in ts:
#         text += t.get_text()+ " "
#     r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#     emoticons = re.findall(r, text)
#     text = re.sub(r, '', text)
    
#     # convert to lowercase and append all emoticons behind (with space in between)
#     # replace('-','') removes nose of emoticons
#     text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
#     return text

# def preprocessor_tag(text):
#     ts = BeautifulSoup(text, 'html.parser').find_all( attrs={"class": "author_name"})
#     #print(ts)
#     text = ""
#     for t in ts:
#         text += t.get_text()+ " "
#     r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#     emoticons = re.findall(r, text)
#     text = re.sub(r, '', text)
#     print(text)
#     # convert to lowercase and append all emoticons behind (with space in between)
#     # replace('-','') removes nose of emoticons
#     text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
#     return text

In [4]:
# def preprocessor_tag_general(text):
#     ts = BeautifulSoup(text, 'html.parser').find_all('p')
#     text = ""
#     for t in ts:
#         text += t.get_text()+ " "
#     r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#     emoticons = re.findall(r, text)
#     text = re.sub(r, '', text)
    
#     # convert to lowercase and append all emoticons behind (with space in between)
#     # replace('-','') removes nose of emoticons
#     text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
#     return text

In [3]:
def preprocessor_author(text):
    ts = BeautifulSoup(text, 'html.parser').find_all( attrs={"class": "author_name"})
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    # print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def preprocessor_title(text):
    ts = BeautifulSoup(text, 'html.parser').find_all( attrs={"class": "title"})
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    # print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def preprocessor_paragraph(text):
    ts = BeautifulSoup(text, 'html.parser').find_all('p')
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
#     print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

def preprocessor_time(text):
    ts = BeautifulSoup(text, 'html.parser').find_all('time')
    #print(ts)
    text = ""
    for t in ts:
        text += t.get_text()+ " "
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
#     print(text)
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

# def preprocessor_channel(text):
#     ts = BeautifulSoup(text, 'html.parser').find_all('article')
#     # print(ts)
#     text = ""
#     for t in ts:
#         text += t.get_text()+ " "
#     r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#     emoticons = re.findall(r, text)
#     text = re.sub(r, '', text)
#     print(text)
#     # convert to lowercase and append all emoticons behind (with space in between)
#     # replace('-','') removes nose of emoticons
#     text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
#     return text

In [None]:
# def partial_pipe_fit(pipeline_obj, X, Y, classes):
#     feature = pipeline_obj.named_steps['vect'].fit_transform(X)
#     print(feature.shape)
#     print(Y.shape)
#     feature = pipeline_obj.named_steps['select'].fit_transform(feature, Y)
#     print(feature.shape)
#     print(Y.shape)
#     pipeline_obj.named_steps['clf'].fit(feature, Y)
# def partial_pipe_predict(pipeline_obj, X):
#     feature = pipeline_obj.named_steps['vect'].fit_transform(X)
#     feature = pipeline_obj.named_steps['select'].transform(feature)
#     return pipeline_obj.named_steps['clf'].predict_proba(feature)
# class stacking_classifier():
#     def __init__(self):
#         # pipelines for estimators
#         self.pipes = []
#         self.pipes.append(Pipeline([('vect', TfidfVectorizer( preprocessor=preprocessor_paragraph, tokenizer=tokenizer_stem_nostop)), 
#                   ('select', SelectKBest(chi2, k=500)),
#                   ('clf',RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8))]))
#         self.pipes.append( Pipeline([('vect', TfidfVectorizer( preprocessor=preprocessor_author, tokenizer=tokenizer_stem_nostop)), 
#                   ('select', SelectKBest(chi2, k=100)),
#                   ('clf',RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8))]))
#         self.pipes.append( Pipeline([('vect', TfidfVectorizer( preprocessor=preprocessor_time, tokenizer=tokenizer_stem_nostop)), 
#                   ('select', SelectKBest(chi2, k=7)),
#                   ('clf',RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8))]))
        
#         self.final_est = RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8)
#     def fit(self, X, Y, classes):
#         self.y_preds = []
#         for p in self.pipes:
#             partial_pipe_fit(p, X_train, y_train, classes=classes)
#             self.y_preds.append(partial_pipe_predict(p, X))
        
#         self.ys = self.y_preds[0]
#         for i in range(1, len(self.y_preds)):
#             self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        
#         self.final_est.fit(self.ys, Y)
#     def predict(self, X):
#         self.y_preds = []
#         for p in self.pipes:
#             self.y_preds.append(partial_pipe_predict(p, X))
#         self.ys = self.y_preds[0]
#         for i in range(1, len(self.y_preds)):
#             self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
#         return self.final_est.predict(self.ys)
#     def predict_proba(self, X):
#         self.y_preds = []
#         for p in self.pipes:
#             self.y_preds.append(partial_pipe_predict(p, X))
#         self.ys = self.y_preds[0]
#         for i in range(1, len(self.y_preds)):
#             self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
#         return self.final_est.predict_proba(self.ys)

In [4]:
def partial_pipe_fit(pipeline_obj, X, Y, classes):
    feature = pipeline_obj.named_steps['vect'].fit_transform(X)
    print(feature.shape)
    print(Y.shape)
    feature = pipeline_obj.named_steps['select'].fit_transform(feature, Y)
    print(feature.shape)
    print(Y.shape)
    pipeline_obj.named_steps['clf'].fit(feature, Y)
def partial_pipe_predict(pipeline_obj, X):
    feature = pipeline_obj.named_steps['vect'].fit_transform(X)
    feature = pipeline_obj.named_steps['select'].transform(feature)
    return pipeline_obj.named_steps['clf'].predict_proba(feature)
class stacking_classifier():
    def __init__(self):
        # pipelines for estimators
        self.pipes = []
        self.pipes.append(Pipeline([('vect', HashingVectorizer(n_features=2**20, preprocessor=preprocessor_paragraph, tokenizer=tokenizer_stem_nostop, alternate_sign=False)), 
                  ('select', SelectKBest(chi2, k=500)),
                  ('clf',RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8))]))
        self.pipes.append( Pipeline([('vect', HashingVectorizer(n_features=2**20, preprocessor=preprocessor_author, tokenizer=tokenizer_stem_nostop, alternate_sign=False)), 
                  ('select', SelectKBest(chi2, k=100)),
                  ('clf',RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8))]))
        self.pipes.append( Pipeline([('vect', HashingVectorizer(n_features=2**20, preprocessor=preprocessor_time, tokenizer=tokenizer_stem_nostop, alternate_sign=False)), 
                  ('select', SelectKBest(chi2, k=7)),
                  ('clf',RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8))]))
        
        self.final_est = RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2, max_depth=8)
    def fit(self, X, Y, classes):
        self.y_preds = []
        for p in self.pipes:
            partial_pipe_fit(p, X_train, y_train, classes=classes)
            self.y_preds.append(partial_pipe_predict(p, X))
        
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        
        self.final_est.fit(self.ys, Y)
    def predict(self, X):
        self.y_preds = []
        for p in self.pipes:
            self.y_preds.append(partial_pipe_predict(p, X))
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        return self.final_est.predict(self.ys)
    def predict_proba(self, X):
        self.y_preds = []
        for p in self.pipes:
            self.y_preds.append(partial_pipe_predict(p, X))
        self.ys = self.y_preds[0]
        for i in range(1, len(self.y_preds)):
            self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        return self.final_est.predict_proba(self.ys)

In [5]:
# clf = XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error')
stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=20000)
classes = np.array([-1, 1])
batch = next(stream)
X_train, y_train = batch['Page content'], batch['Popularity']
# print(X_train)
model = stacking_classifier()
train_auc, val_auc = [], []
model.fit(X_train, y_train, classes)

score = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
train_auc.append(score)
print('train: {}'.format(score))



(20000, 1048576)
(20000,)
(20000, 500)
(20000,)




(20000, 1048576)
(20000,)
(20000, 100)
(20000,)




(20000, 1048576)
(20000,)


In [64]:
# validate
def get_stream_from_x(path, size, specific_x):
    count = 0
    chunk_list = []
    for chunk in pd.read_csv(path, chunksize=1000, iterator=True):
        if(count < specific_x):
            print(count)
            count+=1
        else:
            chunk_list.append(chunk)
    df = pd.concat(chunk_list)
    return df

batch = get_stream_from_x(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=1000, specific_x=20)
X_val, y = batch['Page content'], batch['Popularity']
print(X_val)

score = roc_auc_score(y, model.predict_proba(X_val)[:,1])

val_auc.append(score)
print('validation: {}'.format(score))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20000    <html><head><div class="article-info"><span cl...
20001    <html><head><div class="article-info"><span cl...
20002    <html><head><div class="article-info"><span cl...
20003    <html><head><div class="article-info"><span cl...
20004    <html><head><div class="article-info"><span cl...
                               ...                        
27638    <html><head><div class="article-info"><span cl...
27639    <html><head><div class="article-info"><span cl...
27640    <html><head><div class="article-info"><span cl...
27641    <html><head><div class="article-info"><span cl...
27642    <html><head><div class="article-info"><span cl...
Name: Page content, Length: 7643, dtype: object




validation: 0.5487137173986938


In [65]:
import _pickle as pkl
# import optimized pickle written in C for serializing and 
# de-serializing a Python object

# dump to disk
pkl.dump(model, open('output/model.pkl', 'wb'))

In [7]:
import _pickle as pkl
model = pkl.load(open('output/model.pkl', 'rb'))

In [None]:
df_test = pd.read_csv('/home/jinghao/miniconda3/DL_comp1/test.csv')
df_test_pred = pd.DataFrame(columns=['Id', 'Popularity'])
# print(df_test)
for index, r in df_test.iterrows():
    pred = model.predict_proba([r['Page content']])[0][1]
    
    print(r['Id'], pred)
    df_test_pred = df_test_pred.append({'Id': int(r['Id']), 'Popularity':pred}, ignore_index=True)
df_test_pred['Id'] = df_test_pred['Id'].astype('int')
df_test_pred.to_csv("./sgd_res.csv", index=False)

In [None]:
# batch_size = 1000
# stream = get_stream(path='/home/jinghao/miniconda3/DL_comp1/train.csv', size=batch_size)
# classes = np.array([-1, 1])
# train_auc, val_auc = [], []
# model = stacking_classifier()
# # we use one batch for training and another for validation in each iteration
# iters = int((25000+batch_size-1)/(batch_size*2))
# for i in range(iters):
#     batch = next(stream)
#     X_train, y_train = batch['Page content'], batch['Popularity']
#     if X_train is None:
#         break
    
#     model.fit(X_train, y_train, classes)
    
#     score = roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
#     train_auc.append(score)
#     print('train: [{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

#     # validate
#     batch = next(stream)
#     X_val, y = batch['Page content'], batch['Popularity']

#     score = roc_auc_score(y, model.predict_proba(X_val)[:,1])

#     val_auc.append(score)
#     print('validation: [{}/{}] {}'.format((i+1)*(batch_size*2), 25000, score))

In [None]:
# def pipe_fit(pipeline_obj, X, Y, classes):
#     pipeline_obj.named_steps['clf'].fit(pipeline_obj.named_steps['vect'].fit_transform(X),Y, classes)
# def pipe_predict(pipeline_obj, X):
#     return pipeline_obj.named_steps['clf'].predict_proba(pipeline_obj.named_steps['vect'].transform(X))
# class stacking_xgbclassifier():
#     def __init__(self):
#         # pipelines for estimators
#         self.pipes = []
#         self.pipes.append(Pipeline([('vect', HashingVectorizer(n_features=2**20, 
#                             preprocessor=preprocessor_tag, tokenizer=tokenizer_stem_nostop)), 
#                   ('clf',XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error'))]))
#         self.pipes.append( Pipeline([('vect', HashingVectorizer(n_features=2**20, 
#                             preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)), 
#                   ('clf',XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error'))]))
        
#         self.final_est = XGBClassifier(n_estimators=100, learning_rate= 0.1, max_depth=5, objective='binary:logistic', eval_metric='error')
#     def fit(self, X, Y, classes):
#         self.y_preds = []
#         for p in self.pipes:
#             pipe_fit(p, X_train, y_train, classes=classes)
#             self.y_preds.append(pipe_predict(p, X))
        
#         self.ys = self.y_preds[0]
#         for i in range(1, len(self.y_preds)):
#             self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
        
#         self.final_est.fit(self.ys, Y, classes=classes)
#     # def predict(self, X):
#     #     self.y_preds = []
#     #     for p in self.pipes:
#     #         self.y_preds.append(partial_pipe_predict(p, X))
#     #     self.ys = self.y_preds[0]
#     #     for i in range(1, len(self.y_preds)):
#     #         self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
#     #     return self.final_est.predict(self.ys)
#     def predict_proba(self, X):
#         self.y_preds = []
#         for p in self.pipes:
#             self.y_preds.append(pipe_predict(p, X))
#         self.ys = self.y_preds[0]
#         for i in range(1, len(self.y_preds)):
#             self.ys = np.concatenate((self.ys, self.y_preds[i]), axis=1)
#         return self.final_est.predict_proba(self.ys)