In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score
from sklearn.decomposition import TruncatedSVD


In [40]:
# df_train = pd.read_csv('./datalabcup1-predicting-news-popularity/train_no_html.csv')
# df_test = pd.read_csv('./datalabcup1-predicting-news-popularity/test_no_html.csv')

df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# df_train = pd.read_csv('./train_no_html.csv')
# df_test = pd.read_csv('./test_no_html.csv')

# print(df.head(5))

In [41]:
print(len(df_train))
print(len(df_test))

27643
11847


In [42]:
import re
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

nltk.download('stopwords')
stop = stopwords.words('english')

porter = PorterStemmer()
wnl = WordNetLemmatizer()

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    #  lower case    
    text = text.lower()
    text = text.strip()
    text = re.sub(r'\w*\d\w*', '', text).strip()
#     text = text.translate(str.maketrans('', '', string.punctuation))
    return text


def tokenize_and_stem(text):
    tokens = re.split('\s+', text.strip())
    
    
    stem_tokens = [porter.stem(w) for w in tokens if w not in stop and len(w) > 1]
    
    stem_text = ' '.join(stem_tokens)
    
    return stem_text

def process_text(text):
    words = word_tokenize(text.strip())
    filtered_words = [w for w in words if not w.isdigit() and w not in stop and len(w)>1]
    
    lemmatized_word = [wnl.lemmatize(w) for w in filtered_words]
    
#     tags = pos_tag(filtered_words)
#     acceptable = ['NN', 'NNS', 'VBZ', 'JJ', 'RB' , 'NNP', 'NNPS', 'RBR']
    
#     clean_text = ''
#     for word , tag in tags:
#         if tag in acceptable:
#             clean_text = clean_text + wnl.lemmatize(word) + ' '
    
    return ' '.join(lemmatized_word)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hongyun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
df_train['text'] = df_train['Page content'].apply(preprocessor)
df_test['text'] = df_test['Page content'].apply(preprocessor)

In [None]:
df_train['text_processed'] = df_train['text'].apply(process_text)
df_test['text_processed'] = df_test['text'].apply(process_text)
df_train = df_train.drop(columns=['Page content'])
df_test = df_test.drop(columns=['Page content'])
df_train.head()
df_test.head()
df_train.to_csv('./train_no_html.csv', index=False)
df_test.to_csv('./test_no_html.csv', index=False)

In [None]:
x_train = df_train['text_processed'].values
y_train = df_train['Popularity'].values
x_test = df_test['text_processed'].values

In [None]:
# for i , text in enumerate(x_train):
#     processed_text = process_text(text)
#     x_train[i] = processed_text
    
# for i , text in enumerate(x_test):
#     processed_text = process_text(text)
#     x_test[i] = processed_text

In [None]:
print(type(x_train))
print(type(x_train[0]))
x_train = list(x_train)
x_test =  list(x_test)
x = x_train + x_test
print(len(x))

In [None]:
tfidf_vectorize = TfidfVectorizer(max_df=0.6, min_df=0.0001)
tfidf_data = tfidf_vectorize.fit(x)

In [None]:
tfidf_vectorize.get_feature_names()

In [13]:
# tfidf_vectorize.get_feature_names()
x_train_tfidf = tfidf_vectorize.transform(x_train)
x_test_tfidf = tfidf_vectorize.transform(x_test)

In [14]:
y_train[y_train==-1] = 0
print(np.unique(y_train))
print(x_train_tfidf.shape, y_train.shape)
print(x_test_tfidf.shape)

[0 1]
(27643, 65537) (27643,)
(11847, 65537)


In [16]:
x_train , x_val , y_train, y_val = train_test_split(x_train_tfidf, y_train, test_size = 0.2, random_state = 0)

In [17]:
print(x_train.shape, y_train.shape, y_train.sum()/len(y_train))
print(x_val.shape, y_val.shape, y_val.sum()/len(y_val))
print(x_test_tfidf.shape)

(22114, 65537) (22114,) 0.4922221217328389
(5529, 65537) (5529,) 0.4968348706818593
(11847, 65537)


In [28]:
#SVM classifier
SVM = SVC(kernel = 'linear')
SVMClassifier = SVM.fit(x_train, y_train)
y_val_pred = SVMClassifier.predict(x_val)
auc = roc_auc_score(y_val, y_val_pred)
print(f'AUC: {auc}')

AUC: 0.5161862741839539


In [29]:
#Multilayer Perceptron classfier
NN = MLPClassifier(solver = 'lbfgs', alpha = 0.00095, learning_rate = 'adaptive', learning_rate_init = 0.005, max_iter = 300, random_state = 0)
Perceptron = NN.fit(x_train, y_train)
y_val_pred = Perceptron.predict(x_val)
auc = roc_auc_score(y_val, y_val_pred)
print(f'AUC: {auc}')

AUC: 0.5092433363682543


In [30]:
#Naive Bayes classifier
MNB = MultinomialNB()
NBClassifier = MNB.fit(x_train, y_train)
y_val_pred = NBClassifier.predict(x_val)
auc = roc_auc_score(y_val, y_val_pred)
print(f'AUC: {auc}')

AUC: 0.5275867903211582


In [18]:
SVM = SVC(kernel = 'linear', probability = True)
MNB = MultinomialNB()
EnsembleClassifier = VotingClassifier(estimators = [('mnb', MNB), ('svc', SVM)], voting = 'soft')
EnsembleClassifier = EnsembleClassifier.fit(x_train, y_train)

y_val_pred = EnsembleClassifier.predict(x_val)
auc = roc_auc_score(y_val, y_val_pred)
print(f'AUC: {auc}')

AUC: 0.5285473179420357


In [28]:
def test_threshold(model):
    for th in np.arange(0.3, 0.8, 0.1):
        y_val_pred = (model.predict_proba(x_val)[:,1] >= th).astype(int)
        auc = roc_auc_score(y_val, y_val_pred)
        print(f'Threshold: {th} AUC: {auc}')

In [29]:
test_threshold(EnsembleClassifier)

Threshold: 0.3 AUC: 0.5003457140486831
Threshold: 0.4 AUC: 0.5120439211248556
Threshold: 0.5 AUC: 0.5285473179420357
Threshold: 0.6000000000000001 AUC: 0.504144054150178
Threshold: 0.7000000000000002 AUC: 0.5


In [37]:
def predict_and_output(x_test, model, th, file_name):
    y_pred = (model.predict_proba(x_test)[:,1] >= th).astype(int)
    y_pred[y_pred==0] = -1
    df_submission = pd.read_csv('./sample_submission.csv')
    df_submission['Popularity'] = y_pred
    df_submission.to_csv(file_name, index=False)


In [38]:
file_name = './ensemble_svm_mnb.csv'
predict_and_output(x_test_tfidf, EnsembleClassifier, 0.5, file_name)