In [1]:
%matplotlib inline
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
import gensim


def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
    


In [2]:
df_pos = pd.read_table('clickbait_data', header=None, names=['X'])
df_pos['Y'] = True
df_neg = pd.read_table('non_clickbait_data', header=None, names=['X'])
df_neg['Y'] = False

df = pd.concat([df_pos, df_neg], ignore_index = True)
df = df.reindex(np.random.permutation(df.index))

In [105]:
df = df.loc[0:100]

In [3]:
from sklearn.model_selection import KFold


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


from sklearn.feature_extraction.text import TfidfVectorizer



def gen_scores(y_true, y_pred, y_pred_proba):
    roc = roc_auc_score(y_true, y_pred_proba)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    f1 = f1_score(y_true, y_pred)
    
    return { 'roc': roc, 'precision': precision, 'recall': recall, 'f1': f1 }
   
def process_features(sentence):
#     return re.sub(r'\d+', '0', sentence)
    return sentence
#     ret = ""
    
#     for w in sentence:
#         if w.isdigit() and int(w) < 110:
#             ret += '0'
#         else:
#             ret += w
            
#     return ret
    
size = 30
kf = KFold(n_splits=5, shuffle=True)

for train, test in kf.split(df):
    X_train, X_test, y_train, y_test = df['X'][train], df['X'][test], df['Y'][train], df['Y'][test]
    
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                     token_pattern=r'\w{1,}', ngram_range=(1, 3),
                     use_idf=1, smooth_idf=1,
                     sublinear_tf=1, stop_words='english')
    
    # gen features
    X_train = [r for r in X_train.apply(lambda sentence: process_features(sentence))]
    X_train = tfv.fit_transform(X_train)
    y_train = list(y_train)
    classifier = LogisticRegression()
#     classifier = GradientBoostingClassifier()
    classifier.fit(X_train, y_train)
    
    X_test = [r for r in X_test.apply(lambda sentence: process_features(sentence))]
    X_test = tfv.transform(X_test).toarray()
    y_test = list(y_test)
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]
    
    
    print(gen_scores(y_test, y_pred, y_pred_proba))

{'roc': 0.99167133466459179, 'precision': 0.96916158046900092, 'recall': 0.94517543859649122, 'f1': 0.95701823949246623}
{'roc': 0.99043499715905514, 'precision': 0.96282293635790805, 'recall': 0.94117647058823528, 'f1': 0.95187665472667815}
{'roc': 0.99021100542545182, 'precision': 0.96263807667316437, 'recall': 0.94272987591473112, 'f1': 0.95257997106574499}
{'roc': 0.99127239190075611, 'precision': 0.96721843557286591, 'recall': 0.93387652773425256, 'f1': 0.9502551020408162}
{'roc': 0.9906417161914205, 'precision': 0.96422918645140865, 'recall': 0.94420334779913206, 'f1': 0.95411119812059508}


In [347]:
s = 'Won\'t'

classifier.predict_proba(tfv.transform([s]))

array([[ 0.09468002,  0.90531998]])

In [359]:
from sklearn.model_selection import KFold


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


from sklearn.feature_extraction.text import TfidfVectorizer

def gen_sentence_features(w2v, sentence):
    features = np.array([0.0]*size)
    for w in sentence:
        if w in w2v:
             features += w2v[w]
    
    return features/len(sentence.split(' '))


def gen_scores(y_true, y_pred, y_pred_proba):
    roc = roc_auc_score(y_true, y_pred_proba)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    f1 = f1_score(y_true, y_pred)
    
    return { 'roc': roc, 'precision': precision, 'recall': recall, 'f1': f1 }
   
def process_features(sentence):
#     return re.sub(r'\d+', '0', sentence)
    return sentence
#     ret = ""
    
#     for w in sentence:
#         if w.isdigit() and int(w) < 110:
#             ret += '0'
#         else:
#             ret += w
            
#     return ret
    
size = 30
kf = KFold(n_splits=2, shuffle=True)

index = 0
def get_index():
    index += 1
    return index - 1

for train, test in kf.split(df):
    X_train, X_test, y_train, y_test = df['X'][train], df['X'][test], df['Y'][train], df['Y'][test]
    
    X_train = [LabeledSentence(words=r.split(' '), labels = [get_index()]) for r in X_train]
    model = gensim.models.Doc2Vec(X_train, size=size)
    w2v = dict(zip(model.wv.index2word, model.wv.syn0))
    
    # gen features
    X_train = [r for r in X_train.apply(lambda sentence: gen_sentence_features(w2v, process_features(sentence)))]
    y_train = list(y_train)
    classifier = LogisticRegression()
#     classifier = GradientBoostingClassifier()
    classifier.fit(X_train, y_train)
    
    X_test = [r for r in X_test.apply(lambda sentence: gen_sentence_features(w2v, process_features(sentence)))]
    y_test = list(y_test)
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]
    
    
    print(gen_scores(y_test, y_pred, y_pred_proba))
    

NameError: name 'LabeledSentence' is not defined

In [265]:
{'roc': 0.81284205049237268, 'precision': 0.7266265060240964, 'recall': 0.75718769617074699, 'f1': 0.7415923762680604}
{'roc': 0.81494818875165942, 'precision': 0.73140742509251522, 'recall': 0.76263380632312672, 'f1': 0.7466942904149656}

'0 ab'

In [194]:
X = [r for r in X_train[0:2]]
X

[array([ 0.62771392, -0.37692767, -0.05795803, -0.51460409,  0.56070054]),
 array([ 0.5270079 , -0.18355932, -0.14096526, -0.44264327,  0.32411522])]

In [195]:
lr = LinearRegression()
# X = np.array([np.array([1, 2]), np.array([3, 4])])
y = [1, 2]
X
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

# Save Model

In [362]:
tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
                     token_pattern=r'\w{1,}', ngram_range=(1, 3),
                     use_idf=1, smooth_idf=1,
                     sublinear_tf=1, stop_words='english')
    
# gen features
X = [r for r in df['X'].apply(lambda sentence: process_features(sentence))]
X = tfv.fit_transform(X)
y = list(df['Y'])
classifier = LogisticRegression()
#     classifier = GradientBoostingClassifier()
classifier.fit(X, y)

with open('classifier.p', 'wb') as f:
    pickle.dump((classifier, tfv), f)

# print(gen_scores(y_test, y_pred, y_pred_proba))