# 1.Loading JSON dataset

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import pandas as pd 
import numpy as np 
import nltk
import json
import os
import re

def read_truth(path):
    data = []
    index = []
    for i, line in enumerate(open(path + '/truth.jsonl', 'r')):
        instance = json.loads(line)
        data.append(instance)
        index.append(instance['id'])
    df = pd.DataFrame(data=data, index=index)
    df.sort_index(inplace=True)
    return df

def read_instance(path):
    data = []
    index = []
    for i, line in enumerate(open(path + '/instances.jsonl', 'rb')):
        instance = json.loads(line)
        data.append(instance)
        index.append(instance['id'])
    df = pd.DataFrame(data=data, index=index)
    df.sort_index(inplace=True)
    return df


train_truth = read_truth('./clickbait17-train-170331')
train_instances = read_instance('./clickbait17-train-170331')
validation_truth = read_truth('./clickbait17-validation-170630')
validation_instances = read_instance('./clickbait17-validation-170630')



In [2]:
train = pd.merge(train_truth, train_instances, on = 'id')
validation = pd.merge(validation_truth, validation_instances, on = 'id')
data = pd.concat([train, validation],ignore_index = True)
data['truthClass'] = data['truthClass'].map({'clickbait':True ,'no-clickbait':False}).astype(bool)

# 2.Basic Feature Extraction

## 2.1 number of words

In [3]:
char_title = []
word_title = []
char_paragraph = []
word_paragraph = []
char_caption = []
word_caption = []

char_title = data['targetTitle'].str.len()
word_title = data['targetTitle'].apply(lambda x: len(str(x).split(' ')))
char_paragraph = data['targetParagraphs'].str.len()
word_paragraph = data['targetParagraphs'].apply(lambda x: len(str(x).split(' ')))
char_caption = data['targetCaptions'].str.len()
word_caption = data['targetCaptions'].apply(lambda x: len(str(x).split(' ')))

In [4]:
def avg_word(sentence):
    sentence = str(sentence)
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

avg_paragraph = []
avg_paragraph = data['targetParagraphs'].apply(lambda x: avg_word(x))

In [5]:
char_feature = pd.DataFrame([char_caption,
                             word_caption,
                             char_title,
                             word_title,
                             char_paragraph,
                             word_paragraph,
                             avg_paragraph])
char_feature = char_feature.T
char_feature.rename(columns = {0:'char_caption',
                               1:'word_caption',
                               2:'char_title',
                               3:'word_title',
                               4:'char_paragraph',
                               5:'word_paragraph',
                               6:'avg_paragraph',
                              },inplace = True)

In [21]:
char_feature

Unnamed: 0,targetCaptions,targetCaptions.1,targetTitle,targetTitle.1,targetParagraphs,targetParagraphs.1,targetParagraphs.2
0,0.0,1.0,40.0,6.0,2.0,534.0,4.076779
1,10.0,296.0,106.0,19.0,8.0,192.0,5.213542
2,0.0,1.0,70.0,12.0,12.0,177.0,4.807910
3,1.0,26.0,58.0,10.0,14.0,851.0,4.628672
4,8.0,33.0,97.0,16.0,8.0,105.0,5.342857
5,0.0,1.0,103.0,19.0,6.0,104.0,5.673077
6,1.0,48.0,79.0,11.0,11.0,243.0,5.358025
7,1.0,3.0,19.0,3.0,11.0,331.0,4.993958
8,6.0,10.0,134.0,25.0,13.0,343.0,4.886297
9,0.0,1.0,50.0,7.0,0.0,1.0,2.000000


## 2.2 Text feature

In [7]:
num_at = []
num_acronym = []
num_baity = []#!
num_cap = []
num_digit = []
num_exclm = []
num_money = []
num_ques = []
num_quote = []#!
num_tag = []
num_pic = []
num_parenthesis = []
is_start_num = []
is_superlative = []
is_start5w1h = []

table_currency = ['¥','$','€','£','￠']
table_bracket = ['(',')','[',']','{','}']
table_quote = ["'m","'re","'ve","'d","'s","s'"]
table_baity = ["click here","exclusive","won't believe","happen next","don't want","you know"]
table_5w1h = ["what","why","when","who","which","how"]

for caption in data['targetCaptions']:
    num_pic.append(len(set(caption)))  

for sentences in data['targetTitle']:
    sentences = str(sentences)
    tmp = sentences.split()
    s = str(tmp[0])
    
    question_mark = 0
    start_digit = False
    start_5w1h = False
    parenthesis = 0
    superlative = False
    exclamation = 0
    digital = 0
    acronym = 0
    capital = 0
    money = 0
    baity = 0
    quote = 0
    start = 0
    hash_tag = 0
    at = 0
    
    text = nltk.word_tokenize(sentences)
    part_of_speech = nltk.pos_tag(text)
    for token,tag in part_of_speech:
        if tag in ['RBS','JJS']:
            superlative  = True
            
    if s in table_5w1h:
        start_5w1h = True
    if s.isdigit():
        start_digit = True
        
    for word in sentences.split():        
        if len(word) <= 5:
            acronym += 1
        if len(word) > 5:
            if word.isupper():
                capital += 1                
        for char in word:
            if char == '!':
                exclamation += 1
            if char == '?':
                question_mark += 1
            if char in table_currency:
                money += 1
            if char in table_bracket:
                parenthesis += 1
            if char == '@':
                at += 1
            if char == '#':
                hash_tag += 1
            if char.isdigit:
                digital += 1
    
    num_acronym.append(acronym)
    num_at.append(at)
    num_cap.append(capital)
    num_digit.append(digital)
    num_exclm.append(exclamation)
    num_money.append(money)
    num_parenthesis.append(parenthesis)
    num_ques.append(question_mark)
    num_tag.append(hash_tag)
    is_start5w1h.append(start_5w1h)
    is_start_num.append(start_digit)
    is_superlative.append(superlative)
#     num_baity.append(baity)
#     num_quote.append(quote)


In [8]:
title_feature = pd.DataFrame([num_acronym,
                             num_at,
                             num_cap,
                             num_digit,
                             num_exclm,
                             num_money,
                             num_parenthesis,
                             num_ques,
                             num_tag,
                             num_pic,
                             is_start_num, 
                             is_superlative,
#                              is_start5w1h
                               ])
title_feature = title_feature.T
title_feature.rename(columns = {0:'num_acronym',
                               1:'num_at',
                               2:'num_cap',
                               3:'num_digit',
                               4:'num_exclm',
                               5:'num_money',
                               6:'num_parenthesis',
                               7:'num_ques',
                               8:'num_tag',
                               9:'num_pic',
                               10:'is_start_num',  
                               11:'is_superlative',
#                                10:'is_start5w1h'
                              },inplace = True)

In [9]:
title_feature.describe()

Unnamed: 0,num_acronym,num_at,num_cap,num_digit,num_exclm,num_money,num_parenthesis,num_ques,num_tag,num_pic,is_start_num,is_superlative
count,21997,21997,21997,21997,21997,21997,21997,21997,21997,21997,21997,21997
unique,81,2,4,264,4,5,3,7,3,98,2,2
top,6,0,0,53,0,0,0,0,0,1,False,False
freq,2996,21988,21599,656,21513,21398,21688,20897,21941,6495,21095,21009


## 2.3 Informality and Forward Reference features
+ CLScore = 0.0588*L - 0.296*S - 15.8  

   L = average number of letters     
   S = average number of sentence per 100 words  
   

+ RIX = LW/S  

+ LIX = W/S + (100*LW)/W  

   W = number of words     
   LW number of long words(7+ characters)     
   S = number of sentence  
   
   
+ Formality Measure (fmeasure):  

   (nounfreq+adjectivefreq+prepositionfreq+particlefreq-pronounfreq-verbfreq-adverbfreq-interjectionfreq+100)*0.5   
   

+ Sentiment Analysis:  

   extract polarity as it indicates the sentiment as value nearer to 1 means a positive sentiment and values nearer to -1 means a negative sentiment.

In [10]:
RIX = []
LIX = []
sentiment = []
CLScore = []
f_measure = []

for paragraph in data['targetParagraphs']:
    
    avg_letter = 0
    avg_sentence = 0
    num_word = 0
    num_char = 0
    num_sentence = 0
    long_word = 0
    noun_freq = 0
    adjective_freq = 0
    preposition_freq = 0
    particle_freq = 0
    pronoun_freq = 0
    verb_freq = 0
    adverb_freq = 0
    interjection_freq = 0  
    measure = 0
    
    paragraph = str(paragraph)
    text = nltk.word_tokenize(paragraph)
    part_of_speech = nltk.pos_tag(text)
    for token,tag in part_of_speech:
        if tag in ['NN','NNS','NNP','NNPS']:
            noun_freq += 1
        if tag in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            verb_freq += 1
        if tag == 'UH':
            interjection_freq += 1
        if tag in ['RB','RBS','RBR']:
            adverb_freq += 1
        if tag == 'RP':
            particle_freq += 1
        if tag in ['JJ','JJR','JJS'] :
            adjective_freq += 1
        if tag == 'in':
            preposition_freq += 1
        if tag in ['WRB','WP$','WP','PRP$','PRP']:
            pronoun_freq += 1
    measure = (noun_freq+adjective_freq+preposition_freq+particle_freq-pronoun_freq-verb_freq-adverb_freq-interjection_freq)
        
    for word in paragraph.split():
        num_word += 1
        num_char += len(word)
        if len(word) >= 7:
            long_word += 1
        for char in word:
            if char == '.':
                num_sentence += 1
    
    h_word = num_word//100
    if h_word == 0:
        h_word = 1
    if num_sentence == 0:
        num_sentence = 1
    if num_word == 0:
        num_word = 1

    avg_letter = round((num_char / num_word), 1)
    avg_sentence = round((num_sentence / h_word), 1)
    RIX.append(round(long_word/num_sentence, 1))
    CLScore.append(round(0.0588*avg_letter-0.296*avg_sentence-15.8, 1))
    sentiment.append(round(TextBlob(paragraph).sentiment[0], 1))
    LIX.append(round(num_word/num_sentence + (100*long_word)/num_word, 1))
    f_measure.append((measure+100)/2)
    

In [11]:
content_feature = pd.DataFrame([RIX,
                             LIX,
                             sentiment,
                             CLScore,
                             f_measure,
                               ])
content_feature = content_feature.T
content_feature.rename(columns = {0:'RIX',
                               1:'LIX',
                               2:'sentiment',
                               3:'CLScore',
                               4:'f_measure',
                              },inplace = True)

In [12]:
content_feature.describe()

Unnamed: 0,RIX,LIX,sentiment,CLScore,f_measure
count,21997.0,21997.0,21997.0,21997.0,21997.0
mean,5.565254,47.946152,0.095872,-17.39679,96.145906
std,3.774795,12.432667,0.110498,0.930461,69.958521
min,0.0,1.0,-1.0,-35.2,-521.5
25%,4.0,42.3,0.0,-17.7,63.0
50%,5.2,47.5,0.1,-17.3,81.5
75%,6.5,52.5,0.1,-16.9,111.0
max,142.0,515.2,1.0,-10.1,3373.5


# 3.Basic Pre-processing

## 3.1 Lower case Removing Punctuation Removal of Stop Words

In [13]:
en_corpus = stopwords.words('english')
stop_words = []
caption = []
content = []
title = []

for sentence in data['targetParagraphs']:
    sentence = str(sentence).replace('[^\w\s]','')
    sentence = re.sub('[\u0060|\u0021-\u002c|\u002e-\u002f|\u003a-\u003f|\u2200-\u22ff|\uFB00-\uFFFD|\u2E80-\u33FF]',' ',sentence)
    stop_cnt = 0
    string = ''
#     sentence = TextBlob(sentence).correct()
    for word in sentence.split():
        if word in en_corpus:
            stop_cnt += 1
        if word not in en_corpus:            
            string += ' ' + word.lower()
    stop_words.append(stop_cnt)
    content.append(string)    

for sentence in data['targetCaptions']:
    string = ''
    sentence = str(sentence).replace('[^\w\s]','')
    sentence = re.sub('[\u0060|\u0021-\u002c|\u002e-\u002f|\u003a-\u003f|\u2200-\u22ff|\uFB00-\uFFFD|\u2E80-\u33FF]',' ',sentence)
    for word in sentence.split():
        if word not in en_corpus:            
            string += ' ' + word.lower()
    caption.append(string) 

for sentence in data['targetTitle']:
    string = ''
    sentence = str(sentence).replace('[^\w\s]','')
    sentence = re.sub('[\u0060|\u0021-\u002c|\u002e-\u002f|\u003a-\u003f|\u2200-\u22ff|\uFB00-\uFFFD|\u2E80-\u33FF]',' ',sentence)
    for word in sentence.split():
        if word not in en_corpus:            
            string += ' ' + word.lower()
    title.append(string)
    
corpus = title + content + caption

## 3.2 Common / Rare words removal & Lemmatization

In [14]:
target_caption = []
target_content = []
target_title = []

common = pd.Series(' '.join(caption).split()).value_counts()[:10]
rare = pd.Series(' '.join(caption).split()).value_counts()[-10:]  

for sentence in caption:
    string = ''
    lemma = ''
    sentence = str(sentence)
    for word in sentence.split():
        if word not in common:
            if word  not in rare:
                lemma = Word(word).lemmatize()
                string += ' ' + lemma
    target_caption.append(string)

common = pd.Series(' '.join(content).split()).value_counts()[:10]
rare = pd.Series(' '.join(content).split()).value_counts()[-10:]  

for sentence in content:
    string = ''
    sentence = str(sentence)
    for word in sentence.split():
        if word not in common:
            if word  not in rare:
                lemma = Word(word).lemmatize()
                string += ' ' + lemma
    target_content.append(string)
    
common = pd.Series(' '.join(title).split()).value_counts()[:10]
rare = pd.Series(' '.join(title).split()).value_counts()[-10:]  

for sentence in title:
    string = ''
    sentence = str(sentence)
    for word in sentence.split():
        if word not in common:
            if word  not in rare:
                lemma = Word(word).lemmatize()
                string += ' ' + lemma
    target_title.append(string) 

target_corpus = target_caption + target_content + target_title

# 4.Advance Text Processing
## 4.1 N-grams

In [15]:
TextBlob(corpus[0]).ngrams(2)

[WordList(['tony', 'nominees']),
 WordList(['nominees', 'craziest']),
 WordList(['craziest', 'moments']),
 WordList(['moments', 'stage'])]

## 4.2 TF-IDF

In [16]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(target_corpus)

train_vect

<65991x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2246604 stored elements in Compressed Sparse Row format>

In [17]:
count_vectorizer = CountVectorizer(stop_words='english', analyzer='word',ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)
count_train = count_vectorizer.fit(target_corpus)

bag_of_words = count_vectorizer.transform(target_corpus)

## 4.3 BoW

In [18]:
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(target_corpus)

train_bow

<65991x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2729068 stored elements in Compressed Sparse Row format>

## 4.4 Word Embeddings

In [19]:
glove_input_file = 'glove.840B.300d.txt'
word2vec_output_file = 'glove.840B.300d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
filename = 'glove.840B.300d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [20]:
filename = 'glove.840B.300d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
model['Award']

array([-5.2931e-01,  1.0563e-01,  7.0719e-01, -3.3744e-01,  9.8959e-02,
       -1.5438e-01,  4.2615e-01, -4.1893e-01,  2.1159e-01,  1.7729e+00,
       -5.0394e-01,  5.6178e-01, -4.4557e-01, -2.8586e-01,  2.7032e-01,
        1.2223e-01,  1.3975e-01,  1.6558e-02,  1.4767e-01,  3.5419e-01,
        7.7916e-02, -6.5874e-01,  3.0605e-01,  4.0619e-02,  2.3684e-01,
       -2.2807e-01, -4.5168e-01, -6.0087e-01,  3.8819e-01,  3.3854e-01,
        5.7853e-01, -2.3238e-01, -5.8090e-01,  8.5179e-01,  2.2848e-01,
       -4.4208e-01, -1.2248e-01, -1.7127e-01,  1.8607e-02,  1.8128e-01,
        4.6069e-01,  3.8400e-01,  1.1403e-02,  1.2466e-01,  5.6139e-01,
       -4.6498e-01, -3.9308e-01,  3.3873e-02, -2.9292e-01, -2.3314e-01,
       -2.9408e-02, -4.8276e-01,  4.1263e-02, -2.2170e-01,  3.5932e-01,
        3.7005e-01,  2.9704e-01, -5.2709e-02, -1.4488e-01,  1.8236e-01,
       -2.1814e-01, -1.0018e-01,  9.0964e-01, -1.9599e-01,  2.9826e-01,
       -3.2692e-01,  4.5438e-01, -6.5441e-01, -9.9964e-02,  1.86