<h1> <b>DM Second project</b> </h1> 

## Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets as ds
import sklearn.model_selection as cv
import sklearn.neighbors as nb
import pandas as pd
import nltk
import re

In [None]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models, the stop-word lists, and the averaged-perceptron POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [None]:
from collections import Counter
from nltk import word_tokenize, sent_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

## Preprocessing

### Feature Extraction

| Nr. Stopwords | POS tag | Nr. Words | Readability Score | MTLD |
|---|---|---|---|---|
|   |   |   |   |   |
|   |   |   |   |   |
|   |   |   |   |   |

### Reading data

In [None]:
def _read_titles(path):
    """Return the non-empty lines of ``path`` as a list of headline strings."""
    with open(path, encoding='utf-8') as handle:
        stripped = (line.rstrip('\r\n') for line in handle)
        return [line for line in stripped if line]


# One headline per line. The files are read directly instead of through
# ``pd.read_csv(sep='\n')``: recent pandas releases reject a line terminator
# as separator, and the CSV parser would also interpret quote characters
# that appear inside headlines.
dt_true = pd.DataFrame({'Title': _read_titles('clickbait_data.txt')})
dt_false = pd.DataFrame({'Title': _read_titles('non_clickbait_data.txt')})

# Label column: True for rows from the click-bait file, False for the others.
dt_true = dt_true.assign(Label=True)
dt_false = dt_false.assign(Label=False)


In [None]:
## UNCOMMENT WHEN NEEDED
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from sklearn.utils import shuffle
# Stack the click-bait and non-click-bait rows into one frame with a fresh
# 0..n-1 index.
# NOTE(review): `shuffle` is imported but not called in this cell.
df = pd.concat([dt_true, dt_false], ignore_index=True)

### Tokenizing

In [None]:
# Split every headline into word and punctuation tokens with NLTK.
df['Tokenized'] = df['Title'].apply(word_tokenize)
# Preview the first ten rows with the new column.
df.head(10)

Unnamed: 0,Title,Label,Tokenized
0,Should I Get Bings,True,"[Should, I, Get, Bings]"
1,Which TV Female Friend Group Do You Belong In,True,"[Which, TV, Female, Friend, Group, Do, You, Be..."
2,"The New ""Star Wars: The Force Awakens"" Trailer...",True,"[The, New, ``, Star, Wars, :, The, Force, Awak..."
3,Bill Changing Credit Card Rules Is Sent to Oba...,False,"[Bill, Changing, Credit, Card, Rules, Is, Sent..."
4,"In Hollywood, the Easy-Money Generation Toughe...",False,"[In, Hollywood, ,, the, Easy-Money, Generation..."
5,1700 runners still unaccounted for in UK's Lak...,False,"[1700, runners, still, unaccounted, for, in, U..."


### Eliminating numbers

In [None]:
# Matches a token that is entirely a number: optional sign, optional integer
# part, optional '.' or ',' separator, digits, and an optional exponent.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
number = re.compile(r'[+-]?\d*[.,]?\d+([eE][+-]?\d+)?')

# Token lists with the purely numeric tokens removed.
df['TkNoNr'] = df['Tokenized'].apply(
    lambda tokens: [tok for tok in tokens if not number.fullmatch(tok)])
# True when at least one numeric token was dropped from the headline.
df['HasNumbers'] = df['Tokenized'].apply(len) != df['TkNoNr'].apply(len)


In [None]:
# Preview the TkNoNr / HasNumbers columns.
df.head()

Unnamed: 0,Title,Label,Tokenized,TkNoNr,HasNumbers
0,Should I Get Bings,True,"[Should, I, Get, Bings]","[Should, I, Get, Bings]",False
1,Which TV Female Friend Group Do You Belong In,True,"[Which, TV, Female, Friend, Group, Do, You, Be...","[Which, TV, Female, Friend, Group, Do, You, Be...",False
2,"The New ""Star Wars: The Force Awakens"" Trailer...",True,"[The, New, ``, Star, Wars, :, The, Force, Awak...","[The, New, ``, Star, Wars, :, The, Force, Awak...",False
3,"This Vine Of New York On ""Celebrity Big Brothe...",True,"[This, Vine, Of, New, York, On, ``, Celebrity,...","[This, Vine, Of, New, York, On, ``, Celebrity,...",False
4,A Couple Did A Stunning Photo Shoot With Their...,True,"[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...","[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...",False


# POS Tagging

In [None]:
# Part-of-speech tag each headline; tagging runs on the token lists from
# which numeric tokens were already removed (TkNoNr).
df['Pos_Tag'] = df['TkNoNr'].apply(nltk.pos_tag)

In [None]:
# Preview the (token, tag) pairs stored in Pos_Tag.
df.head(10)

Unnamed: 0,Title,Label,Tokenized,TkNoNr,HasNumbers,Pos_Tag
0,Should I Get Bings,True,"[Should, I, Get, Bings]","[Should, I, Get, Bings]",False,"[(Should, MD), (I, PRP), (Get, VB), (Bings, NNS)]"
1,Which TV Female Friend Group Do You Belong In,True,"[Which, TV, Female, Friend, Group, Do, You, Be...","[Which, TV, Female, Friend, Group, Do, You, Be...",False,"[(Which, JJ), (TV, NN), (Female, NNP), (Friend..."
2,"The New ""Star Wars: The Force Awakens"" Trailer...",True,"[The, New, ``, Star, Wars, :, The, Force, Awak...","[The, New, ``, Star, Wars, :, The, Force, Awak...",False,"[(The, DT), (New, NNP), (``, ``), (Star, NNP),..."
3,"This Vine Of New York On ""Celebrity Big Brothe...",True,"[This, Vine, Of, New, York, On, ``, Celebrity,...","[This, Vine, Of, New, York, On, ``, Celebrity,...",False,"[(This, DT), (Vine, NNP), (Of, IN), (New, NNP)..."
4,A Couple Did A Stunning Photo Shoot With Their...,True,"[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...","[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...",False,"[(A, DT), (Couple, NNP), (Did, NNP), (A, NNP),..."
5,How To Flirt With Queer Girls Without Making A...,True,"[How, To, Flirt, With, Queer, Girls, Without, ...","[How, To, Flirt, With, Queer, Girls, Without, ...",False,"[(How, WRB), (To, TO), (Flirt, NNP), (With, IN..."
6,32 Cute Things To Distract From Your Awkward T...,True,"[32, Cute, Things, To, Distract, From, Your, A...","[Cute, Things, To, Distract, From, Your, Awkwa...",True,"[(Cute, JJ), (Things, NNS), (To, TO), (Distrac..."
7,If Disney Princesses Were From Florida,True,"[If, Disney, Princesses, Were, From, Florida]","[If, Disney, Princesses, Were, From, Florida]",False,"[(If, IN), (Disney, NNP), (Princesses, NNPS), ..."
8,What's A Quote Or Lyric That Best Describes Yo...,True,"[What, 's, A, Quote, Or, Lyric, That, Best, De...","[What, 's, A, Quote, Or, Lyric, That, Best, De...",False,"[(What, WP), ('s, VBZ), (A, DT), (Quote, NNP),..."
9,Natalie Dormer And Sam Claflin Play A Game To ...,True,"[Natalie, Dormer, And, Sam, Claflin, Play, A, ...","[Natalie, Dormer, And, Sam, Claflin, Play, A, ...",False,"[(Natalie, NNP), (Dormer, NNP), (And, CC), (Sa..."


## **Adding number of words and sentences as features**

In [None]:
# Token count per headline, taken from the unfiltered token list.
df['NumberWords'] = df['Tokenized'].map(len)

# Sentence segmentation of the raw title, followed by the sentence count.
df['SentenceTokenized'] = df['Title'].map(sent_tokenize)
df['NumberSentence'] = df['SentenceTokenized'].map(len)


NameError: ignored

In [None]:
# Preview the word and sentence count features.
df.head(10)

Unnamed: 0,Title,Label,Tokenized,TkNoNr,HasNumbers,Pos_Tag,NumberWords,SentenceTokenized,NumberSentence
0,Should I Get Bings,True,"[Should, I, Get, Bings]","[Should, I, Get, Bings]",False,"[(Should, MD), (I, PRP), (Get, VB), (Bings, NNS)]",4,[Should I Get Bings],1
1,Which TV Female Friend Group Do You Belong In,True,"[Which, TV, Female, Friend, Group, Do, You, Be...","[Which, TV, Female, Friend, Group, Do, You, Be...",False,"[(Which, JJ), (TV, NN), (Female, NNP), (Friend...",9,[Which TV Female Friend Group Do You Belong In],1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",True,"[The, New, ``, Star, Wars, :, The, Force, Awak...","[The, New, ``, Star, Wars, :, The, Force, Awak...",False,"[(The, DT), (New, NNP), (``, ``), (Star, NNP),...",17,"[The New ""Star Wars: The Force Awakens"" Traile...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",True,"[This, Vine, Of, New, York, On, ``, Celebrity,...","[This, Vine, Of, New, York, On, ``, Celebrity,...",False,"[(This, DT), (Vine, NNP), (Of, IN), (New, NNP)...",14,"[This Vine Of New York On ""Celebrity Big Broth...",1
4,A Couple Did A Stunning Photo Shoot With Their...,True,"[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...","[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...",False,"[(A, DT), (Couple, NNP), (Did, NNP), (A, NNP),...",18,[A Couple Did A Stunning Photo Shoot With Thei...,1
5,How To Flirt With Queer Girls Without Making A...,True,"[How, To, Flirt, With, Queer, Girls, Without, ...","[How, To, Flirt, With, Queer, Girls, Without, ...",False,"[(How, WRB), (To, TO), (Flirt, NNP), (With, IN...",13,[How To Flirt With Queer Girls Without Making ...,1
6,32 Cute Things To Distract From Your Awkward T...,True,"[32, Cute, Things, To, Distract, From, Your, A...","[Cute, Things, To, Distract, From, Your, Awkwa...",True,"[(Cute, JJ), (Things, NNS), (To, TO), (Distrac...",9,[32 Cute Things To Distract From Your Awkward ...,1
7,If Disney Princesses Were From Florida,True,"[If, Disney, Princesses, Were, From, Florida]","[If, Disney, Princesses, Were, From, Florida]",False,"[(If, IN), (Disney, NNP), (Princesses, NNPS), ...",6,[If Disney Princesses Were From Florida],1
8,What's A Quote Or Lyric That Best Describes Yo...,True,"[What, 's, A, Quote, Or, Lyric, That, Best, De...","[What, 's, A, Quote, Or, Lyric, That, Best, De...",False,"[(What, WP), ('s, VBZ), (A, DT), (Quote, NNP),...",11,[What's A Quote Or Lyric That Best Describes Y...,1
9,Natalie Dormer And Sam Claflin Play A Game To ...,True,"[Natalie, Dormer, And, Sam, Claflin, Play, A, ...","[Natalie, Dormer, And, Sam, Claflin, Play, A, ...",False,"[(Natalie, NNP), (Dormer, NNP), (And, CC), (Sa...",21,[Natalie Dormer And Sam Claflin Play A Game To...,1


### Stemming

In [None]:
from nltk.stem.snowball import EnglishStemmer
# from nltk.stem.wordnet import WordNetLemmatizer

# Snowball stemmer configured so that English stop words are not stemmed.
steamer = EnglishStemmer(ignore_stopwords=True)
# Stem every non-numeric token of each headline.
df['Snowball'] = df['TkNoNr'].apply(
    lambda tokens: list(map(steamer.stem, tokens)))

KeyError: ignored

In [None]:
# Preview the stemmed tokens.
df.head()

NameError: ignored

### Eliminating stop-words

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, kept as a list under its original name.
stop_words = stopwords.words('english')
# Set copy so each membership test is a hash lookup rather than a list scan.
_stop_word_set = set(stop_words)
# Drop stop words from the stemmed token lists.
df['NoStopWords'] = df['Snowball'].apply(
    lambda tokens: [tok for tok in tokens if tok not in _stop_word_set])


## Calculate number of stop words

In [None]:
# Stop-word count = tokens before filtering minus tokens after filtering.
# Comparing 'Snowball' with 'NoStopWords' only works while stop-word removal
# directly follows stemming; revisit if the preprocessing order changes.
df['NumStopWords'] = df['Snowball'].map(len) - df['NoStopWords'].map(len)
df.head()

Unnamed: 0,Title,Label,Tokenized,TkNoNr,HasNumbers,Pos_Tag,NumberWords,SentenceTokenized,NumberSentence,Snowball,NoStopWords,NumStopWords
0,Should I Get Bings,True,"[Should, I, Get, Bings]","[Should, I, Get, Bings]",False,"[(Should, MD), (I, PRP), (Get, VB), (Bings, NNS)]",4,[Should I Get Bings],1,"[should, i, get, bing]","[get, bing]",2
1,Which TV Female Friend Group Do You Belong In,True,"[Which, TV, Female, Friend, Group, Do, You, Be...","[Which, TV, Female, Friend, Group, Do, You, Be...",False,"[(Which, JJ), (TV, NN), (Female, NNP), (Friend...",9,[Which TV Female Friend Group Do You Belong In],1,"[which, tv, femal, friend, group, do, you, bel...","[tv, femal, friend, group, belong]",4
2,"The New ""Star Wars: The Force Awakens"" Trailer...",True,"[The, New, ``, Star, Wars, :, The, Force, Awak...","[The, New, ``, Star, Wars, :, The, Force, Awak...",False,"[(The, DT), (New, NNP), (``, ``), (Star, NNP),...",17,"[The New ""Star Wars: The Force Awakens"" Traile...",1,"[the, new, ``, star, war, :, the, forc, awaken...","[new, ``, star, war, :, forc, awaken, '', trai...",6
3,"This Vine Of New York On ""Celebrity Big Brothe...",True,"[This, Vine, Of, New, York, On, ``, Celebrity,...","[This, Vine, Of, New, York, On, ``, Celebrity,...",False,"[(This, DT), (Vine, NNP), (Of, IN), (New, NNP)...",14,"[This Vine Of New York On ""Celebrity Big Broth...",1,"[this, vine, of, new, york, on, ``, celebr, bi...","[vine, new, york, ``, celebr, big, brother, ''...",4
4,A Couple Did A Stunning Photo Shoot With Their...,True,"[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...","[A, Couple, Did, A, Stunning, Photo, Shoot, Wi...",False,"[(A, DT), (Couple, NNP), (Did, NNP), (A, NNP),...",18,[A Couple Did A Stunning Photo Shoot With Thei...,1,"[a, coupl, did, a, stun, photo, shoot, with, t...","[coupl, stun, photo, shoot, babi, learn, inope...",9


### Eliminating some punctuation signs

In [None]:
# Remove comma, full-stop and underscore tokens; every other token is kept.
df['Last'] = df['NoStopWords'].apply(
    lambda tokens: [tok for tok in tokens if tok not in (',', '.', '_')])


### TF-IDF

In [None]:
from collections import Counter

# Corpus-wide occurrence count of every cleaned token.
global_freq = Counter(token for tokens in df['Last'] for token in tokens)

# Per-headline token counts.
df['Freq'] = [Counter(tokens) for tokens in df['Last']]


In [None]:
# dt_true['TF'] = list(map(lambda v, f: [f[x] / sum(f.values()) for x in v], dt_true['Last'], dt_true['Freq']))
# dt_false['TF'] = list(map(lambda v, f: [f[x] / sum(f.values()) for x in v], dt_false['Last'], dt_false['Freq']))

*Very* costly 3m

In [None]:
# df = {k: 0 for k in global_freq.keys()}

# for word in global_freq.keys():
#     df[word] = sum([word in doc for doc in dt_true['Last']]) + sum([word in doc for doc in dt_false['Last']])

In [None]:
# Vocabulary: the distinct tokens of the corpus (a view on the Counter keys).
wordlist = global_freq.keys()
# Despite its name, `inverse` maps token -> position in the dense vectors.
inverse = {token: position for position, token in enumerate(wordlist)}
nr_words = len(wordlist)
# One zero-filled list of vocabulary length per headline.
df['IDF'] = [[0] * nr_words for _ in df['Last']]
df['TF'] = [[0] * nr_words for _ in df['Last']]


#### Term frequency

In [None]:
# Fill each headline's TF vector in place: occurrences of the token divided
# by the total number of tokens in that headline.
for counts, tf_vector in zip(df['Freq'], df['TF']):
    total = sum(counts.values())
    for token in counts:
        tf_vector[inverse[token]] = counts[token] / total



#### Inverse document frequency

In [None]:
# Number of documents (headlines). Every row of `df` is one document, so it is
# counted once; adding the length twice doubled N and shifted every IDF value
# by log(2).
nr_docs = len(df['Last'])

In [None]:
# Document frequency: in how many headlines each token occurs at least once.
term_doc_frq = dict.fromkeys(global_freq, 0)

for doc in df['Last']:
    # The set collapses repeats so a headline counts once per distinct token.
    for word in set(doc):
        term_doc_frq[word] += 1



In [None]:
from math import log

# IDF per vocabulary position: log(N / (1 + document frequency)).
idf = [0] * nr_words
for term, position in inverse.items():
    idf[position] = log(nr_docs / (1 + term_doc_frq[term]))


#### Final TF*IDF

In [None]:
# Zero-filled TF-IDF list of vocabulary length for every headline.
df['TF_IDF'] = [[0] * nr_words for _ in df['Last']]


In [None]:
# Multiply TF by IDF, touching only the positions of tokens that actually
# occur in the headline; all other entries stay 0.
for counts, tf_vector, tfidf_vector in zip(df['Freq'], df['TF'], df['TF_IDF']):
    for token in counts:
        position = inverse[token]
        tfidf_vector[position] = tf_vector[position] * idf[position]



In [None]:
# Display the per-headline TF-IDF lists.
df['TF_IDF']



NameError: ignored

### Sklearn preprocessing

### Readability feature

In [None]:
# Install the `readability` package from its GitHub master tarball (unpinned)
# and import it.
!pip install https://github.com/andreasvc/readability/tarball/master
import readability
# Alternative kept for reference: the py-readability-metrics package.
#!pip install py-readability-metrics
#!python -m nltk.downloader punkt
#from readability import Readability

SyntaxError: ignored

In [None]:
import readability
text = """
This is an example sentence. Note that tokens will be separated by spaces
and sentences by newlines.

This is the second paragraph."""
# getmeasures() expects tokenized text: one sentence per line, tokens
# separated by spaces. Passing the flat word_tokenize() list would make every
# token its own "sentence", so the text is rebuilt in that layout first.
tokenized = '\n'.join(
    ' '.join(word_tokenize(sentence)) for sentence in sent_tokenize(text))
print(tokenized)
results = readability.getmeasures(tokenized, lang='en')
print(results['readability grades']['FleschReadingEase'])


['This', 'is', 'an', 'example', 'sentence', '.', 'Note', 'that', 'tokens', 'will', 'be', 'separated', 'by', 'spaces', 'and', 'sentences', 'by', 'newlines', '.', 'This', 'is', 'the', 'second', 'paragraph', '.']


AttributeError: ignored

In [None]:
#Readability(df['Title'].astype("string"))
#df['Read_score'] = Readability(df['Title'].astype("string")).flesch_kinkaid()

def _title_measures(title):
    """Return the readability measures for one headline.

    The headline is laid out the way ``readability.getmeasures`` expects:
    one sentence per line, tokens separated by spaces.
    """
    lines = '\n'.join(
        ' '.join(word_tokenize(sentence)) for sentence in sent_tokenize(title))
    return readability.getmeasures(lines, lang='en')


# The original cell called the undefined names `getmeasures()` and
# `flesch_kinkaid`; the measures come from the `readability` module and the
# Flesch-Kincaid grade is read from the returned mapping.
# NOTE(review): headlines without any word tokens may need special handling.
df['Readability'] = df['Title'].apply(_title_measures)
df['Read_score'] = df['Readability'].apply(
    lambda measures: measures['readability grades']['Kincaid'])
    

NameError: ignored

In [None]:
# Preview the readability columns.
df.head()

NameError: ignored

# More customizable function
Note: this function has not been fully checked for bugs.

In [None]:
def preprocess_tfidf(input_data):
    """Tokenize, clean and TF-IDF-encode a collection of raw text documents.

    Pipeline: replace stand-alone numbers by ``NUMBER_SPECIAL_TOKEN``,
    tokenize with NLTK, drop any remaining purely numeric tokens and the
    ',', '.', '_' tokens, stem with the Snowball English stemmer, remove
    English stop words, then build one dense TF-IDF list per document over
    the vocabulary of the whole input.

    Parameters
    ----------
    input_data : iterable of str
        Raw documents. The argument itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``text`` (cleaned tokens),
        ``has_numbers`` (whether the raw document contained a number),
        ``freq`` (token Counter) and ``tf_idf`` (list of vocabulary length,
        values rounded to 6 decimals).
    """
    from math import log  # local import so the function does not rely on another cell

    documents = list(input_data)

    # Substitute numbers by a special token. A new list is built so the
    # caller's sequence is left untouched.
    p = re.compile(r'\b[+-]?\d*[.,]?\d+([eE][+-]?\d+)?\b')
    substituted = [p.search(doc) is not None for doc in documents]
    data = pd.DataFrame(
        {'text': [p.sub('NUMBER_SPECIAL_TOKEN', doc) for doc in documents]})

    data['text'] = data['text'].apply(word_tokenize)  # tokenization

    # Flag documents containing numbers, then drop leftover numeric tokens.
    # The flag also uses `substituted`: numbers already replaced by the
    # special token would otherwise never be detected here.
    number = re.compile(r'[+-]?\d*[.,]?\d+([eE][+-]?\d+)?')
    data['has_numbers'] = [
        replaced or any(number.fullmatch(tok) for tok in tokens)
        for replaced, tokens in zip(substituted, data['text'])]
    data['text'] = data['text'].apply(
        lambda tokens: [tok for tok in tokens if not number.fullmatch(tok)])

    # Eliminating some punctuation signs.
    data['text'] = data['text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in (',', '.', '_')])

    # Stemming.
    stemmer = EnglishStemmer(ignore_stopwords=True)
    data['text'] = data['text'].apply(
        lambda tokens: [stemmer.stem(tok) for tok in tokens])

    # Eliminating stop words (set membership instead of a list scan).
    stop_words = set(stopwords.words('english'))
    data['text'] = data['text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words])

    # Vocabulary in first-occurrence order; `position` maps token -> index.
    global_freq = Counter(tok for tokens in data['text'] for tok in tokens)
    data['freq'] = [Counter(tokens) for tokens in data['text']]
    position = {word: index for index, word in enumerate(global_freq)}
    nr_words = len(position)
    nr_docs = len(data)

    # Document frequency: each document counts once per distinct token.
    term_doc_frq = Counter(
        tok for tokens in data['text'] for tok in set(tokens))
    idf = [log(nr_docs / (1 + term_doc_frq[word])) for word in position]

    # TF * IDF; positions of tokens absent from a document stay 0.
    vectors = []
    for doc_frq in data['freq']:
        freq_sum = sum(doc_frq.values())
        vector = [0] * nr_words
        for word, freq in doc_frq.items():
            index = position[word]
            vector[index] = round(freq / freq_sum * idf[index], 6)
        vectors.append(vector)
    data['tf_idf'] = vectors
    return data


In [None]:
# Smoke test on mixed input: plain words, numbers, exponent notation,
# punctuation and non-English text.
res = preprocess_tfidf(
    ['Hello I will be running at the sport festival this year swill you be there?, f#, asfd, 3',
     'asf, 2, 4, 3fas, 3:3, :',
     '32.3e-2, 43',
     'hola buenos dias'
     ])
# Small four-document corpus; the first document ends with a number.
corpus = [
    'This is the first document. 23',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
res2 = preprocess_tfidf(corpus)
print(res2)



                                      text  has_numbers  \
0  [first, document, number_special_token]        False   
1             [document, second, document]        False   
2                             [third, one]        False   
3                     [first, document, ?]        False   

                                                freq  \
0  {'first': 1, 'document': 1, 'number_special_to...   
1                       {'document': 2, 'second': 1}   
2                             {'third': 1, 'one': 1}   
3                {'first': 1, 'document': 1, '?': 1}   

                                  tf_idf  
0  [0.095894, 0.0, 0.231049, 0, 0, 0, 0]  
1         [0, 0.0, 0, 0.231049, 0, 0, 0]  
2    [0, 0, 0, 0, 0.346574, 0.346574, 0]  
3  [0.095894, 0.0, 0, 0, 0, 0, 0.231049]  


### TF_IDF sklearn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
# Fit once and reuse the sparse result for the DataFrame below.
X = vectorizer.fit_transform(corpus)

# Newer scikit-learn exposes get_feature_names_out(); older releases only
# have get_feature_names(). Use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view, one column per vocabulary term (fine for this tiny corpus).
tf_idf_matrix = pd.DataFrame(X.toarray(), columns=feature_names)

print(tf_idf_matrix)

# print(X.shape)

# print(X.shape)


AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names_out'

# This function (recommended)

In [None]:
# takes a list of strings as parameters
def pre_tfidf2(data):
    """Return the sparse TF-IDF matrix of ``data`` computed with scikit-learn.

    Stand-alone numbers are first replaced by ``NUMBER_SPECIAL_TOKEN``; the
    texts are then vectorized with ``TfidfVectorizer`` using NLTK's English
    stop-word list.

    Parameters
    ----------
    data : iterable of str
        Raw documents. The argument itself is not modified.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` (one row per
    document).
    """
    p = re.compile(r'\b[+-]?\d*[.,]?\d+([eE][+-]?\d+)?\b')
    # Build a new list instead of overwriting the caller's elements in place.
    documents = [p.sub('NUMBER_SPECIAL_TOKEN', doc) for doc in data]
    # vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    X = vectorizer.fit_transform(documents)
    ## UNCOMMENT FOR SMALL EXAMPLES TO PRINT AND UNDERSTAND A BIT THE RESULTS
    # tf_idf_matrix = pd.DataFrame(
    #     X.toarray(),
    #     columns=vectorizer.get_feature_names()
    # )
    # print(tf_idf_matrix)
    return X


# Small example corpus containing an integer, a float in exponent notation
# and plain text.
corpus = [
    'This is the first document. 23',
    'This document 1.23e-120 is the second document 34.',
    'And this is the third one.',
    'Is this the first document?',
]

r = pre_tfidf2(corpus)


In [None]:
# Dense view of the example matrix.
r.toarray()

In [None]:
# NOTE(review): the names are the reverse of the usual convention here:
# `y` holds the headline texts (model input) and `X` holds the labels.
y = list(df['Title'])
X = list(df['Label'])


In [None]:
# TF-IDF matrix of all headlines (`y` is the list of titles).
q = pre_tfidf2(y)

In [None]:
# Convert the sparse matrix to a dense array.
# NOTE(review): for the full corpus this may need a lot of memory; consider
# inspecting a slice such as q[:5] instead.
q.toarray()

In [None]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
q.get_shape()

In [None]:
# Cell count of a dense 32000 x 22335 matrix.
# Presumably these are the dimensions reported by q.get_shape(); TODO confirm.
32000 * 22335

In [None]:
# Total number of cells (rows x columns) in df.
df.size

In [None]:
## Note: xgboost

# POS tag information
# readability score
# number of words/sentence or letters/sentence

# MTLD score
# number of stop words

# logistic regression




# SVM

In [None]:
#from sklearn.svm import SVC
#SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
#SVM.fit(x_train,y_train)

In [None]:
#predictions_SVM = SVM.predict(x_train)

In [None]:
#print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

## Export dataset

In [None]:
# Export disabled: the resulting CSV is too large (4.7 GB).
#df.to_csv('./dataPreProcessed.csv')
# Preview the first twenty rows instead.
df.head(20)

# MTLD (Measure of Textual Lexical Diversity)

The MTLD method is based on the type-token ratio of a text, i.e. the ratio of the number of distinct words--or more generally text units--to the total number of units. Leaving aside the nasty details, the idea is to compute the average length of a sequence of contiguous text units maintaining a type-token ratio above a specified threshold, which is set to 0.72 by McCarthy and Jarvis (2010). They call such a sequence a 'factor' of the text.

In [None]:
!pip install lexical-diversity
from lexical_diversity import lex_div as ld
from nltk.stem.snowball import EnglishStemmer



In [None]:
# Rebuild `df` from the two raw frames; the new frame only carries the columns
# of dt_true / dt_false, so columns added to the previous `df` are not in it.
df = pd.concat([dt_true, dt_false], ignore_index=True)

In [None]:
# NOTE: `y` holds the headline texts and `X` the labels (original names kept).
y = df['Title'].to_list()
X = df['Label']
p = re.compile(r'\b[+-]?\d*[.,]?\d+([eE][+-]?\d+)?\b')
# Lower-case each title and replace stand-alone numbers by the placeholder.
# The pattern compiled in this cell is used: the original called
# `number.sub`, which depended on a pattern defined in a much earlier cell
# (without word boundaries) and left `p` unused.
y = [p.sub('NUMBER_SPECIAL_TOKEN', title.lower()) for title in y]
# Tokenize and compute the MTLD lexical-diversity score of every title.
tk = list(map(word_tokenize, y))
mtld = list(map(ld.mtld, tk))

In [None]:
# display(list(zip(y, mtld)))