In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


from wordcloud import WordCloud, STOPWORDS

import string
from matplotlib import style

  import pandas.util.testing as tm


### In this section, I merged all my datasets and created:
1. 2019/2020 CB vs Non-CB headline dataset (20,000)
    - non-CB data pulled from API and webscraped from NY Times, The Washington Post, The Guardian, Bloomberg, and Reuters 
    - CB data pulled from 6 Twitter profiles (Buzzfeed, Examiner, ThePoliticalInsider, Upworthy, BoredPanda, The Odyssey
    - features - text, class, date
2. 2007 - 2016 CB vs Non-CB headline dataset (30,000)
    - pulled from Kaggle 
    - Non-CB: NY Times, The Guardian, The Hindu, Wikinews
    - CB: BuzzFeed, Upworthy, ViralNova, Thatscoop, Scoopwhoop and ViralStories
    - text, class
3. One total dataset combining both (50,000)
    - text headlines, class

In [5]:
#reading in datasets
clickbait_df=pd.read_csv('clickbait_final.csv',index_col=0)
nytimes_df=pd.read_csv('nytimes_data_final.csv',index_col=0)
guardian_df=pd.read_csv('guardian_headlines.csv',index_col=0)
twp_df=pd.read_csv('TWP_scraped.csv',index_col=0)
reuters_df=pd.read_csv('reuters_100.csv',index_col=0)
bloomberg_df=pd.read_csv('bloomberg_100.csv',index_col=0)

In [56]:
#labeling class to 1 for all clickbait headlines
clickbait_df['class']=1
clickbait_df.shape

(11116, 3)

In [57]:
#labeling 'class' to 0 for non clickbait headlines
nytimes_df['class']=0
nytimes_df.shape

(5299, 3)

In [58]:
guardian_df['class']=0
guardian_df.shape

(3400, 3)

In [36]:
twp_df.rename(columns={'title':"text",'published':'date'},inplace=True)
twp_df['class']=0

In [37]:
reuters_df['class']=0

In [38]:
bloomberg_df['class']=0

In [39]:
all_headlines_df=pd.concat([clickbait_df,nytimes_df,guardian_df,twp_df,reuters_df,bloomberg_df])

In [54]:
all_headlines_df.shape

(20172, 3)

In [60]:
#all_headlines_df.to_csv('2019_2020_all_headlines.csv')

In [71]:
all_headlines_df.drop(columns='date',inplace=True)

In [61]:
dataset2=pd.read_csv('2007_2016_headline_data.csv')

In [68]:
dataset2.rename(columns={'headline':'text','clickbait':'class'},inplace=True)

In [69]:
dataset2

Unnamed: 0,text,class
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


In [72]:
df = pd.concat([all_headlines_df,dataset2])

In [74]:
df.shape

(52172, 2)

In [75]:
df['class'].value_counts()

1    27115
0    25057
Name: class, dtype: int64

In [79]:
#df.to_csv('total_headlines.csv')


In [84]:
df

Unnamed: 0,text,class
0,Trey Gowdy just humiliated Adam Schiff in fron...,1
1,60 Netflix Titles Leaving In July 2020,1
2,Learn how to make a green grape taste like a j...,1
3,The New July Netflix Titles Are Here And There...,1
4,The Courts Say Sex Discrimination Laws Protect...,1
...,...,...
31995,"To Make Female Hearts Flutter in Iraq, Throw a...",0
31996,"British Liberal Democrat Patsy Calton, 56, die...",0
31997,Drone smartphone app to help heart attack vict...,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to D...",0


### In this section, I process the dataset for EDA and create additional features: 

In [2]:
df = pd.read_csv('total_headlines.csv')

In [177]:
#make text lowercase
df['text']=df['text'].apply(lambda x: x.lower())

In [3]:
#function to remove punctuation and non-alphabetical characters and links
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('  ', ' ', text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

In [331]:
#function to find if string contains a question and if so, update new feature with a 1 for yes or 0 for no
question_words = ['who','what','where','why','when','whose','whom','would','will','how','which','should','could']

def contains_question(headline):
    if "?" in headline or headline.startswith(('who','what','where','why','when','whose','whom','would','will','how','which','should','could','did','do')):
        return 1
    else: 
        return 0

df['question_6']=df['text'].apply(contains_question)


In [332]:
#test
#df['text'][0].startswith(('who','what','where','why','when','whose','whom','would','will','how','which','should','could','trey'))

In [337]:
df.rename(columns={'question_6':'question'},inplace=True)

In [340]:
df['question'].value_counts()

0    47388
1     4784
Name: question, dtype: int64

In [341]:
#create function to find if headline contains '!' and create new feature with 1 for yes and 0 for no
def contains_exclamation(headline):
    if "!" in headline: 
        return 1
    else: 
        return 0
df['exclamation']=df['text'].apply(contains_exclamation)

In [344]:
df.exclamation.value_counts()

0    51614
1      558
Name: exclamation, dtype: int64

In [None]:
#clean headlines to remove puncuation and links and then create feature to count words of each (before removing stop words)

#create feature that counts stop words in a headline (before removing stopwords)

#do eda for word frequencies and comparison of classses
