## This notebook does the following:
1. Import dataset and generate/add features
2. Run different classifier models and predict the authenticity of a randomly selected article

## Import libraries and packages

In [1]:
## Import libraries & Packages

# Data structure and visualization
import pandas as pd
import numpy as np
import os
import dill as pickle
%matplotlib inline


# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.preprocessing import MaxAbsScaler

## Import data (balanced_data.csv)

In [2]:
# Load the balanced article dataset, keeping only the CSV columns at positions
# 3, 5 and 6 (the cell output shows them as title, text and authenticity).
path = os.path.join('data', 'balanced_data.csv')
total_df = pd.read_csv(path, usecols=[3,5,6])
# Show the last rows as a quick sanity check of what was loaded.
total_df.tail()

Unnamed: 0,title,text,authenticity
2941,Evangelicals Pray For President In W.H. – Demo...,Evangelicals Pray For President In W.H. – Demo...,1
2942,"Vaccines: Good or Bad?, Part 8","Vaccines: Good or Bad?, Part 8\n\nThe more we ...",1
2943,Father PUNISHES 10-Year-Old Daughter by Strand...,"Christopher Charles Watson, from Kingman, Ariz...",1
2944,Delta purchases 10% stake in Air France-KLM,Delta Air Lines (DAL) and China Eastern Airlin...,0
2945,Can forgotten rubella children of the '60s hol...,"Brooklyn, New York (CNN) One side of the bedro...",0


## Dataset information

## Import additional features
* Author exists (1) / not exists (0)
* Capitalization rate of the title text, normalized to the average of caprate_title
* Rate of exaggerated punctuation [!,?,:,-] in the title text, normalized to the average of exagg_puct_title

In [3]:
# Load the pre-computed title/author features, keeping only the CSV columns at
# positions 1, 2 and 3 (the cell output shows them as author, caprate_title
# and exagg_puct_title). Note that `path` is re-bound here.
path = os.path.join('data', 'additional_features.csv')
addfeat_df = pd.read_csv(path, usecols=[1,2,3])
# Show the first rows as a quick sanity check of what was loaded.
addfeat_df.head()

Unnamed: 0,author,caprate_title,exagg_puct_title
0,1,1.065457,0.0
1,0,1.891909,9.623847
2,0,1.251912,0.0
3,0,0.561081,0.0
4,1,0.745185,0.0


## Split the whole dataset into train, cv, test datasets 

* X_( ) = text of the article body, before processing
* af_( ) = additional features, already processed
* X_ and af_ are first combined and split together so that their rows stay paired, then separated again so that the text can be processed on its own

In [4]:
# Target labels: the `authenticity` column of the article dataset.
Y = total_df['authenticity']

# The text column is attached to the feature frame by index alignment, so both
# frames must describe the same rows. Fail loudly here instead of silently
# ending up with missing (NaN) article texts further down the pipeline.
assert len(addfeat_df) == len(total_df), \
    "additional_features.csv and balanced_data.csv have different row counts"
addfeat_df['text'] = total_df.text
Xaf = addfeat_df

# Split text and additional features in one call so their rows stay paired;
# 20% of the rows go to the cv set and the seed keeps the split reproducible.
Xaf_train, Xaf_cv, Y_train, Y_cv = train_test_split(Xaf, Y, test_size=0.2, random_state=42)

# Separate the raw text (X_*) from the already-processed features (af_*) again
# so the text can be vectorized on its own.
X_train, af_train = Xaf_train.text, Xaf_train.drop('text', axis = 1)
X_cv, af_cv = Xaf_cv.text, Xaf_cv.drop('text', axis = 1)


## Generate features from article body text

In [5]:
import nltk
import string
from nltk.stem.porter import PorterStemmer

# Single module-level Porter stemmer instance; tokenize_stemmer() below passes
# it to stem_tokens().
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving the input order.

    tokens  -- iterable of token strings
    stemmer -- any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]



def tokenize_stemmer(text):
    """Split ``text`` into NLTK word tokens and return their stems as a list.

    Uses the module-level ``stemmer`` via ``stem_tokens``.
    """
    words = nltk.word_tokenize(text)
    # Punctuation tokens are currently kept. To drop them, enable:
    #   words = [w for w in words if w not in string.punctuation]
    return stem_tokens(words, stemmer)

# Persist the tokenizer helpers so they can be loaded outside this notebook.
# The context managers guarantee each file is flushed and closed; the original
# passed a bare open() handle that was never closed.
with open('stem_tokens.pkl', 'wb') as fh:
    pickle.dump(stem_tokens, fh, protocol=2)
with open('tokenize_stemmer.pkl', 'wb') as fh:
    pickle.dump(tokenize_stemmer, fh, protocol=2)

### Method 2: Use TF-IDF to generate features for the text body of news

In [6]:
from sklearn.feature_extraction import text 
# Source/outlet names and weekday names to exclude, so the vectorizer cannot
# use them as features.
my_additional_stop_words = ['abc', 'nbc', 'npr', 'cnn', 'reuters', 'fox', 
                            'bbc', 'cbs', 'newyorker', 'msnbc', 'politico', 'nytimes',
                            'sputniknews', 'lastdeplorables', 'readconservatives', 'wordpress',
                            'infostormer', 'Americannews', 'ABCnews', 'nationonenews', 'majorthoughts',
                            'interestingdailynews', 'donaldtrumppotus45', 'newsbbc', 'beforeitsnews', 
                            'krbcnews', 'Conservativedailypost', 'thedcgazette', 'Americanoverlook', 
                            'CivicTribune', 'openmagazines', 'politicono', 'bizstandardnews', 'president45donaldtrump',
                            'nbc', 'AmericanFlavor', 'prntly', 'bipartisanreport', 'americanfreepress', 
                            'ladylibertysnews', 'politicalo', 'now8news', '24wpn', 'pamelageller', 
                            'ddsnewstrend', 'Bighairynews', 'redcountry', 'newswithviews', 'Clashdaily', 
                            'aurora-news', 'nephef', 'local31news', 'realnewsrightnow', 'reagancoalition',
                            'reuter', 'sputnik', 
                            'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday' ]

# TfidfVectorizer lower-cases documents by default before stop words are
# filtered, so mixed-case entries such as 'ABCnews' could never match a token.
# Lower-case every custom entry before merging; the set union also removes the
# duplicate 'nbc'.
# NOTE(review): stop words are compared against the tokenizer's output, which
# is stemmed in this notebook; entries whose stem differs from the listed form
# may still get through -- confirm against the fitted vocabulary.
my_stop_words = text.ENGLISH_STOP_WORDS.union(word.lower() for word in my_additional_stop_words)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, using the stemming tokenizer and the
# extended stop-word set. The vocabulary is learned from the training texts only.
tfidf = TfidfVectorizer(tokenizer=tokenize_stemmer,stop_words=my_stop_words,ngram_range=(1, 2))
tfidf.fit(X_train)

# Persist the fitted vectorizer; the context manager closes the file, which the
# original bare open() call never did.
with open('tfidf.sav', 'wb') as fh:
    pickle.dump(tfidf, fh, protocol=3)

In [8]:
# Vectorize both splits with the already-fitted TF-IDF model.
X_train_tot, X_cv_tot = (tfidf.transform(docs) for docs in (X_train, X_cv))

# (number of training documents, number of TF-IDF features)
X_train_tot.shape

(2356, 630691)

## Feature selection

In [9]:
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, chi2, SelectFromModel

In [10]:
# Alternative selectors, kept for reference:
#   top 10% of features by ANOVA F-score
#   selector = SelectPercentile(f_classif, percentile = 10)
#   top 10000 features by chi-squared score
#   selector = SelectKBest(chi2, k = 10000)

# Model-based selection: fit an L1-penalized linear SVM and let SelectFromModel
# keep the features it ranks as important.
lsvc = LinearSVC(C=9000, penalty="l1", dual=False)
selector = SelectFromModel(lsvc, prefit=False)
selector.fit(X_train_tot, Y_train)

# Persist the fitted selector; the context manager closes the file, which the
# original bare open() call never did.
with open('selector.sav', 'wb') as fh:
    pickle.dump(selector, fh, protocol=3)

# Reduce both splits to the selected feature columns.
X_train_selected = selector.transform(X_train_tot)
X_cv_selected = selector.transform(X_cv_tot)

print("X_train_selected shape: ", X_train_selected.shape)
print("X_cv_selected shape: ", X_cv_selected.shape)

X_train_selected shape:  (2356, 2535)
X_cv_selected shape:  (590, 2535)


## Set train, test, cv data (choose one method)

### Adding additional features

In [11]:
import scipy as sp

def concat_3features(df):
    """Stack the three additional feature columns into one 2-D array.

    Parameters
    ----------
    df : object exposing ``author``, ``caprate_title`` and ``exagg_puct_title``
        as 1-D sequences of equal length (e.g. a pandas DataFrame).

    Returns
    -------
    numpy.ndarray of shape (n_rows, 3), columns in the order
    author, caprate_title, exagg_puct_title.
    """
    # column_stack turns each 1-D sequence into a column without a Python-level
    # loop, and returns shape (0, 3) for an empty frame instead of raising.
    return np.column_stack((df.author, df.caprate_title, df.exagg_puct_title))

# add additional feature names
feat_names_add = ['author', 'title_capitalization', 'exclamation_in_title']

### use selected and additional features

In [12]:
# `import scipy as sp` alone does not guarantee on every SciPy version that the
# `sparse` submodule is loaded; import it explicitly rather than relying on
# another library having imported it first.
import scipy.sparse

# Append the three additional feature columns to the selected TF-IDF columns.
X_train_sel_add = sp.sparse.hstack((X_train_selected, concat_3features(af_train)))
X_cv_sel_add = sp.sparse.hstack((X_cv_selected, concat_3features(af_cv)))

print("X_train_sel_add shape: ", X_train_sel_add.shape)
print("X_cv_sel_add shape: ", X_cv_sel_add.shape)

# Generic names consumed by the model cells below.
X_train_dtm = X_train_sel_add
X_cv_dtm = X_cv_sel_add

X_train_sel_add shape:  (2356, 2538)
X_cv_sel_add shape:  (590, 2538)


## Test different machine learning classifier models

In [13]:
# validation test & Learning curve

def valid_test(model, param, param_candidates, X=None, y=None):
    """Return the candidate value with the best mean cross-validated score.

    Parameters
    ----------
    model : estimator passed to ``validation_curve``.
    param : str, name of the hyper-parameter to vary.
    param_candidates : indexable sequence of values to try for ``param``.
    X, y : optional data to evaluate on. When omitted, the notebook-level
        ``X_cv_dtm`` / ``Y_cv`` are used, as in the original implementation.

    Returns
    -------
    The element of ``param_candidates`` whose validation score, averaged over
    the cross-validation folds, is highest (first one on ties).
    """
    if X is None:
        X = X_cv_dtm
    if y is None:
        y = Y_cv
    # param_name / param_range are keyword-only in current scikit-learn;
    # passing them by keyword works on older releases as well.
    _, valid_scores = validation_curve(
        model, X, y, param_name=param, param_range=param_candidates
    )
    # One row per candidate, one column per fold.
    avg_vs = valid_scores.mean(axis=1)
    return param_candidates[np.argmax(avg_vs)]

### 1. Logistic Regression

In [14]:
# Pick the regularization strength C with the best mean validation score among
# 10 log-spaced candidates between 1e2 and 1e4, then fit on the training split.
best_C = valid_test(LogisticRegression(), "C", np.logspace(2, 4, 10))
LR = LogisticRegression(C = best_C)
# Alternative without tuning: LR = LogisticRegression()
LR.fit(X_train_dtm, Y_train)
# Y_test_pred_LR = LR.predict(X_test_dtm)

# Persist the fitted model; the context manager closes the file, which the
# original bare open() call never did.
with open('LR.sav', 'wb') as fh:
    pickle.dump(LR, fh, protocol=3)