In [1]:
#Read in data
import os
import pandas as pd

# Load every scraped file from the data directory into one DataFrame.
# The per-file frames are gathered in a list and concatenated once:
# growing a frame with DataFrame.append inside the loop re-copies all rows
# on each iteration, and that method no longer exists in pandas 2.x.
DATA_DIR = '../scrape/data/'

frames = []
# sorted() pins the row order; os.listdir returns names in arbitrary order,
# which would otherwise change the CV folds and SGD result between machines.
for f in sorted(os.listdir(DATA_DIR)):
    frames.append(pd.read_csv(os.path.join(DATA_DIR, f)))

# pd.concat raises on an empty list, so keep the original behaviour of
# ending up with an empty frame when the directory holds no files.
# Per-file row indices are kept (no ignore_index), as before.
articles = pd.concat(frames) if frames else pd.DataFrame()

articles.head()

Unnamed: 0.1,Unnamed: 0,authors,bias,keywords,source,text,title
0,0,['Lauren Gambino'],Bias: Lean Left,"['healthcare', 'replacement', 'congress', 'mea...",The Guardian,Republicans ready to dismantle Obamacare amid ...,Congress approves initial measures to repeal A...
1,1,"['Tom Lobianco', 'Deirdre Walsh', 'Tal Kopan']",Bias: Center,"['repealing', 'cut', 'replacement', 'republica...",CNN,Washington (CNN) The House of Representatives ...,House takes first step towards repealing Obama...
2,2,"['The Washington Times Http', 'Tom Howell Jr.']",Bias: Lean Right,"['way', 'house', 'health', 'care', 'budget', '...",Washington Times,The House passed a 2017 budget Friday that lay...,House passes budget to begin Obamacare repeal
3,3,"[""Cortney O'Brien"", 'Christine Rousselle', 'Ju...",Bias: Right,"['agency', 'department', 'doj', 'chicago', 'pr...",Townhall,Garry McCarthy was the superintendent of the C...,Former Chicago Police Superintendent: The DOJ ...
4,4,['Mark Berman Covers National News For The Was...,Bias: Lean Left,"['city', 'scathing', 'force', 'pattern', 'just...",Washington Post,"During a press conference on Jan. 13, Attorney...",Chicago police officers have pattern of using ...


In [2]:
# Position of each bias label on a scale from 0 (left) to 1 (right).
# 'Center' and 'Mixed' share the midpoint value .5.
BIAS_NAME_TO_SCORE = {
    'Bias: Left': 0,
    'Bias: Lean Left': .25,
    'Bias: Center': .5,
    'Bias: Mixed': .5,
    'Bias: Lean Right': .75,
    'Bias: Right': 1
}

# Discard unrated articles. .copy() makes the filtered result an independent
# frame, so the column assignment below writes to it directly instead of to
# a slice of the previous frame (which can raise SettingWithCopyWarning).
articles = articles[articles['bias'] != 'Bias: Not rated'].copy()
# Any label that is not a key of BIAS_NAME_TO_SCORE maps to NaN here.
articles['bias_score'] = articles["bias"].map(BIAS_NAME_TO_SCORE)

In [3]:
# Turn the score into a binary label: drop rows sitting exactly at the
# midpoint (.5), then collapse everything right of it to 1 and everything
# left of it to 0.
# .copy() detaches the filtered frame so the two in-place .loc writes below
# operate on an independent object rather than on a slice of the old frame.
# NOTE(review): rows whose score is NaN pass the `!= .5` filter and are left
# as NaN by both assignments - confirm no unmapped labels reach this cell.
articles = articles.loc[articles['bias_score'] != .5].copy()
articles.loc[articles['bias_score'] > .5, 'bias_score'] = 1
articles.loc[articles['bias_score'] < .5 , 'bias_score'] = 0

In [4]:
# Class balance check: number of articles per binary bias label.
articles.loc[:, 'bias_score'].value_counts()

0.0    338
1.0    320
Name: bias_score, dtype: int64

In [5]:
# Preview the first five rows, now including the bias_score column.
articles.head()

Unnamed: 0.1,Unnamed: 0,authors,bias,keywords,source,text,title,bias_score
0,0,['Lauren Gambino'],Bias: Lean Left,"['healthcare', 'replacement', 'congress', 'mea...",The Guardian,Republicans ready to dismantle Obamacare amid ...,Congress approves initial measures to repeal A...,0.0
2,2,"['The Washington Times Http', 'Tom Howell Jr.']",Bias: Lean Right,"['way', 'house', 'health', 'care', 'budget', '...",Washington Times,The House passed a 2017 budget Friday that lay...,House passes budget to begin Obamacare repeal,1.0
3,3,"[""Cortney O'Brien"", 'Christine Rousselle', 'Ju...",Bias: Right,"['agency', 'department', 'doj', 'chicago', 'pr...",Townhall,Garry McCarthy was the superintendent of the C...,Former Chicago Police Superintendent: The DOJ ...,1.0
4,4,['Mark Berman Covers National News For The Was...,Bias: Lean Left,"['city', 'scathing', 'force', 'pattern', 'just...",Washington Post,"During a press conference on Jan. 13, Attorney...",Chicago police officers have pattern of using ...,0.0
6,6,[],Bias: Right,"['relationship', 'think', 'ill', 'relations', ...",CBN,President-elect Donald Trump has responded to ...,Tough Times ahead for White House-Media Relati...,1.0


In [6]:
# Keep only the articles that actually have body text.
articles = articles.dropna(subset=['text'])

In [7]:
from nltk.tokenize import RegexpTokenizer

# Built once and reused for every article instead of being re-created on
# each call; it extracts runs of word characters, which discards
# punctuation and whitespace.
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def clean_text(text):
    """Normalise an article body for vectorisation.

    The text is lower-cased, split into word-character tokens, and the
    tokens are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Raw article text. Must be a string; null values are expected to
        have been filtered out before this is applied.

    Returns
    -------
    str
        Lower-cased, punctuation-free text with single-space separators.
    """
    # text.lower() instead of the unbound str.lower(text): identical for str,
    # and it also accepts unicode objects on Python 2.
    return ' '.join(_WORD_TOKENIZER.tokenize(text.lower()))


articles['text'] = articles['text'].apply(clean_text)

In [8]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

from sklearn.grid_search import GridSearchCV

import numpy as np

# Echo the module docstring; under IPython this prints the auto-generated
# description of the interactive module.
print(__doc__)


# Configure the root logger to emit records of level INFO and above,
# each prefixed with a timestamp and the level name. No stream is passed,
# so the handler uses logging's default stream.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)


Automatically created module for IPython interactive environment


In [9]:
# Hyper-parameter grid explored by the search. Only the n-gram range is
# active; the idf and alpha entries are kept as disabled candidates.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    # 'tfidf__use_idf': (True, False),
    # 'clf__alpha': (1e-2, 1e-3),
}

# Token counts -> tf-idf weighting -> linear classifier trained with SGD.
# The step names are the prefixes used by the keys of `parameters`.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        n_iter=5,
        random_state=42,
    )),
])

# n_jobs=-1 asks for all available cores; verbose=1 prints fit progress.
grid_search = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

# Describe the search that is about to run.
print("Performing grid search...")
print("pipeline:", [step_name for step_name, _ in text_clf.steps])
print("parameters:")
pprint(parameters)

# Fit on the cleaned text against the binary bias label, timing the
# whole cross-validated search.
t0 = time()
grid_search.fit(articles['text'], articles['bias_score'])
print("done in %0.3fs" % (time() - t0))
print()

# Report the best cross-validation score and, for each tuned parameter,
# the value held by the winning pipeline.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__ngram_range': [(1, 1), (1, 2)]}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Done   7 out of   6 | elapsed:    8.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   6 | elapsed:   11.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   6 | elapsed:   16.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   6 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of   6 | elapsed:   17.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   17.0s finished


done in 20.644s

Best score: 0.721
Best parameters set:
	vect__ngram_range: (1, 2)


In [10]:
# Persist the fitted GridSearchCV object to 'text_clf.pkl' with joblib.
# The copy of joblib bundled inside scikit-learn was removed in
# scikit-learn 0.23, so fall back to the standalone package when the
# bundled import is unavailable. Trying the bundled one first keeps the
# behaviour unchanged on the older releases this notebook was written for.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
joblib.dump(grid_search, 'text_clf.pkl')

['text_clf.pkl',
 'text_clf.pkl_01.npy',
 'text_clf.pkl_02.npy',
 'text_clf.pkl_03.npy',
 'text_clf.pkl_04.npy',
 'text_clf.pkl_05.npy',
 'text_clf.pkl_06.npy',
 'text_clf.pkl_07.npy',
 'text_clf.pkl_08.npy']

In [135]:
# Second column of the predict_proba output for every sample, as a tuple
# (presumably the probability of the label-1 class; verify via classes_).
# NOTE(review): `y_train` is not assigned in any cell visible here and this
# cell's execution count (135) is out of sequence - confirm what it holds
# and that the cell still works after Restart & Run All.
# NOTE(review): the pipeline defined above uses loss='hinge', for which
# SGDClassifier does not appear to offer predict_proba - confirm which
# fitted model produced the probabilities shown below.
tuple(grid_search.predict_proba(y_train)[:, 1])

(0.16666029420499082,
 0.45001012670211382,
 0.32954048979897832,
 0.26576671160649273,
 0.26347487636080386,
 0.35289495418404238,
 0.52898080544576476,
 0.54356627777610544,
 0.48478954690145343,
 0.30264724331482984,
 0.30119403583039195,
 0.23645595802604766,
 0.16546944976328454,
 0.25135593441902054,
 0.36785578186346962,
 0.50726598483715191,
 0.17970131404671297,
 0.36949947670914873,
 0.50532084741285288,
 0.64102825379701112)