In [0]:
import requests
import pandas as pd
import json
pd.set_option('display.max_colwidth', 200)

## Get Stories from Pocket

In [0]:
# Form fields for the Pocket request-token call.
auth_params = dict(
    consumer_key='CONSUMER_KEY',
    redirect_uri='https://twitter.com/acombs',
)

In [0]:
# POST the auth parameters to Pocket's oauth/request endpoint.
tkn = requests.post(
    'https://getpocket.com/v3/oauth/request',
    data=auth_params,
)

In [0]:
tkn.content

In [0]:
# Form fields for the Pocket authorize call ('code' is a placeholder here).
usr_params = dict(
    consumer_key='CONSUMER_KEY',
    code='CODE',
)

In [0]:
# POST the consumer key and code to Pocket's oauth/authorize endpoint.
usr = requests.post(
    'https://getpocket.com/v3/oauth/authorize',
    data=usr_params,
)

In [0]:
usr.content

### Get 'no' stories

In [0]:
# Query for items tagged 'n' (stories that are not wanted).
no_params = dict(
    consumer_key='CONSUMER_KEY',
    access_token='SOME_SUPER_LONG_TOKEN',
    tag='n',
)

In [0]:
# Retrieve the 'n'-tagged items from Pocket's get endpoint.
no_result = requests.post(
    'https://getpocket.com/v3/get',
    data=no_params,
)

In [0]:
no_result.text

In [0]:
# Parse the response body text as JSON.
no_jf = json.loads(no_result.text)

In [0]:
# Keep only the 'list' entry; the cells below iterate over its values as item records.
no_jd = no_jf['list']

In [0]:
no_jd

In [0]:
# One entry per item; items lacking 'resolved_url' contribute None.
no_urls = [item.get('resolved_url') for item in no_jd.values()]

In [0]:
no_urls

In [0]:
len(no_urls)

In [0]:
# Single-column frame holding the 'n' story URLs.
no_uf = pd.DataFrame(data=no_urls, columns=['urls'])

In [0]:
no_uf

In [0]:
# Label every row of this frame as not wanted.
no_uf = no_uf.assign(wanted='n')

In [0]:
no_uf

### Get 'yes' stories

In [0]:
# Query for archived items tagged 'y' (stories that are wanted).
# Bound to `yes_params` because that is the name the request cell passes to
# requests.post; the previous name `ye_params` left that cell with a NameError.
yes_params = {'consumer_key': 'CONSUMER_KEY',
'access_token': 'SOME_SUPER_LONG_KEY',
'tag': 'y',
'state': 'archive'}

In [0]:
# Retrieve the 'y'-tagged items from Pocket's get endpoint.
yes_result = requests.post(
    'https://getpocket.com/v3/get',
    data=yes_params,
)

In [0]:
yes_result.text

In [0]:
# Parse the response body text as JSON.
yes_jf = json.loads(yes_result.text)

In [0]:
# Keep only the 'list' entry; the cells below iterate over its values as item records.
yes_jd = yes_jf['list']

In [0]:
yes_jf

In [0]:
# One entry per item; items lacking 'resolved_url' contribute None.
yes_urls = [item.get('resolved_url') for item in yes_jd.values()]

In [0]:
len(yes_urls)

In [0]:
yes_urls

In [0]:
# Single-column frame holding the 'y' story URLs.
yes_uf = pd.DataFrame(data=yes_urls, columns=['urls'])

In [0]:
yes_uf

In [0]:
# Label every row of this frame as wanted.
yes_uf = yes_uf.assign(wanted='y')

In [0]:
yes_uf

In [0]:
# Stack the 'y' and 'n' frames into one training frame.
# ignore_index=True gives every row a unique label; without it both inputs
# contribute labels starting at 0, and the later label-aligned column
# assignments (df.loc[:, ...] = Series) would run on a duplicated index.
df = pd.concat([yes_uf, no_uf], ignore_index=True)

In [0]:
# Drop rows with a missing URL. `inplace` must be a real bool: pandas validates
# the argument and raises ValueError for the integer 1.
df.dropna(inplace=True)

In [0]:
df

## Download Articles to Run Through Model

In [0]:
import urllib

In [0]:
def get_html(x, key='SOME_KEY'):
    """Fetch the extracted article content for a URL from the Embedly extract API.

    Parameters
    ----------
    x : str
        Article URL to send as the ``url`` query parameter.
    key : str, optional
        Embedly API key; defaults to the placeholder this notebook uses.

    Returns
    -------
    str or None
        The ``content`` field of the JSON response, or ``None`` when the
        response is not a JSON object or has no such field.
    """
    # Let requests build and percent-encode the query string. This avoids
    # relying on `urllib.parse` having been loaded as a side effect of a bare
    # `import urllib`, and also escapes '/' in the target URL.
    rhtml = requests.get('https://api.embedly.com/1/extract',
                         params={'url': x, 'key': key})
    try:
        payload = json.loads(rhtml.text)
    except ValueError:
        # A non-JSON body (e.g. an HTML error page) should not abort the whole
        # .map() over the URL column; None is dropped by the later dropna().
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get('content')

In [0]:
# Call get_html once per URL (one HTTP request each) and store the result in a new 'html' column.
df.loc[:,'html'] = df['urls'].map(get_html)

In [0]:
# Drop rows for which no article content came back. `inplace` must be a real
# bool: pandas validates the argument and raises ValueError for the integer 1.
df.dropna(inplace=True)

In [0]:
df

### Extract the text

In [0]:
from bs4 import BeautifulSoup

In [0]:
def get_text(x):
    """Return the text of an HTML string, parsed by BeautifulSoup with the 'lxml' parser."""
    return BeautifulSoup(x, 'lxml').get_text()

In [0]:
# Strip the markup from each article's HTML and store the plain text in a new 'text' column.
df.loc[:,'text'] = df['html'].map(get_text)

In [0]:
df

# Implement TF-IDF Vectorization & Fit Model

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [0]:
# TF-IDF over 1- to 3-grams with the built-in English stop-word list and min_df=3.
vect = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    min_df=3,
)

In [0]:
# Fit the vectorizer on the training articles and produce their feature matrix in one step.
tv = vect.fit_transform(df['text'])

In [0]:
# Linear SVM classifier with library-default settings.
clf = LinearSVC()

In [0]:
# Train on the TF-IDF features against the 'y'/'n' labels in the 'wanted' column.
model = clf.fit(tv, df['wanted'])

## Pull New Articles from Google Drive Sheet to Evaluate

In [0]:
import gspread
from oauth2client.client import SignedJwtAssertionCredentials

In [0]:
# Load the service-account key file; the context manager closes the handle,
# which a bare json.load(open(...)) never did.
with open(r'/Users/alexcombs/Downloads/API_KEY.json') as key_file:
    json_key = json.load(key_file)
scope = ['https://spreadsheets.google.com/feeds']
# NOTE(review): SignedJwtAssertionCredentials appears to be unavailable in
# newer oauth2client releases — confirm the installed version still provides it.
credentials = SignedJwtAssertionCredentials(json_key['client_email'], json_key['private_key'].encode(), scope)
gc = gspread.authorize(credentials)

In [0]:
# Open the "NewStories" spreadsheet by title. The sheet must be shared with the
# `client_email` address from the JSON API key file, otherwise it cannot be opened.
ws = gc.open("NewStories")

In [0]:
# Work with the `sheet1` worksheet of the opened spreadsheet.
sh = ws.sheet1

In [0]:
# Read columns 2, 3 and 4 and pair them up row by row; zip stops at the
# shortest column.
zd = list(zip(*(sh.col_values(col) for col in (2, 3, 4))))

In [0]:
# Frame of new stories; the three sheet columns are named title, urls and html.
zf = pd.DataFrame(data=zd, columns=['title', 'urls', 'html'])

In [0]:
# Treat empty cells as missing, then drop every row with a missing value.
# float('nan') is the same value numpy exposes as `nan`; the `pd.np` alias used
# before is no longer available in current pandas releases.
zf.replace('', float('nan'), inplace=True)
zf.dropna(inplace=True)

In [0]:
zf

In [0]:
# Strip the markup from each new story's HTML and store the plain text in a new 'text' column.
zf.loc[:,'text'] = zf['html'].map(get_text)

In [0]:
# Renumber rows 0..n-1 (discarding the old labels) so they line up with the
# freshly built predictions frame when the two are merged on index.
zf.reset_index(drop=True, inplace=True)

In [0]:
# Project the new stories with the already-fitted vectorizer (transform only, no refit).
test_matrix = vect.transform(zf['text'])

In [0]:
test_matrix

In [0]:
# Predict a label for each new story and hold the predictions in a one-column frame.
results = pd.DataFrame(model.predict(test_matrix), columns=['wanted'])

In [0]:
results

In [0]:
# Attach each prediction to its story by joining the two frames on row index.
rez = results.merge(zf, left_index=True, right_index=True)

In [0]:
rez

In [0]:
# List index, label and title of every story predicted 'y' for manual review.
predicted_yes = rez[rez['wanted']=='y']
for i, w, t in zip(predicted_yes.index, predicted_yes['wanted'], predicted_yes['title']):
    print(i, w, t)

In [0]:
# Hand-picked row indices whose label should be changed to 'n'.
change_to_no = [
    130, 145, 148, 163, 178, 199, 219, 222, 223, 226,
    235, 279, 348, 357, 427, 440, 542, 544, 546, 568,
    614, 619, 660, 668, 679, 686, 740, 829,
]

In [0]:
# List index, label and title of every story predicted 'n' for manual review.
predicted_no = rez[rez['wanted']=='n']
for i, w, t in zip(predicted_no.index, predicted_no['wanted'], predicted_no['title']):
    print(i, w, t)

In [0]:
# Hand-picked row indices whose label should be changed to 'y'.
change_to_yes = [
    0, 9, 29, 35, 42, 71, 110, 190, 319, 335,
    344, 371, 385, 399, 408, 409, 422, 472, 520, 534,
    672,
]

In [0]:
rez

In [0]:
# Apply the manual corrections with a single label-based assignment.
# `rez.iloc[i]['wanted'] = ...` is chained indexing: it assigns into the
# temporary row object returned by `rez.iloc[i]`, so the change is not
# guaranteed to reach `rez` itself.
rez.loc[change_to_yes, 'wanted'] = 'y'

In [0]:
# Apply the manual corrections with a single label-based assignment.
# `rez.iloc[i]['wanted'] = ...` is chained indexing: it assigns into the
# temporary row object returned by `rez.iloc[i]`, so the change is not
# guaranteed to reach `rez` itself.
rez.loc[change_to_no, 'wanted'] = 'n'

In [0]:
rez

In [0]:
df

In [0]:
# Merge the original training rows with the reviewed new stories, keeping only label and text.
combined = pd.concat([df[['wanted', 'text']], rez[['wanted', 'text']]])

In [0]:
combined

In [0]:
# Refit the same vectorizer on the combined corpus, replacing its earlier fit.
# NOTE(review): the labels passed as the second argument are presumably ignored
# by TfidfVectorizer — confirm.
tvcomb = vect.fit_transform(combined['text'], combined['wanted'])

In [0]:
# Retrain the same classifier object on the combined features and labels.
model = clf.fit(tvcomb, combined['wanted'])

In [0]:
model

In [0]:
import pickle

In [0]:
# Persist the retrained classifier. The context manager flushes and closes the
# file; passing a bare open(...) to pickle.dump left the handle open.
with open(r'/Users/alexcombs/Downloads/news_model_pickle.p', 'wb') as model_file:
    pickle.dump(model, model_file)

In [0]:
# Persist the fitted vectorizer alongside the model. The context manager
# flushes and closes the file; passing a bare open(...) to pickle.dump left
# the handle open.
with open(r'/Users/alexcombs/Downloads/news_vect_pickle.p', 'wb') as vect_file:
    pickle.dump(vect, vect_file)