## Financial and technology articles taken from [webhose.io](https://webhose.io/datasets)

In [1]:
import pandas as pd
import json,urllib3
import glob
%matplotlib inline

## Take a look at one JSON file

In [2]:
with open('data/news_0000001.json','r', encoding='utf-8') as inFile:
    d=json.loads(inFile.read())

In [3]:
print(d.keys())

dict_keys(['organizations', 'uuid', 'thread', 'author', 'url', 'ord_in_thread', 'title', 'locations', 'entities', 'highlightText', 'language', 'persons', 'text', 'external_links', 'published', 'crawled', 'highlightTitle'])


In [4]:
print(d['text'])

Photo: Andrew Harnik, AP Image 1 of / 3 Caption
Close
Image 1 of 3 Pope Francis waves from a Fiat 500 as his motorcade departs from Andrews Air Force Base, Md., Tuesday, Sept. 22, 2015, where President and Mrs. Obama welcomed him.
Photo: Andrew Harnik, AP Pope Francis waves from a Fiat 500 as his motorcade departs from... Image 2 of 3 Pope Francis arrives at the Apostolic Nunciature, the Vatican's diplomatic mission in the heart of Washington, Tuesday, Sept. 22, 2015, in a Fiat 500. Pope Francis will visit the White House on Wednesday, becoming only the third pope to visit the White House.
Photo: Cliff Owen, AP Pope Francis arrives at the Apostolic Nunciature, the Vatican's... Image 3 of 3 Pope Francis waves from a Fiat 500L as his motorcade departs Andrews Air Force Base, Md., Tuesday, Sept. 22, 2015, after being greeted by President Barack Obama and first lady Michelle Obama.
Photo: Andrew Harnik, AP Pope Francis waves from a Fiat 500L as his motorcade departs... Who needs a limo? Po

## Define a function to open a file and get the text

In [31]:
def getText(f):
    with open(f,'r', encoding='utf-8') as inFile:
        d=json.loads(inFile.read())
    return str(d['text'])

In [32]:
%time financeTexts=map(getText,glob.glob('../f/data/news*json'))

Wall time: 56 ms


In [33]:
print(financeTexts)


<map object at 0x0000022678E1E240>


In [35]:
#(techTexts)

## Combine tech and financial news into one dataframe

In [36]:
df=pd.DataFrame(data={'text':list(financeTexts),'category':'finance'})
df.head()

Unnamed: 0,text,category
0,15 August 2015 18:06 CET Defaulters owe SSNI...,finance
1,(Repeats story that ran on Sunday with no chan...,finance
2,"(Adds comments, updates bond prices and result...",finance
3,BERLIN Aug 10 A swift conclusion to negotiatio...,finance
4,KUALA LUMPUR Aug 10 Malaysia said on Monday it...,finance


In [38]:
%time techTexts=map(getText,glob.glob('../tech_news/data/news*json'))


Wall time: 103 ms


In [41]:
df=df.append(pd.DataFrame(data={'text':list(techTexts),'category':'tech'}))


Unnamed: 0,text,category
0,15 August 2015 18:06 CET Defaulters owe SSNI...,finance
1,(Repeats story that ran on Sunday with no chan...,finance
2,"(Adds comments, updates bond prices and result...",finance
3,BERLIN Aug 10 A swift conclusion to negotiatio...,finance
4,KUALA LUMPUR Aug 10 Malaysia said on Monday it...,finance


In [42]:
df.tail()
df.shape

(34270, 2)

In [43]:
df.head()

Unnamed: 0,text,category
0,15 August 2015 18:06 CET Defaulters owe SSNI...,finance
1,(Repeats story that ran on Sunday with no chan...,finance
2,"(Adds comments, updates bond prices and result...",finance
3,BERLIN Aug 10 A swift conclusion to negotiatio...,finance
4,KUALA LUMPUR Aug 10 Malaysia said on Monday it...,finance


## Build up a pipeline

In [44]:
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn import preprocessing

## Binarise the category labels

In [45]:
lb = preprocessing.LabelBinarizer()

In [46]:
lb.fit(df['category'])
df['category_bin']=lb.transform(df['category'])

## Test Naive Bayes Classifier fr our baseline

In [47]:
steps=[('vectorise',CountVectorizer()),\
       ('transform',TfidfTransformer()),\
       ('clf',MultinomialNB())]
# Our pipeline has three steps

In [48]:
pipe=Pipeline(steps)

In [49]:
X_train, X_test, y_train, y_test=\
train_test_split(df['text'],df['category_bin'],test_size=0.25)

In [50]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vectorise', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        ...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
pred=pipe.predict(X_test)

In [53]:
print ('Accuracy = %.3f' % f1_score(y_test,pred))

Accuracy = 0.897


## Write out model

In [54]:
import pickle
with open('model.out','wb') as outFile:
    pickle.dump(pipe,outFile)

TypeError: write() argument must be str, not bytes

In [55]:
lb

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

## Video 4.3

## Grid Search

In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [49]:
pipe.named_steps.keys()

dict_keys(['vectorise', 'transform', 'clf'])

In [50]:
param_grid = dict(vectorise__stop_words=[None,'english'],\
                  vectorise__binary=[True,False],\
                  #vectorise__min_df=[1,5,10],\
                  #clf__class_weight=[None,'balanced'],\
                  #transform__norm=['l1','l2']
                 )

In [51]:
grid_search = GridSearchCV(pipe, param_grid=param_grid,\
                           scoring=make_scorer(f1_score),n_jobs=2)
# With n_jobs=1, takes 10.33
# With n_jobs=-1 takes YYYs

In [52]:
%time res=grid_search.fit(df['text'],df['category_bin'])



Wall time: 1min 34s


  self.class_log_prior_ = (np.log(self.class_count_) -


In [53]:
res.best_params_

{'vectorise__binary': True, 'vectorise__stop_words': None}

In [56]:
print('Best score = %.3f' % res.best_score_)

Best score = 0.000


## Compare classifiers

In [57]:
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [60]:
for clf in [SGDClassifier(),LogisticRegression(),RandomForestClassifier()]:
    print (clf.__class__)
    steps=[('vectorise',CountVectorizer()),('transform',TfidfTransformer()),\
           ('clf',clf)]
    pipe=Pipeline(steps)
    pipe.set_params(vectorise__decode_error='ignore')
    
    grid_search = GridSearchCV(pipe, param_grid=param_grid,n_jobs=-1,\
                           scoring=make_scorer(f1_score))

    res=grid_search.fit(df['text'],df['category_bin'])
    
    print ('Best score = %.3f' % res.best_score_)
    print (res.best_params_)
    print ('')

<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>




ValueError: The number of classes has to be greater than one; got 1 class