In [32]:
import warnings
warnings.filterwarnings('ignore')

In [53]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from textblob import TextBlob

In [67]:
df = pd.read_csv("finalInput.csv",encoding="ISO-8859-1")

NewsTextBlob_senti = []
for i in df['NewsContent']:
    sentiment = TextBlob(str(i))
    NewsTextBlob_senti.append(sentiment.sentiment.polarity)
df['NewsTextBlob_senti'] = NewsTextBlob_senti

df.head()

Unnamed: 0.1,Unnamed: 0,NewsContent,author,publishedDate,title,AAPL_company,IBM_company,GOOGL_company,AMZN_company,MSFT_company,AAPL_senti,IBM_senti,GOOGL_senti,AMZN_senti,MSFT_senti,NewsTextBlob_senti
0,0,"MILAN/ROME, March 27 (Reuters) - Italian stat...",,"March 27, 2019, 01:22:00 PM EDT",Italian state lender seeks to broker truce bet...,0,0,0,0,0,0,0,0,0,0,0.0
1,1,April 5 (Reuters) - Representatives of Venezu...,,"April 05, 2019, 06:23:00 PM EDT",Lawyers for Venezuela's Guaido ask U.S. court ...,0,0,0,0,0,0,0,0,0,0,-0.026667
2,2,By Tatiana Bautzer and Gram Slattery SAO PAUL...,,"April 05, 2019, 05:49:00 PM EDT",Petrobras agrees to sell pipeline unit to Engi...,0,0,0,0,0,0,0,0,0,0,-0.010278
3,3,Top Tech Stocks MSFT -1.59% AAPL +0.61% IBM -0...,MT Newswires,"March 27, 2019, 01:18:46 PM EDT","Technology Sector Update for 03/27/2019: NTWK,...",1,0,1,0,1,1,0,-1,0,-1,-0.018604
4,4,By Tatiana Bautzer and Gram Slattery SAO PAUL...,,"April 05, 2019, 05:49:00 PM EDT",Engie bids $8.6 bln for Petrobras pipeline uni...,0,0,0,0,0,0,0,0,0,0,-0.010278


In [16]:
df_news = df.drop(['Unnamed: 0','author','publishedDate','title'], axis = 1)

In [69]:
categories = ['AAPL_company','IBM_company','GOOGL_company','AMZN_company','MSFT_company','AAPL_senti','IBM_senti','GOOGL_senti','AMZN_senti','MSFT_senti']
train,test = train_test_split(df_news,random_state=42,test_size=0.2,shuffle=True)
x_train = train.NewsContent
x_test = test.NewsContent


pandas.core.frame.DataFrame

In [51]:
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

parametersNB = {'clf__estimator__alpha':np.linspace(0.5, 1.5, 6),
              #'clf__estimator__prior':[True,False]
               }

cvNB = GridSearchCV(NB_pipeline,param_grid = parametersNB)

for category in categories:
    print('... Processing {}'.format(category))
    cvNB.fit(x_train,train[category])
    prediction_NB = cvNB.predict(x_test)
    print("Tuned Best Score: ", cvNB.best_score_)
    print("Tuned Best Params: {}".format(cvNB.best_params_))
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction_NB)))
    print(classification_report(test[category],prediction))
    

... Processing AAPL_company
Tuned Best Score:  0.9078947368421053
Tuned Best Params: {'clf__estimator__alpha': 0.5}
Test accuracy is 0.8596491228070176
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.73      0.94      0.82        32
           1       0.83      0.40      0.54        25

   micro avg       0.70      0.70      0.70        57
   macro avg       0.52      0.45      0.45        57
weighted avg       0.78      0.70      0.70        57

... Processing IBM_company
Tuned Best Score:  0.9736842105263158
Tuned Best Params: {'clf__estimator__alpha': 0.5}
Test accuracy is 0.9824561403508771
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.95      0.95      0.95        41
           1       0.83      0.62      0.71        16

   micro avg       0.86      0.86      0.86        57
   macro avg       0.59      0.53      0.56 

In [49]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(svm.SVC(), n_jobs=1)),
            ])
parameters = {'clf__estimator__kernel':('linear','rbf','sigmoid','poly'),
              'clf__estimator__C':[1,50]}

cv = GridSearchCV(SVC_pipeline,param_grid = parameters)

for category in categories:
    print('... Processing {}'.format(category))
    cv.fit(x_train,train[category])
    prediction = cv.predict(x_test)
    print("Tuned Best Score: ", cv.best_score_)
    print("Tuned Best Params: {}".format(cv.best_params_))
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print(classification_report(test[category],prediction))

... Processing AAPL_company
Tuned Best Score:  0.9385964912280702
Tuned Best Params: {'clf__estimator__C': 50, 'clf__estimator__kernel': 'linear'}
Test accuracy is 0.9473684210526315
              precision    recall  f1-score   support

           0       0.94      0.97      0.95        32
           1       0.96      0.92      0.94        25

   micro avg       0.95      0.95      0.95        57
   macro avg       0.95      0.94      0.95        57
weighted avg       0.95      0.95      0.95        57

... Processing IBM_company
Tuned Best Score:  0.9780701754385965
Tuned Best Params: {'clf__estimator__C': 50, 'clf__estimator__kernel': 'linear'}
Test accuracy is 0.9824561403508771
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        41
           1       1.00      0.94      0.97        16

   micro avg       0.98      0.98      0.98        57
   macro avg       0.99      0.97      0.98        57
weighted avg       0.98      0.98   

In [92]:
# df = pd.read_csv("new20190421_predict.csv",encoding="ISO-8859-1")
predict_result = []
for category in categories:
    print('... Processing {}'.format(category))
    cv.fit(x_train,train[category])
    predicationTest = cv.predict(df.NewsContent)
    predict_result.append(predicationTest)
    #print('Test result is {}'.format(predicationTest))

... Processing AAPL_company
... Processing IBM_company
... Processing GOOGL_company
... Processing AMZN_company
... Processing MSFT_company
... Processing AAPL_senti
... Processing IBM_senti
... Processing GOOGL_senti
... Processing AMZN_senti
... Processing MSFT_senti


In [91]:
predict = pd.DataFrame(predict_result)
predict = predict.T