## Chapter 10 (3)

### Sentiment Analysis Model

### TextBlob package

In [1]:
from textblob import TextBlob

In [2]:
text1 = "Bayer (OTCPK:BAYRY) started the week up 3.5% to €74/share in Frankfurt, touching their \
highest level in 14 months, after the U.S. government said a $25M glyphosate decision against the \
company should be reversed."

In [3]:
TextBlob(text1).sentiment.polarity

0.5

In [4]:
TextBlob(text1).sentiment_assessments

Sentiment(polarity=0.5, subjectivity=0.5, assessments=[(['touching'], 0.5, 0.5, None)])

### Exercise

#### Sample data

In [5]:
import zipfile
import json

z = zipfile.ZipFile("Raw Headline Data.zip", "r")
testFile=z.namelist()[10]
fileData= z.open(testFile).read()
fileDataSample = json.loads(fileData)['content'][1:500]
fileDataSample

'li class="n-box-item date-title" data-end="1305172799" data-start="1305086400" data-txt="Tuesday, December 17, 2019">Wednesday, May 11, 2011</li><li class="n-box-item sa-box-item" data-id="76179" data-ts="1305149244"><div class="media media-overflow-fix"><div class="media-left"><a class="box-ticker" href="/symbol/CSCO" target="_blank">CSCO</a></div><div class="media-body"><h4 class="media-heading"><a href="/news/76179" sasource="on_the_move_news_fidelity" target="_blank">Cisco (NASDAQ:CSCO): Pr'

In [6]:
from lxml import etree
from io import StringIO

def jsonParser(json_data):
    xml_data = json_data['content']

    tree = etree.parse(StringIO(xml_data), parser=etree.HTMLParser())

    headlines = tree.xpath("//h4[contains(@class, 'media-heading')]/a/text()")
    assert len(headlines) == json_data['count']

    main_tickers = list(map(lambda x: x.replace('/symbol/', ''), tree.xpath("//div[contains(@class, 'media-left')]//a/@href")))
    assert len(main_tickers) == json_data['count']

    final_headlines = [''.join(f.xpath('.//text()')) for f in tree.xpath("//div[contains(@class, 'media-body')]/ul/li[1]")]
    if len(final_headlines) == 0:
        final_headlines = [''.join(f.xpath('.//text()')) for f in tree.xpath("//div[contains(@class, 'media-body')]")]
        final_headlines = [f.replace(h, '').split('\xa0')[0].strip() for f,h in zip (final_headlines, headlines)]

    return main_tickers, final_headlines

In [7]:
jsonParser(json.loads(fileData))[1][1]

'Cisco Systems (NASDAQ:CSCO) falls further into the red on FQ4 guidance of $0.37-0.39 vs. $0.42 Street consensus. Sales seen flat to +2% vs. 8% Street view. CSCO recently -2.1%.'

In [8]:
from textblob import TextBlob

print('Sentiment score: ', TextBlob(jsonParser(json.loads(fileData))[1][1]).sentiment.polarity)
print('Sentiment assessments: ', TextBlob(jsonParser(json.loads(fileData))[1][1]).sentiment_assessments)

Sentiment score:  -0.00625
Sentiment assessments:  Sentiment(polarity=-0.00625, subjectivity=0.21875, assessments=[(['further'], 0.0, 0.5, None), (['red'], 0.0, 0.0, None), (['flat'], -0.025, 0.125, None), (['recently'], 0.0, 0.25, None)])


#### Full data

In [9]:
import pandas as pd
import datetime
from datetime import date

data_df_news = pd.DataFrame()
with zipfile.ZipFile("Raw Headline Data.zip", "r") as z:
    for filename in z.namelist():
        try:
            with z.open(filename) as f:
                data = f.read()
                json_data = json.loads(data)
            if json_data.get('count', 0)> 10:
                #Step 1: Parse the News Jsons
                main_tickers, final_headlines = jsonParser(json_data)
                if len(final_headlines) != json_data['count']:
                    continue

                #Step 2: Prepare Future and Event Return and assign Future and Event return for each ticker.
                file_date = filename.split('/')[-1].replace('.json', '')
                file_date = date(int(file_date[:4]), int(file_date[5:7]), int(file_date[8:]))

                #Step 3: Merge all the data in a data frame
                df_dict = {'ticker': main_tickers,
                           'headline': final_headlines,
                           'date': [file_date] * len(main_tickers)
                           }
                df_f = pd.DataFrame(df_dict)
                data_df_news = data_df_news.append(df_f)
        except:
            pass

In [10]:
len(data_df_news)

0

In [14]:
data_df_news.head()

In [12]:
data_df_news[data_df_news.ticker=='CSCO'].head()

AttributeError: 'DataFrame' object has no attribute 'ticker'

In [13]:
data_df_news[data_df_news.ticker=='CSCO'].iloc[1]['headline']

AttributeError: 'DataFrame' object has no attribute 'ticker'

In [None]:
TextBlob(data_df_news[data_df_news.ticker=='CSCO'].iloc[1]['headline']).sentiment.polarity

In [None]:
# Apply to all headlines
data_df_news['sentiment_score'] = data_df_news['headline'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
data_df_news.head()

In [None]:
data_df_news[data_df_news.ticker=='CSCO'].head()

### Q1. How many headlines have a sentiment score of 1?


In [None]:
len(data_df_news)

In [None]:
data_df_news.head()

### Q2. Apply sentiment_assessments of TextBlob to one of the headlines with a sentiment score 1.

### Q3. How many headlines have a sentiment score of -1?


### Q4. Apply sentiment_assessments of TextBlob to one of the headlines with a sentiment score -1.
