# Import packages

1) Install the following:
- tqdm
- spaCy
- vaderSentiment
- spacytextblob
- the small English spaCy model: `python -m spacy download en_core_web_sm`

These should all be installed with the bash script that you attached when starting the app in UCloud.

In [None]:
# Data analysis
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm

# NLP: load spaCy's small English pipeline once; the cells below reuse `nlp`
import spacy
nlp = spacy.load("en_core_web_sm")

# sentiment analysis VADER: one shared analyzer instance for all headlines
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# sentiment with spacyTextBlob
# NOTE(review): SpacyTextBlob is never referenced by name below; the import is
# presumably needed so the 'spacytextblob' factory is registered before
# add_pipe looks it up by string - confirm against the spacytextblob docs
from spacytextblob.spacytextblob import SpacyTextBlob
nlp.add_pipe('spacytextblob')

# visualisations
import matplotlib.pyplot as plt

# Load the data

We're going to use the "fake news" dataset in the shared drive today.

In [None]:
# build the path to the fake news CSV from its individual components
news_path_parts = ("..", "..", "CDS-LANG", "tabular_examples", "fake_or_real_news.csv")
filename = os.path.join(*news_path_parts)

In [None]:
# load the data: read the CSV at `filename` into a pandas DataFrame
data = pd.read_csv(filename)

# Sentiment Analysis with VADER

Let's look at the text at index 3 in our data - we'll just work with the headlines, rather than the full articles.

In [None]:
# show the headline stored under index label 3 of the "title" column
print(data["title"][3])

In [None]:
# get sentiment scores with VADER for the headline at index 3
# (a bare expression, so the notebook shows the returned scores as cell output)
analyzer.polarity_scores(data["title"][3])

In [None]:
# score every headline with VADER, keeping one result per row in data order
vader_scores = [analyzer.polarity_scores(title) for title in data["title"]]

In [None]:
# create a dataframe from the list of per-headline VADER results
vader_df = pd.DataFrame(vader_scores)

In [None]:
# display the VADER dataframe as the cell output
vader_df

# Sentiment analysis with ```spaCyTextBlob```

A slightly different approach uses ```TextBlob``` via ```spaCy``` to do the sentiment analysis. To do this, we first need to use our ```nlp()``` pipeline to create a ```Doc``` for each headline. The sentiment scores can then be found as attributes of each ```Doc```.

In [None]:
# show the headline stored under index label 3 of the "title" column again
print(data["title"][3])

In [None]:
# get scores for one doc: run the headline at index 3 through the nlp pipeline
test_doc = nlp(data["title"][3])

In [None]:
# polarity of the test headline, read from the doc's `blob` extension attribute
test_doc._.blob.polarity

In [None]:
# subjectivity of the test headline, read from the same `blob` extension
test_doc._.blob.subjectivity

In [None]:
# the individual assessments attached to the blob's sentiment for this headline
test_doc._.blob.sentiment_assessments.assessments

In [None]:
# get polarity scores for all headlines
# nlp.pipe streams the headlines through the pipeline in batches (as the NER
# cell further down already does) instead of calling nlp() once per headline
polarity = [headline_doc._.blob.polarity for headline_doc in nlp.pipe(data["title"])]

In [None]:
# get subjectivity scores
# iterate over the headline strings themselves: the pipeline has to be applied
# to each text (here in batches via nlp.pipe), not to the whole column at once
subjs = [headline_doc._.blob.subjectivity for headline_doc in nlp.pipe(data["title"])]

# Doing NER with ```spaCy```

In [None]:
# run a short example sentence through the pipeline
doc = nlp("My name is Ross")

We can then find every individual occurrence of some kind of named entity. Note that this returns all named entities, regardless of type.

In [None]:
# Walk over the named entities found in the doc object
for entity in doc.ents:
    # show the entity text next to its string label (NB: .label_ not .label)
    print(entity.text, entity.label_)

We can also iterate through the full data set and get similar results for every headline - here keeping only the entities labelled ```GPE```.

In [None]:
# collect the text of every entity labelled "GPE" across all headlines
headlines = data["title"]
ents = []
# nlp.pipe yields docs lazily, so tqdm cannot infer how many there will be;
# passing total lets the progress bar show a fraction and an estimate
for headline_doc in tqdm(nlp.pipe(headlines, batch_size=500), total=len(headlines)):
    for entity in headline_doc.ents:
        if entity.label_ == "GPE":
            ents.append(entity.text)

# Working with a longer text

We can also work with a longer text, such as a novel, rather than a number of short texts.

In [None]:
# build the path to the novel's text file from its individual components
novel_path_parts = ("..", "..", "CDS-LANG", "100_english_novels", "corpus", "Cbronte_Jane_1847.txt")
filename = os.path.join(*novel_path_parts)

In [None]:
# read the whole novel into memory as a single UTF-8 decoded string
with open(filename, encoding="utf-8") as novel_file:
    text = novel_file.read()

In [None]:
# get spacy Doc
# raise the pipeline's maximum accepted text length before processing the novel
# NOTE(review): presumably the novel is longer than the pipeline's default
# limit but shorter than 1,500,000 characters - confirm against len(text)
nlp.max_length = 1500000
doc = nlp(text)

In [None]:
# sentence tokenization: one polarity score per sentence, in document order
# (this rebinds `polarity`, replacing the headline scores computed earlier)
polarity = [sent._.blob.polarity for sent in doc.sents]

# Plotting sentiment over time

In [None]:
# plot polarity: the raw per-sentence scores in the order they were collected
plt.plot(polarity)

In [None]:
# smooth with a rolling window: mean over each run of 500 consecutive scores
polarity_series = pd.Series(polarity)
smoothed_sentiment = polarity_series.rolling(500).mean()

In [None]:
# plot the results: the rolling-mean polarity computed in the previous cell
plt.plot(smoothed_sentiment)