# Sentence Sentiments

## Load Crawled Data and Build Dataframe

### Libraries

In [0]:
import os
from google.colab import drive
import json
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter, defaultdict
import numpy as np 
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib
import matplotlib.dates as mdates
from textblob import TextBlob
import nltk

In [0]:
# Download NLTK's 'punkt' package (sentence tokenizer data) so that
# nltk.tokenize.sent_tokenize can be used in the cells below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
# Scratch check: run the sentence tokenizer on a string made up mostly of
# newlines, a non-breaking space and a bullet character. `a_list` is not read
# again in the cells shown here; `text` is reassigned inside the loading loop.
text = ""
a_list = nltk.tokenize.sent_tokenize('\n\n\n\n\xa0\n\n•\n\nSteel Storage Containers.')

### Mounting Google Drive (the data must first be copied to your own Google Drive)

In [0]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount('/content/gdrive')
# Folder that is scanned for the crawled ".JSON" paragraph files. The path is
# relative, so it only resolves when the working directory is /content
# (presumably the Colab default; confirm).
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = "gdrive/My Drive/10kparagraphs"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
## sentiment analysis
def sentiment_analysis(text):
  """Score `text` with TextBlob.

  The input is passed through str() before it is handed to TextBlob, so
  non-string values are scored on their string representation.

  Returns:
    Tuple (polarity, subjectivity) read from TextBlob's `sentiment` property.
  """
  scores = TextBlob(str(text)).sentiment
  return scores.polarity, scores.subjectivity

## word count
def get_word_counts(text, keywords):
  """Count keyword mentions in `text`, in absolute and relative terms.

  Matching is case-insensitive substring matching, so the keyword "covid"
  is also counted inside "COVID-19".

  Args:
    text: Text to search. It is converted with str() first, mirroring
      sentiment_analysis, so non-string values do not raise.
    keywords: Iterable of keywords. Each one is converted with str() and
      lower-cased before counting.

  Returns:
    Tuple (count, relative_count): the total number of occurrences over all
    keywords, and that total divided by the number of whitespace-separated
    words in the text. Returns (0, 0.0) when the text contains no words.
  """
  text = str(text).lower()

  # split() without an argument collapses runs of whitespace and returns []
  # for empty text. split(" ") always returned at least one element, which
  # made the emptiness guard a no-op and inflated the word count whenever
  # the text contained repeated spaces.
  n_words = len(text.split())
  if n_words == 0:
    return 0, 0.0

  # The text is lower-cased, so the keywords have to be as well; otherwise a
  # capitalised keyword could never match.
  corona_count = sum(text.count(str(k).lower()) for k in keywords)

  return corona_count, corona_count / n_words


In [0]:
keywords = ["corona", "covid"]

# One dict per sentence is collected here and the DataFrame is built once
# after the loop. Calling DataFrame.append for every sentence copies the whole
# frame each time (quadratic cost) and the method was removed in pandas 2.0.
sentence_rows = []

for filename in tqdm(os.listdir(dir)):
    # Only files with the ".JSON" suffix are read.
    if not filename.endswith(".JSON"):
        continue

    json_path = os.path.join(dir, filename)
    with open(json_path) as f:
        json_dict = json.load(f)

    # Files without a '0' entry contain no article data and are skipped.
    if '0' not in json_dict:
        continue

    # json_dict['0'] maps paragraph keys to paragraph text.
    for text in json_dict['0'].values():
        for sentence in nltk.tokenize.sent_tokenize(text):
            sentiment, subjectivity = sentiment_analysis(sentence)
            corona_count, rel_corona_count = get_word_counts(sentence, keywords)

            sentence_rows.append({
                'JSON_file': filename,
                'sentence_text': sentence,
                'contains_corona': corona_count,
                'relative_corona_count': rel_corona_count,
                'sentiment': sentiment,
                # Previously computed but discarded; stored under the same
                # column name the corona-sentences section uses.
                'subjectivity': subjectivity,
            })

# Explicit column list: keeps the column order stable and yields a frame with
# the expected columns even when no sentence was found.
df = pd.DataFrame(
    sentence_rows,
    columns=[
        'JSON_file',
        'sentence_text',
        'contains_corona',
        'relative_corona_count',
        'sentiment',
        'subjectivity',
    ],
)





  0%|          | 0/1763 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/1763 [00:02<1:06:19,  2.26s/it][A[A[A[A



  0%|          | 2/1763 [00:11<2:04:49,  4.25s/it][A[A[A[A



  0%|          | 3/1763 [00:13<1:45:10,  3.59s/it][A[A[A[A



  0%|          | 4/1763 [00:14<1:26:14,  2.94s/it][A[A[A[A



  0%|          | 5/1763 [00:21<2:03:58,  4.23s/it][A[A[A[A



  0%|          | 6/1763 [00:29<2:37:02,  5.36s/it][A[A[A[A



  0%|          | 7/1763 [00:33<2:17:49,  4.71s/it][A[A[A[A



  0%|          | 8/1763 [00:35<1:56:40,  3.99s/it][A[A[A[A



  1%|          | 9/1763 [00:36<1:33:44,  3.21s/it][A[A[A[A



  1%|          | 11/1763 [00:37<1:07:06,  2.30s/it][A[A[A[A



  1%|          | 12/1763 [00:37<51:48,  1.78s/it]  [A[A[A[A



  1%|          | 13/1763 [00:41<1:09:05,  2.37s/it][A[A[A[A



  1%|          | 14/1763 [00:41<49:52,  1.71s/it]  [A[A[A[A



  1%|          | 15/1763 [00:41<36:10,  1.24s/it][A[A[A[A



  1%|▏       

In [0]:
# Persist the per-sentence results: the pickle keeps the full frame, the CSV
# is written without the sentence text.
df.to_pickle("gdrive/My Drive/corona_all_sentences_sentiment.pkl")
# The frame built by the loading loop stores the text under "sentence_text";
# it has no "raw_text" column, so dropping only "raw_text" raised KeyError.
# Both names are listed with errors="ignore" so either layout is handled.
df_no_text = df.drop(columns=["raw_text", "sentence_text"], errors="ignore")
df_no_text.to_csv("gdrive/My Drive/corona_all_sentences_sentiment.csv", encoding = 'utf-8', index=False)

## Only Corona Sentences

In [0]:
# Load the ';'-separated corona sentence export. The path is passed directly
# so pandas opens and closes the file itself; the previous open(...) handle
# was never closed.
df = pd.read_csv("gdrive/My Drive/corona_sentences.csv", delimiter=";")
# Keep only these four columns for the sentiment step.
df = df[['sic', 'cik', 'store', 'sentences']]

In [0]:
## sentiment analysis
def row_sentiment_analysis(row):
  """Return a copy of `row` extended with TextBlob sentiment scores.

  Args:
    row: Row object supporting item access and .copy() (for example one
      DataFrame row) with a "sentences" entry. The value is converted with
      str() before it is scored.

  Returns:
    A copy of the row with two added entries: "sentiment" (TextBlob polarity)
    and "subjectivity" (TextBlob subjectivity).
  """
  scores = TextBlob(str(row["sentences"])).sentiment

  # Work on a copy: pandas does not support apply() callbacks that mutate
  # the object they are handed.
  scored_row = row.copy()
  scored_row["sentiment"] = scores.polarity
  scored_row["subjectivity"] = scores.subjectivity

  return scored_row


# Score every row; the function is passed directly, no lambda wrapper needed.
df = df.apply(row_sentiment_analysis, axis=1)

In [0]:
# Persist the scored corona sentences: a pickle of the full frame plus a
# ';'-separated UTF-8 CSV written without the index column.
df.to_pickle("gdrive/My Drive/corona_sentences_sentiment.pkl")
# Unlike the all-sentences export, no text column is dropped before the CSV
# is written.
df.to_csv("gdrive/My Drive/corona_sentences_sentiment.csv", sep=';', encoding = 'utf-8', index=False)