# Install Dependencies

In [None]:
# !pip install emoji
# !pip install nltk
# !pip install pandas
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

# Run

In [None]:
import json
import re
import csv
import spacy
import nltk
import en_core_web_sm
import pandas as pd
from emoji import get_emoji_regexp

# nltk.download('wordnet')
# nltk.download('vader_lexicon')
# nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [None]:
# Locations of the CSV that is read and the CSV that is written.
input, output = './input.csv', './output.csv'

# Ticker symbols, taken from the 'ticker' column of the cleaned ticker list.
df = pd.read_csv('./cleaned_tickers.csv')
tickers = df.loc[:, 'ticker'].tolist()

# Row counter used by the CSV processing loop.
row_num = 0

In [None]:
# Lazily-populated cache of the heavyweight NLP helpers, so the spaCy model,
# the VADER lexicon, etc. are loaded once per session instead of once per call.
_SENTIMENT_RESOURCES = {}


def _get_sentiment_resources():
    """Return the shared NLP helpers, building them on first use."""
    if not _SENTIMENT_RESOURCES:
        # The dict literal is fully built before update() runs, so a failure
        # while loading any helper leaves the cache empty and the next call
        # simply retries.
        _SENTIMENT_RESOURCES.update({
            # NOTE(review): get_emoji_regexp is believed to have been removed
            # in emoji 2.0 -- confirm the pinned emoji version.
            'emoji_regexp': get_emoji_regexp(),
            'stop_words': en_core_web_sm.load().Defaults.stop_words,
            'lemmatizer': WordNetLemmatizer(),
            'sia': SIA(),
            # Raw string: the pattern contains regex escapes (\w, \$, \d, \S)
            # that are invalid escape sequences in a normal string literal.
            'word_tokenizer': RegexpTokenizer(r'\w+|\$[\d\.]+|http\S+'),
        })
    return _SENTIMENT_RESOURCES


def get_sentiment(text: str) -> list:
    """Score the sentiment expressed about each ticker mentioned in *text*.

    The text is stripped of emojis, split into sentences (" and " / " or "
    are also treated as sentence boundaries), and each sentence is scored
    with VADER. A sentence is attributed to the last token in it that appears
    in the module-level ``tickers`` collection; sentences without a ticker
    are ignored. When a ticker appears in several sentences, the highest
    compound score is kept.

    Args:
        text: Raw post text to analyse.

    Returns:
        A list of ``{'ticker': ..., 'sentiment_score': ...}`` dicts, one per
        ticker found, in order of first appearance. Empty if no ticker is
        mentioned.
    """
    resources = _get_sentiment_resources()

    # Remove emojis if any exist, then break compound clauses apart so each
    # clause is scored on its own.
    text = resources['emoji_regexp'].sub(u'', text)
    text = re.sub(r'\sand\s|\sor\s', '. ', text)
    sentences = sent_tokenize(text)

    # The stop-word filter and the lemmatizer are applied to whole sentence
    # strings (not to individual words): a sentence is dropped only when its
    # entire text is itself a stop word.
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']
    cleaned_output = [
        lemmatizer.lemmatize(sentence)
        for sentence in sentences
        if sentence not in stop_words
    ]

    sia = resources['sia']
    word_tokenizer = resources['word_tokenizer']
    # Set membership is O(1) per word instead of scanning the ticker list.
    known_tickers = set(tickers)

    result = {}
    for sentence in cleaned_output:
        # The last ticker found in the sentence wins.
        ticker = None
        for word in word_tokenizer.tokenize(sentence):
            if word in known_tickers:
                ticker = word

        if not ticker:
            continue

        compound = sia.polarity_scores(sentence)['compound']
        # Keep the most positive score seen so far for this ticker.
        result[ticker] = max(compound, result.get(ticker, compound))

    return [
        {'ticker': ticker, 'sentiment_score': sentiment_score}
        for ticker, sentiment_score in result.items()
    ]

In [None]:
# Run every data row of the input CSV through get_sentiment and write the
# post title, body, the existing third column and the library result to the
# output CSV.
#
# Both files are opened exactly once, with newline='' as the csv module
# requires, so quoted fields with embedded newlines are read correctly and
# the header and data rows get the same line ending on every platform.
# The input is opened first, so a missing input file does not truncate an
# existing output file.
with open(input, encoding='utf8', newline='') as in_file, \
        open(output, 'w', encoding='utf8', newline='') as out_file:
    reader = csv.reader(in_file, dialect=csv.excel)
    writer = csv.writer(out_file, lineterminator='\n')

    fields = ['Post Title', 'Post Body Text', 'Google Natural Language API Output', 'Library Output']
    writer.writerow(fields)

    # enumerate restarts the row counter on every run, so re-running this
    # code never mistakes the header row for data.
    for row_num, row in enumerate(reader, start=1):
        # Skip the header row, and blank lines (which csv.reader yields as []).
        if row_num == 1 or not row:
            continue
        sentiment = get_sentiment(f'{row[0]}\n\n{row[1]}')
        writer.writerow([row[0], row[1], row[2], str(sentiment)])
        # Flush after each row so partial results are visible on disk while
        # a long run is still in progress.
        out_file.flush()
print('completed')