In [1]:
import pandas as pd
import re
import nltk
import unicodedata
import os

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from google.cloud import language_v1

In [2]:
# Shared VADER analyzer; the VADER scoring cell further down calls
# analyzer.polarity_scores(...) on every cleaned text.
analyzer = SentimentIntensityAnalyzer()

In [2]:
def clean_strings(text):
    """Strip social-media noise from a raw text and normalise its whitespace.

    The following are replaced by a space: hashtags (``#`` followed by ASCII
    letters/digits), mentions (``@`` followed by ASCII letters/digits), URLs of
    the form ``scheme://...``, and every character that is not an ASCII letter,
    digit, space or tab. Runs of whitespace are then collapsed to single spaces
    and leading/trailing whitespace is dropped.

    Args:
        text: The raw text. Non-string values (for example the ``NaN`` pandas
            uses for an empty CSV cell, or ``None``) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a string.
    """
    # Missing values arrive from read_csv as float NaN; re.sub would raise
    # TypeError on them, so map anything that is not a string to empty text.
    if not isinstance(text, str):
        return ''

    # Raw string: '\w', '\/' and '\S' are invalid escapes in a normal literal
    # and trigger a SyntaxWarning on Python 3.12+.
    noise = r"(#[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(noise, " ", text).split())

def tokenization(text):
    """Split a string into word tokens.

    A token is a maximal run of word characters (regex ``\\w``); everything
    else acts as a delimiter. Unlike a plain ``re.split`` on the delimiters,
    no empty-string tokens are produced for leading/trailing delimiters or
    for empty input.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance; ``[]`` for empty input.
    """
    # Raw string avoids the invalid-escape SyntaxWarning that '\W+' triggers
    # on Python 3.12+; findall on \w+ yields the same tokens as splitting on
    # \W+ but without the empty strings at the edges.
    return re.findall(r'\w+', text)

_STOPWORD_SET = None  # built lazily by _load_stopwords() on first use


def _load_stopwords():
    """Return the cached stop-word set (NLTK English list plus 'amp')."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # 'amp' is added on top of the NLTK English stop-word list.
        _STOPWORD_SET = frozenset(nltk.corpus.stopwords.words('english')) | {'amp'}
    return _STOPWORD_SET


def remove_stopwords(text):
    """Drop stop words from a list of tokens.

    The stop-word list is loaded from NLTK once and cached as a frozenset, so
    applying this function row by row does not re-read the corpus and each
    membership test is a hash lookup rather than a list scan.

    Args:
        text: An iterable of tokens. Comparison is exact (case-sensitive).

    Returns:
        A list of the tokens that are not stop words, in their original order.
    """
    stopwords = _load_stopwords()
    return [word for word in text if word not in stopwords]

def join_tokens(text):
    """Glue a sequence of string tokens back into one space-separated string."""
    separator = ' '
    return separator.join(text)

In [3]:
# Company lookup table. The tweet, news and blog cells below merge against its
# 'TweetKey', 'NewsKey' and 'BlogKey' columns respectively to attach 'Company'.
company_df = pd.read_csv('../Data/company_map.csv')

In [None]:
# Load the tweets; the tweet id is not needed for sentiment scoring.
tweets_df = pd.read_csv('../Data/tweets.csv').drop(columns=['Tweet Id'])

# Text pipeline: strip noise -> lower-case and tokenize -> drop stop words -> re-join.
tweets_df['Text'] = (
    tweets_df['Text']
    .apply(clean_strings)
    .apply(lambda cleaned: tokenization(cleaned.lower()))
    .apply(remove_stopwords)
    .apply(join_tokens)
)

# Attach the company by matching the tweet's 'Tag' to 'TweetKey' (default
# inner join, so tweets whose tag has no match are dropped), then discard the
# join keys and the timestamp.
tweets_df = (
    tweets_df
    .merge(company_df[['Company', 'TweetKey']], left_on='Tag', right_on='TweetKey')
    .drop(columns=['Tag', 'TweetKey', 'Datetime'])
)

In [None]:
# Load the news articles (file is Latin-1 encoded); only the summary is scored.
news_df = (
    pd.read_csv('../Data/google_articles.csv', encoding = "ISO-8859-1")
    .drop(columns=['link', 'title'])
)

# Text pipeline: strip noise -> lower-case and tokenize -> drop stop words -> re-join.
news_df['nlp_summary'] = (
    news_df['nlp_summary']
    .apply(clean_strings)
    .apply(lambda cleaned: tokenization(cleaned.lower()))
    .apply(remove_stopwords)
    .apply(join_tokens)
)

# Attach the company by matching 'company' to 'NewsKey' (default inner join),
# then discard the join keys and the publish date.
news_df = (
    news_df
    .merge(company_df[['Company', 'NewsKey']], left_on='company', right_on='NewsKey')
    .drop(columns=['company', 'NewsKey', 'publish date'])
)

In [None]:
# Load the blog articles (file is Latin-1 encoded); only the summary is scored.
blogs_df = (
    pd.read_csv('../Data/huffpost_articles.csv', encoding = "ISO-8859-1")
    .drop(columns=['link', 'title', 'description'])
)

# Text pipeline: strip noise -> lower-case and tokenize -> drop stop words -> re-join.
blogs_df['nlp_summary'] = (
    blogs_df['nlp_summary']
    .apply(clean_strings)
    .apply(lambda cleaned: tokenization(cleaned.lower()))
    .apply(remove_stopwords)
    .apply(join_tokens)
)

# Attach the company by matching 'company' to 'BlogKey' (default inner join),
# then discard the join keys and the publish date.
blogs_df = (
    blogs_df
    .merge(company_df[['Company', 'BlogKey']], left_on='company', right_on='BlogKey')
    .drop(columns=['company', 'BlogKey', 'publish date'])
)

In [4]:
# Load the Yelp reviews. No merge with company_df happens here, yet a later
# cell groups this frame by 'Company' -- presumably the CSV already carries
# that column; verify against the data file.
reviews_df = pd.read_csv('../Data/yelp_reviews.csv')

# Text pipeline: strip noise -> lower-case and tokenize -> drop stop words -> re-join.
reviews_df['text'] = (
    reviews_df['text']
    .apply(clean_strings)
    .apply(lambda cleaned: tokenization(cleaned.lower()))
    .apply(remove_stopwords)
    .apply(join_tokens)
)

In [5]:
# NOTE(review): an identical SentimentIntensityAnalyzer is already bound to
# `analyzer` in an earlier cell; this cell rebinds the name to a fresh instance.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# VADER: keep only the 'compound' entry of each polarity_scores() result.
tweets_df['VADER'] = tweets_df['Text'].apply(
    lambda tweet: analyzer.polarity_scores(tweet)['compound']
)

news_df['VADER'] = news_df['nlp_summary'].apply(
    lambda summary: analyzer.polarity_scores(summary)['compound']
)

blogs_df['VADER'] = blogs_df['nlp_summary'].apply(
    lambda summary: analyzer.polarity_scores(summary)['compound']
)

reviews_df['VADER'] = reviews_df['text'].apply(
    lambda review: analyzer.polarity_scores(review)['compound']
)

In [None]:
# TextBlob: store the polarity of each cleaned text, one corpus at a time.
for corpus_df, text_column in (
    (tweets_df, 'Text'),
    (news_df, 'nlp_summary'),
    (blogs_df, 'nlp_summary'),
    (reviews_df, 'text'),
):
    corpus_df['TextBlob'] = corpus_df[text_column].apply(
        lambda document: TextBlob(document).sentiment.polarity
    )

In [7]:
# Point the Google client library at a service-account key file.
# setdefault() keeps credentials that are already configured in the
# environment (e.g. on another machine or in CI); the path below is only the
# fallback for the original author's workstation.
# NOTE(review): the fallback is an absolute, user-specific path -- prefer
# exporting GOOGLE_APPLICATION_CREDENTIALS before starting the kernel.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "/Users/bilalmajeed/Desktop/MRP/code/key.json")

_language_client = None  # created lazily by _get_language_client()


def _get_language_client():
    """Return one shared LanguageServiceClient, creating it on first use."""
    global _language_client
    if _language_client is None:
        _language_client = language_v1.LanguageServiceClient()
    return _language_client


def analyze_text_sentiment(text):
    """Score a text with the Google Cloud Natural Language sentiment API.

    Args:
        text: Plain-text content to analyse.

    Returns:
        The document-level sentiment score from the API response, or the
        sentinel ``-99`` when the client cannot be created or the request
        fails for any reason.
    """
    try:
        # Reuse a single client instead of constructing one per call; creation
        # stays inside the try so a credentials failure still yields -99.
        client = _get_language_client()
        document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
        response = client.analyze_sentiment(document=document)
        sentiment = response.document_sentiment.score
    except Exception:
        # Deliberately best-effort, but only for ordinary errors: a bare
        # `except:` would also swallow KeyboardInterrupt/SystemExit and make a
        # long-running .apply() impossible to stop.
        sentiment = -99

    return sentiment

In [None]:
# Google Cloud NL: one API request per row, so this is the slow cell.
tweets_df['GoogleAPI'] = tweets_df['Text'].apply(analyze_text_sentiment)

news_df['GoogleAPI'] = news_df['nlp_summary'].apply(analyze_text_sentiment)

blogs_df['GoogleAPI'] = blogs_df['nlp_summary'].apply(analyze_text_sentiment)

reviews_df['GoogleAPI'] = reviews_df['text'].apply(analyze_text_sentiment)

In [None]:
# Persist the row-level scores (one CSV per corpus, no index column).
for scored_df, csv_path in (
    (tweets_df, '../Sentiments/raw_sentiments/tweet_sentiments.csv'),
    (news_df, '../Sentiments/raw_sentiments/news_sentiments.csv'),
    (blogs_df, '../Sentiments/raw_sentiments/blogs_sentiments.csv'),
    (reviews_df, '../Sentiments/raw_sentiments/reviews_sentiments.csv'),
):
    scored_df.to_csv(csv_path, index=False, header=True)

In [None]:
def _company_means(frame, text_column):
    """Average the sentiment columns of ``frame`` per company.

    Args:
        frame: Row-level scores with a 'Company' column.
        text_column: Name of the cleaned-text column to drop before averaging.

    Returns:
        A DataFrame indexed by 'Company' holding the mean of every numeric
        column.
    """
    scores = frame.drop(columns=[text_column])
    if 'GoogleAPI' in scores.columns:
        # analyze_text_sentiment() returns -99 when a request fails. Averaging
        # that sentinel would drag a company's mean far outside the real score
        # range, so turn it into NaN, which mean() skips.
        scores = scores.assign(
            GoogleAPI=scores['GoogleAPI'].where(scores['GoogleAPI'] != -99)
        )
    # numeric_only=True: pandas >= 2.0 raises on leftover non-numeric columns
    # instead of silently dropping them as older versions did.
    return scores.groupby('Company').mean(numeric_only=True)


# NOTE: each frame name is rebound to its per-company aggregate, so this cell
# cannot be re-run without re-running the scoring cells first.
tweets_df = _company_means(tweets_df, "Text")
tweets_df.to_csv("../Sentiments/tweets_company_sentiment.csv")

blogs_df = _company_means(blogs_df, "nlp_summary")
blogs_df.to_csv("../Sentiments/blogs_company_sentiment.csv")

news_df = _company_means(news_df, "nlp_summary")
news_df.to_csv("../Sentiments/news_company_sentiment.csv")

reviews_df = _company_means(reviews_df, "text")
reviews_df.to_csv("../Sentiments/reviews_company_sentiment.csv")