#### Import packages

In [None]:
import pickle
import re
import string
from datetime import datetime, timedelta

import enchant
import nltk
import pandas as pd
import requests
from nltk import FreqDist, classify, NaiveBayesClassifier
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# News API endpoint and credential (values redacted in this notebook).
# Keep the real values out of version control.
API_URL = "########"
ACCESS_TOKEN ="########"

# Download the NLTK resources this notebook relies on, one after another.
for resource_name in ('wordnet', 'averaged_perceptron_tagger', 'stopwords', 'punkt'):
    nltk.download(resource_name)

#### Import trained model

In [11]:
# Load the previously trained news classifier from disk. The context manager
# closes the file handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load a pickle
# file that comes from a trusted source.
with open('news_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

#### Helper: check whether a string contains digits

In [12]:
def hasNumbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit.

    Digits are detected with ``str.isdigit``; an empty string yields False.
    """
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

#### Remove noise method

In [13]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Reduce a token list to lowercase, lemmatized English words.

    Every token is POS-tagged and then discarded when it is not an en_US
    dictionary word (checked in lowercase), contains a digit, is punctuation,
    or is a stop word. URL and @-mention fragments are stripped. Surviving
    tokens are lemmatized with WordNet using a part of speech derived from
    the tag (NN* -> noun, VB* -> verb, anything else -> adjective) and are
    returned in lowercase.

    Args:
        tweet_tokens: Sequence of string tokens, e.g. from ``word_tokenize``.
        stop_words: Container of lowercase words to exclude.

    Returns:
        List of cleaned lowercase tokens, in their original order.
    """
    # Built once per call; constructing these for every token is wasted work.
    dictionary = enchant.Dict("en_US")
    lemmatizer = WordNetLemmatizer()
    # Raw strings keep the `\(` / `\)` escapes valid regex without relying on
    # Python passing unknown string escapes through unchanged.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # enchant refuses to spell-check an empty string, so skip blanks
        # before asking the dictionary.
        if not token or not dictionary.check(token.lower()):
            continue

        token = url_pattern.sub('', token)
        token = mention_pattern.sub('', token)

        # A token that is now empty, or contains a digit, can never be kept.
        if not token or hasNumbers(token):
            continue

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if token and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

#### Fetch news stories from the API

In [14]:
def get_etf_news(etf, timeout=30):
    """Request news stories from the news API.

    Args:
        etf: ETF ticker symbol the caller wants news for.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error. Defaults to 30.

    Returns:
        The value stored under ``'stories'`` in the JSON response, or an
        empty list when the response is empty or carries no stories.
    """
    # NOTE(review): this query looks like a redacted placeholder -- it uses
    # neither `etf` nor ACCESS_TOKEN. Restore the real parameters; TODO confirm.
    query = {''}
    # Without a timeout a stalled server would block this call indefinitely.
    response = requests.get(API_URL, params=query, timeout=timeout)
    result = response.json()
    if isinstance(result, dict):
        # A payload without a 'stories' key is treated as "no stories"
        # rather than raising KeyError.
        return result.get('stories') or []
    return []
    

#### Tokenize and clean article descriptions (data cleaning)

In [15]:
def clean_data(stories):
    """Tokenize and clean the description of every story.

    Args:
        stories: Iterable of story dicts; each may carry a ``'description'``
            string.

    Returns:
        A list with one cleaned token list per story that has a non-empty
        description. Stories with a missing or empty description are skipped.
    """
    # A set makes the per-token stop-word membership test constant time.
    stop_words = set(stopwords.words('english'))
    stock_news_tokens = []

    for story in stories:
        # .get() tolerates stories that lack the key altogether.
        description = story.get('description')
        if description:
            stock_news_tokens.append(
                remove_noise(word_tokenize(description), stop_words)
            )

    return stock_news_tokens

#### Get ETF sentiment method

In [16]:
def get_sentiment(etf):
    """Compute the share of an ETF's news stories classified as positive.

    Args:
        etf: ETF ticker symbol.

    Returns:
        A float between 0.0 and 1.0: the number of stories the classifier
        labels "Positive" divided by the number of stories that had a
        description. Returns 0.0 when there are no stories or none of them
        has a description.
    """
    print(f'Doing sentiment for {etf}')
    stories = get_etf_news(etf)
    if not stories:
        return 0.0

    stock_news_tokens = clean_data(stories)
    if not stock_news_tokens:
        return 0.0

    # Each story is classified from a bag-of-words feature dict {token: True}.
    positive_count = sum(
        1
        for stock_token in stock_news_tokens
        if classifier.classify({token: True for token in stock_token}) == "Positive"
    )

    # Plain ratio; multiplying and then dividing by 100 only added rounding noise.
    return positive_count / len(stock_news_tokens)

#### Score sentiment for each ETF in the selected sectors

In [19]:
# Sectors whose ETFs should be scored.
sector_list = ['Utilities']
etf_sectors_df = pd.read_csv('etf_sectors_short.csv')
# .copy() yields an independent frame, so adding the 'Sentiment' column below
# does not assign into a slice of etf_sectors_df (SettingWithCopyWarning).
etf_list_df = etf_sectors_df[etf_sectors_df['Sector'].isin(sector_list)].copy()

ratings_list = []

# Only the ticker column is needed, so iterate it directly instead of rows.
for curr_etf in etf_list_df['ETF']:
    print(f'Currently checking {curr_etf}')
    ratings_list.append(get_sentiment(curr_etf))

# ratings_list is in the same order as the rows of etf_list_df.
etf_list_df['Sentiment'] = ratings_list
print(etf_list_df)

Currently checking XLU
Doing sentiment for XLU
Currently checking VPU
Doing sentiment for VPU
    ETF     Sector  Sentiment
16  XLU  Utilities        0.5
17  VPU  Utilities        0.0


In [22]:
# Ticker of the row holding the highest sentiment rating.
best_row_label = etf_list_df['Sentiment'].idxmax()
top_etf = etf_list_df.loc[best_row_label, 'ETF']
print(top_etf)

XLU
