#### Import nltk packages

In [25]:
import nltk, pickle, re, string, csv, random
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk import FreqDist, classify, NaiveBayesClassifier
import enchant

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adam_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adam_\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adam_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Noise removal method

In [37]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

In [38]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        
        d = enchant.Dict("en_US")
        if d.check(token.lower()) != True:
            token = ""
            
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        
        
        if hasNumbers(token):
            token = ""

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

#### Get texts from csv

In [39]:
positive_news = []
negative_news = []
neautral_news = []
with open("compiled_data_encoded.csv", encoding="utf-8") as f:
    reader = csv.reader(f)

    header = next(reader)
    found = False
    for line in reader:
        if line[0] == "positive":
            positive_news.append(line[1])
        elif line[0] == "negative":
            negative_news.append(line[1])
        else:
            neautral_news.append(line[1])

#### Tokenize and clean words

In [40]:
tknzr = TweetTokenizer()
stop_words = stopwords.words('english')

positive_cleaned_news_tokens = []
negative_cleaned_news_tokens = []
neutral_cleaned_news_tokens = []

for x in positive_news:
    positive_cleaned_news_tokens.append(remove_noise(tknzr.tokenize(x), stop_words))
    
for y in negative_news:
    negative_cleaned_news_tokens.append(remove_noise(tknzr.tokenize(y), stop_words))
    
for z in neautral_news:
    neutral_cleaned_news_tokens.append(remove_noise(tknzr.tokenize(z), stop_words))

#### Convert tokens to dictionary

In [41]:
def get_news_for_model(cleaned_tokens_list):
    for news_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in news_tokens)
        
positive_tokens_for_model = get_news_for_model(positive_cleaned_news_tokens)
negative_tokens_for_model = get_news_for_model(negative_cleaned_news_tokens)
neutral_tokens_for_model = get_news_for_model(neutral_cleaned_news_tokens)

#### Splitting the Dataset for Training and Testing the Model

In [42]:
positive_dataset = [(news_dict, "Positive") for news_dict in positive_tokens_for_model]
negative_dataset = [(news_dict, "Negative") for news_dict in negative_tokens_for_model]
neutral_dataset = [(news_dict, "Neutral") for news_dict in neutral_tokens_for_model]

dataset = positive_dataset + negative_dataset + neutral_dataset
random.shuffle(dataset)

train_data = dataset[:7446]
test_data = dataset[7446:]

#### Building and testing the model

In [43]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(15))

Accuracy is: 0.6949843260188088
Most Informative Features
                   short = True           Negati : Neutra =    163.4 : 1.0
                     ... = True           Negati : Neutra =    113.4 : 1.0
                   break = True           Positi : Neutra =     49.9 : 1.0
                building = True           Neutra : Negati =     30.5 : 1.0
             headquarter = True           Neutra : Positi =     28.8 : 1.0
                  design = True           Neutra : Negati =     28.6 : 1.0
                decrease = True           Negati : Positi =     27.4 : 1.0
                    stop = True           Positi : Neutra =     24.7 : 1.0
                  weekly = True           Negati : Neutra =     23.8 : 1.0
                 acquire = True           Neutra : Negati =     22.7 : 1.0
                   house = True           Neutra : Positi =     21.4 : 1.0
                  relate = True           Neutra : Positi =     21.4 : 1.0
             transaction = True           

#### Saving model

In [44]:
f = open('news_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

In [5]:
y_origin = ['positive', 'positive', 'negative', 'negative', 'neutral']
y_pred = ['positive', 'negative', 'negative', 'neutral', 'neutral']
print(f'y_origin = {y_origin}')
print(f'y_pred = {y_pred}')

y_origin = ['positive', 'positive', 'negative', 'negative', 'neutral']
y_pred = ['positive', 'negative', 'negative', 'neutral', 'neutral']


#### Confusion matrix

In [46]:
combined_tokens = positive_cleaned_news_tokens + negative_cleaned_news_tokens + neutral_cleaned_news_tokens
y_pred = []

for stock_token in combined_tokens:
    y_pred.append(classifier.classify(dict([token, True] for token in stock_token)).lower())
    
y_origin = (['positive'] * len(positive_cleaned_news_tokens)) + (['negative'] * len(negative_cleaned_news_tokens)) + (['neutral'] * len(neutral_cleaned_news_tokens))

In [50]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_origin,y_pred,labels=["positive", "negative", "neutral"])

array([[3626,  549,  873],
       [ 465, 1988,  257],
       [ 115,   83, 2680]], dtype=int64)