# Assignment 4: SentimentAnalysis

In [1]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding = "ISO-8859-1", names=['polarity', 'id','time', 'query','user', 'text'] )


Your assignment is to perform sentiment analysis on the tweets. You can use one of two approaches to analyze the data:

1. Using a machine learning classifier such as a Naive Bayes Classifier. Select 15,000 tweets randomly from each of the 3 categories (0 = negative, 2 = neutral, 4 = positive)to build your training dataset. Select 3000 tweets randomly from the 3 categories to build your test set. Make sure there is no overlap between the training and test sets. Use the training data to train your model and the test data to evaluate the performance of your model. Report the Precision, Recall, and F-1 score of the model. 

2. Using a lexicon based approach. You can use Textblob or any other Python implementation to compute sentiment for each tweet. Select 15,000 tweets randomly from each of the 3 categories (0 = negative, 2 = neutral, 4 = positive) to build a dataset. Predict the sentiment for these tweets using your model and compare the model's predicted sentiment to the actual sentiment reported in the data (Column 0). Report the Precision, Recall, and F-1 score of the model.

Submit an ipynb file along with a report discussing your results.


Useful links:
1. https://www.dataquest.io/blog/naive-bayes-tutorial/
2. https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed
3. https://triton.ml/blog/sentiment-analysis 
4. https://medium.freecodecamp.org/how-to-build-a-twitter-sentiments-analyzer-in-python-using-textblob-948e1e8aae14
5. https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

In [2]:
df.head(3)

Unnamed: 0,polarity,id,time,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [3]:
print(df['polarity'].value_counts())

4    800000
0    800000
Name: polarity, dtype: int64


Values are equally split 50% positive and 50% negative, but there are no neutral tweets.

### Split, Sample Data, Combine Data

In [4]:
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    #text = re.sub('@[^\s]+','USER', text)
    text = text.lower()    
    #text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    text=re.sub("(\\d|\\W)+"," ",text)
    text = re.sub(' +',' ', text)
    return text.strip()
def get_top_n_words(words, n ,stopwords):
    word_count_vector = CountVectorizer(max_df=0.95,stop_words=stopwords).fit(words)
    bag_of_words = word_count_vector.transform(words)
    word_sums = bag_of_words.sum(axis=0) 
    words_freq = [(word, word_sums[0, idx]) for word, idx in word_count_vector.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
 
def get_stop_words(stop_file_path):
#read in stop list words
#https://gist.github.com/CristhianBoujon/c719ba2287a630a6d3821d37a9608ac8/cd8308e5ab8ceae3d13c363a67c154c83e560926
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
    return frozenset(stop_set)

In [5]:
df['processed_text']= [preprocess_text(t) for t in df['text']]

In [6]:
df['processed_text'].head(3)
positivetweets = df[df['polarity'] == 4].sample((int)(45000/2), replace=False)
negativetweets = df[df['polarity'] == 0].sample((int)(45000/2),replace=False)

In [7]:
#get sample stopwords file
stopwords=get_stop_words("stopwords_en.txt")
#processed positive tweets text
docs=positivetweets['processed_text'].tolist()
most_common_positive=get_top_n_words(docs,25,stopwords)
print(most_common_positive)

  'stop_words.' % sorted(inconsistent))


[('good', 1742), ('url', 1444), ('day', 1351), ('love', 1335), ('quot', 1289), ('lol', 1054), ('time', 876), ('today', 843), ('amp', 757), ('great', 690), ('ll', 670), ('back', 653), ('night', 645), ('happy', 616), ('haha', 607), ('morning', 586), ('im', 567), ('hope', 546), ('twitter', 520), ('don', 514), ('work', 503), ('fun', 502), ('nice', 494), ('home', 436), ('tomorrow', 432)]


In [24]:
#get sample stopwords file
stopwords=get_stop_words("twitter-stopwords.txt")
#processed negative tweets text
docs=negativetweets['processed_text'].tolist()
most_common_negative=get_top_n_words(docs,25,stopwords)
print(most_common_negative)

  'stop_words.' % sorted(inconsistent))


[('to', 8867), ('the', 7240), ('my', 5434), ('it', 4361), ('and', 4260), ('is', 3640), ('in', 3239), ('you', 2927), ('for', 2736), ('of', 2544), ('me', 2510), ('so', 2504), ('on', 2352), ('but', 2305), ('that', 2304), ('have', 2288), ('not', 2026), ('just', 1813), ('at', 1794), ('be', 1692), ('was', 1645), ('this', 1530), ('now', 1515), ('no', 1481), ('can', 1439)]


In [25]:
sampleframe= [positivetweets,negativetweets]
df = pd.concat(sampleframe)

1. Using a machine learning classifier such as a Naive Bayes Classifier. Select 15,000 tweets randomly from each of the 3 categories (0 = negative, 2 = neutral, 4 = positive)to build your training dataset. Select 3000 tweets randomly from the 3 categories to build your test set. Make sure there is no overlap between the training and test sets. Use the training data to train your model and the test data to evaluate the performance of your model. Report the Precision, Recall, and F-1 score of the model.

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),#tokenizing text
                     ('tfidf', TfidfTransformer()),#extract term frequencey times
                     ('clf', MultinomialNB())])# multinomial Naives Bayes
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [27]:
data = df['processed_text']
labels = df['polarity']
categories = [0,4]


In [28]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=40)
text_clf.fit(x_train,y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [29]:
predicted = text_clf.predict(x_test)

In [30]:
np.mean(predicted == y_test)

0.754

In [31]:
from sklearn import metrics
print(metrics.classification_report(y_test,predicted,digits=3))

              precision    recall  f1-score   support

           0      0.721     0.829     0.771      6743
           4      0.799     0.679     0.734      6757

   micro avg      0.754     0.754     0.754     13500
   macro avg      0.760     0.754     0.753     13500
weighted avg      0.760     0.754     0.753     13500



## Using Stop Word Library
#read in stop list words
#https://sites.google.com/site/kevinbouge/stopwords-lists

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.85,stop_words=stopwords)),#tokenizing text
                     ('tfidf', TfidfTransformer()),#extract term frequencey times
                     ('clf', MultinomialNB())])# multinomial Naives Bayes
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [33]:
data = df['processed_text']
labels = df['polarity']
categories = [0,4]

In [34]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)
text_clf.fit(x_train,y_train)

  'stop_words.' % sorted(inconsistent))


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset(...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [35]:
predicted = text_clf.predict(x_test)

In [36]:
np.mean(predicted == y_test)

0.7499259259259259

In [37]:
from sklearn import metrics
print(metrics.classification_report(y_test,predicted,digits=3))

              precision    recall  f1-score   support

           0      0.714     0.826     0.766      6688
           4      0.798     0.675     0.732      6812

   micro avg      0.750     0.750     0.750     13500
   macro avg      0.756     0.751     0.749     13500
weighted avg      0.756     0.750     0.749     13500

