In [None]:
import nltk
import re

In [None]:
# Download the NLTK `twitter_samples` corpus so the tweet files read below are available locally.
nltk.download('twitter_samples')

In [None]:
from nltk.corpus import twitter_samples

In [None]:
import pandas as pd
# Load the tweet texts of each sentiment class from the corpus files.
positive_tweets, negative_tweets = (
    twitter_samples.strings(file_name)
    for file_name in ('positive_tweets.json', 'negative_tweets.json')
)


In [None]:
# Preview a handful of tweets instead of printing the entire corpus.
positive_tweets[:5]

In [None]:
# Keep the first positive tweet as the running example for the cleaning steps.
original_tweet = positive_tweets[0]
print(original_tweet)

In [None]:
# Remove hashtags: one or more '#' followed by a run of non-whitespace characters.
tweet=re.sub(r'#+\S+',"",original_tweet)
tweet

In [None]:
# Remove @mentions: one or more '@' followed by a run of non-whitespace characters.
tweet=re.sub(r'@+\S+',"",tweet)
tweet

In [None]:
# Remove links. This starts over from positive_tweets[5], so after this cell
# `tweet` no longer derives from `original_tweet`.
print(positive_tweets[5])
tweet=re.sub(r'https?://\S+',"",positive_tweets[5])
print(tweet)
# Remove every character that is neither a word character (\w: letters, digits,
# underscore) nor whitespace.
tweet=re.sub(r'[^\w\s]',"",tweet)
tweet

In [None]:
# Remove newline characters. They are replaced with the empty string, so words
# separated only by a newline end up joined together.
tweet=re.sub(r'\n',"",tweet)
tweet

In [None]:
print(positive_tweets[8])
# Remove emoticons: delete each ':' together with the non-whitespace run that
# follows it (so ':)' and ':-D' go, but so does any other text glued to a colon).
tweet=re.sub(r':\S*',"",positive_tweets[8])
tweet

In [None]:
def clean_text(tweet):
    """Strip Twitter-specific noise from a single raw tweet.

    Steps, in order: drop hashtags, @mentions and http(s) links; turn newlines
    into spaces; drop every ':'-prefixed run (catches emoticons such as ':)');
    drop numbers that stand alone between whitespace; drop remaining
    punctuation; trim leading and trailing whitespace.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text. Case is left untouched, and runs of spaces may remain
        where tokens were removed (harmless for a later ``str.split()``).
    """
    tweet = re.sub(r'#+\S+', "", tweet)          # hashtags
    tweet = re.sub(r'@+\S+', "", tweet)          # @mentions
    tweet = re.sub(r'https?://\S+', "", tweet)   # links
    # A newline becomes a space rather than "", otherwise the words on either
    # side of the line break would be fused into one token.
    tweet = re.sub(r'\n', " ", tweet)
    # ':' plus whatever non-whitespace follows it (emoticons).
    tweet = re.sub(r':\S*', "", tweet)
    # Stand-alone numbers (e.g. phone numbers). Lookarounds leave the
    # surrounding whitespace in place, so neighbouring words stay separated and
    # consecutive numbers ("a 1 2 b") are all removed.
    tweet = re.sub(r'(?<=\s)\d+(?=\s)', "", tweet)
    # Anything that is neither a word character nor whitespace.
    tweet = re.sub(r'[^\w\s]', "", tweet)
    # Raw-string patterns: "\s" in a plain string is an invalid escape sequence.
    tweet = re.sub(r'^\s+', "", tweet)
    tweet = re.sub(r'\s+$', "", tweet)
    return tweet
    

In [None]:
# Run every raw positive tweet through clean_text.
clean_positive_tweets = list(map(clean_text, positive_tweets))

In [None]:
# Preview the first few cleaned tweets rather than dumping the whole list.
clean_positive_tweets[:5]

In [None]:
# Lowercase every cleaned positive tweet.
clean_positive_tweets = list(map(str.lower, clean_positive_tweets))

In [None]:
# Preview the first few lowercased tweets rather than dumping the whole list.
clean_positive_tweets[:5]

In [None]:
# Do the same for negative tweets: clean and lowercase in a single pass,
# then preview a few results instead of dumping the whole list.
clean_negative_tweets=[clean_text(x).lower() for x in negative_tweets]
clean_negative_tweets[:5]

In [None]:
# Download the NLTK stop-word lists used for filtering tokens.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# English stop-word list; process_text filters tweet tokens against it.
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance used by process_text.
stemmer=PorterStemmer()

In [None]:
#process the tweets by doing the following
#1- remove stop words
#2- pass the words through the stemmer
def process_text(tweets):
    """Tokenize, stop-word-filter and stem a collection of cleaned tweets.

    Args:
        tweets: Iterable of tweet strings (already cleaned and lowercased by
            the caller; the stop-word comparison is an exact string match).

    Returns:
        A list with one entry per tweet, each entry being the list of stemmed
        tokens that survived stop-word removal.
    """
    # Build the lookup once: membership in a set is O(1), whereas the
    # module-level `stop_words` list is scanned linearly for every token.
    stop_word_set = set(stop_words)
    return [
        [stemmer.stem(token) for token in tweet.split() if token not in stop_word_set]
        for tweet in tweets
    ]



In [None]:
#clean positive tweets
# NOTE: this rebinds clean_positive_tweets from strings to token lists, so the
# cell cannot simply be re-run: process_text calls .split() on each item.
# Re-run the cleaning cells first if you need to execute it again.
clean_positive_tweets=process_text(clean_positive_tweets)
# Preview a few results instead of dumping all of them.
clean_positive_tweets[:5]

In [None]:
#clean negative tweets
# NOTE: this rebinds clean_negative_tweets from strings to token lists, so the
# cell cannot simply be re-run: process_text calls .split() on each item.
# Re-run the cleaning cells first if you need to execute it again.
clean_negative_tweets=process_text(clean_negative_tweets)
# Preview a few results instead of dumping all of them.
clean_negative_tweets[:5]

In [None]:
# Number of negative tweets (one token list per tweet after processing).
total_negative_tweets=len(clean_negative_tweets)
total_negative_tweets

In [None]:
# Number of positive tweets (one token list per tweet after processing).
total_positive_tweets=len(clean_positive_tweets)
total_positive_tweets

In [None]:
# Single dataset: all positive tweets first, followed by all negative tweets.
all_tweets = clean_positive_tweets + clean_negative_tweets

In [None]:
# Check that both the positive and negative tweets were appended by looking at the combined length.
len(all_tweets)


In [None]:
# One label per tweet: 1 marks a positive tweet, 0 a negative one.
positive_labels = [1 for _ in clean_positive_tweets]
negative_labels = [0 for _ in clean_negative_tweets]

In [None]:
# Labels in the same order as all_tweets: positives first, then negatives.
labels = positive_labels + negative_labels

In [None]:
#shuffle the data
import random

# Fixed seed so the shuffled order (and every output derived from it) is
# reproducible across kernel restarts.
random.seed(42)
# Pair the tweets with the unshuffled label lists rather than with `labels`:
# `labels` is rebound to the shuffled order below, so zipping it against the
# unshuffled `all_tweets` on a re-run would silently misalign tweets and labels.
zip_list = list(zip(all_tweets, positive_labels + negative_labels))
random.shuffle(zip_list)
tweets, labels = zip(*zip_list)

In [None]:
def build_freq_dict(tweets,labels):
    """Count, per word, how often it occurs in positive and in negative tweets.

    Args:
        tweets: Sequence of token lists, one per tweet.
        labels: Sequence parallel to ``tweets``; a label equal to 0 counts as
            negative, any other value as positive.

    Returns:
        dict mapping each word to ``[positive_count, negative_count]``.
    """
    counts = {}
    for idx, tokens in enumerate(tweets):
        for token in tokens:
            # Column 0 holds the positive count, column 1 the negative count.
            column = 1 if labels[idx] == 0 else 0
            counts.setdefault(token, [0, 0])[column] += 1
    return counts





In [None]:
# Per-word [positive_count, negative_count] table built from the shuffled, labelled tweets.
freq_dict=build_freq_dict(tweets,labels)

In [None]:
# Preview the first few entries rather than dumping the whole vocabulary.
dict(list(freq_dict.items())[:10])

In [None]:
def get_total_pos_neg_frequency(freq_dict):
    """Sum the per-word counts into class-level totals.

    Args:
        freq_dict: dict mapping word -> ``[positive_count, negative_count]``.

    Returns:
        Tuple ``(total_positive_count, total_negative_count)``.
    """
    per_word_counts = list(freq_dict.values())
    pos_frequency = sum(counts[0] for counts in per_word_counts)
    neg_frequency = sum(counts[1] for counts in per_word_counts)
    return pos_frequency, neg_frequency


In [None]:
# Total word occurrences in positive tweets and in negative tweets.
pos_freq,neg_freq=get_total_pos_neg_frequency(freq_dict)

In [None]:
#build probability
#P(word i |pos) and P(word i|neg)
#we will use this formula:
#  P(word i|class) = (freq(word, class) + 1) / (number of words in class + number of unique words)
def build_probability(freq_dict,count_pos,count_neg):
    """Compute add-one (Laplace) smoothed word likelihoods for both classes.

    Args:
        freq_dict: dict mapping word -> ``[positive_count, negative_count]``.
        count_pos: Total number of word occurrences in the positive class.
        count_neg: Total number of word occurrences in the negative class.

    Returns:
        dict mapping word -> ``[P(word|neg), P(word|pos)]``. This index order
        (negative first) is the one build_naive_inference reads when it forms
        the ratio ``entry[1] / entry[0]``.
    """
    total_unique_words = len(freq_dict)
    probability_dict = {}
    for word, freq in freq_dict.items():
        # Each class count must be normalised by that same class's total;
        # mixing them yields columns that do not sum to 1 over the vocabulary.
        p_word_given_pos = (freq[0] + 1) / (count_pos + total_unique_words)
        p_word_given_neg = (freq[1] + 1) / (count_neg + total_unique_words)
        probability_dict[word] = [p_word_given_neg, p_word_given_pos]
    return probability_dict


In [None]:
# Smoothed per-word likelihoods derived from the word counts and the class totals.
probability_dict=build_probability(freq_dict,pos_freq,neg_freq)

In [None]:
# Preview the first few entries rather than dumping the whole vocabulary.
dict(list(probability_dict.items())[:10])

In [None]:
import numpy as np

In [None]:
#calculate naive bayes inference
#score = log prior + summation of log(P(w|pos)/P(w|neg))
#log prior = log(P(pos)/P(neg))
def build_naive_inference(probability_dict,tweets,count_pos,count_negative):
    """Classify tokenized tweets with the Naive Bayes log-likelihood ratio.

    Args:
        probability_dict: dict mapping word -> two-element sequence; the score
            adds ``log(entry[1] / entry[0])`` for each known word.
        tweets: Iterable of token lists, one per tweet.
        count_pos: Positive-class count used in the log prior.
        count_negative: Negative-class count used in the log prior.
            NOTE(review): for a true class prior these should be per-class
            tweet counts, not word totals -- confirm at the call sites.

    Returns:
        List with one prediction per tweet: 1 when the score is >= 0,
        otherwise 0.
    """
    predictions = []
    for tweet in tweets:
        result = np.log(count_pos/count_negative)
        for word in tweet:
            word_probs = probability_dict.get(word)
            if word_probs is None:
                # Word never seen while building the table: it carries no
                # evidence for either class, so skip it instead of raising.
                continue
            result += np.log(word_probs[1]/word_probs[0])
        predictions.append(1 if result >= 0 else 0)
    return predictions




In [None]:
# The log prior log(P(pos)/P(neg)) is a ratio of class frequencies, so pass the
# number of tweets per class rather than the per-class word totals.
predictions=build_naive_inference(probability_dict,tweets,total_positive_tweets,total_negative_tweets)

In [None]:
# Preview the first few predictions rather than dumping all of them.
predictions[:10]

In [None]:
# Preview the first few true labels rather than dumping all of them.
labels[:10]

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of tweets whose predicted label matches the true label.
# NOTE(review): `predictions` were produced for the same tweets that built
# freq_dict, so this is training accuracy, not held-out accuracy.
acc=accuracy_score(labels,predictions)

In [None]:
# Display the model accuracy computed above.
acc

In [None]:
# To test new sentences we need a predict function that cleans the text first
# and then uses the inference function to get the label.
def naive_predict(test_tweet):
    """Predict the sentiment label of one raw sentence.

    The text is lowercased, cleaned with clean_text, tokenized and stemmed with
    process_text, and scored with build_naive_inference.

    Args:
        test_tweet: Raw text to classify.

    Returns:
        1 for positive, 0 for negative.
    """
    #convert to lowercase
    test_tweet=test_tweet.lower()
    test_tweet=clean_text(test_tweet)
    processed_tweet=process_text([test_tweet])
    # New text routinely contains words that are not in probability_dict; drop
    # them (they carry no evidence) so inference cannot fail on a lookup.
    known_tokens=[[word for word in tokens if word in probability_dict] for tokens in processed_tweet]
    # The log prior is a ratio of class frequencies: use tweet counts per
    # class, not the per-class word totals.
    predictions=build_naive_inference(probability_dict,known_tokens,total_positive_tweets,total_negative_tweets)
    return predictions[0]
    

In [None]:
# Sanity check on a clearly positive sentence (1 = positive, 0 = negative).
naive_predict("I am very happy today")

In [None]:
# Sanity check on a clearly negative sentence (1 = positive, 0 = negative).
naive_predict("My day was bad")