In [2]:
import nltk, re, string, random
from nltk.tag import pos_tag
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15-6-2023\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\15-6-2023\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\15-6-2023\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\15-6-2023\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\15-6-2023\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\15-6-2023\AppData\Roaming\nltk_data.

True

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [4]:
len(positive_tweets), len(negative_tweets)

(5000, 5000)

In [5]:
positive_tweets[500]

'Dang that is some rad @AbzuGame #fanart! :D https://t.co/bI8k8tb9ht'

In [6]:
positive_tweets[0], tweet_tokens[0]

('#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 ['#FollowFriday',
  '@France_Inte',
  '@PKuchly57',
  '@Milipol_Paris',
  'for',
  'being',
  'top',
  'engaged',
  'members',
  'in',
  'my',
  'community',
  'this',
  'week',
  ':)'])

In [7]:
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


# Cleaning Data

In [8]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []

    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

In [9]:
tweet_tokens[0], lemmatize_sentence(tweet_tokens[0])

(['#FollowFriday',
  '@France_Inte',
  '@PKuchly57',
  '@Milipol_Paris',
  'for',
  'being',
  'top',
  'engaged',
  'members',
  'in',
  'my',
  'community',
  'this',
  'week',
  ':)'],
 ['#FollowFriday',
  '@France_Inte',
  '@PKuchly57',
  '@Milipol_Paris',
  'for',
  'be',
  'top',
  'engage',
  'member',
  'in',
  'my',
  'community',
  'this',
  'week',
  ':)'])

In [10]:
def remove_noise(tweet_tokens, stop_words = () ):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):

        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                      '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)

        token = re.sub('(@[A-Za-z0-9_]+)0', '', token)

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:

            cleaned_tokens.append(token.lower())

    return cleaned_tokens

  token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\


In [11]:
stopwords = stopwords.words('english')

In [12]:
remove_noise(tweet_tokens[0], stopwords)

['#followfriday',
 '@france_inte',
 '@pkuchly57',
 '@milipol_paris',
 'top',
 'engage',
 'member',
 'community',
 'week',
 ':)']

In [13]:
tweet_tokens[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [14]:
remove_noise(tweet_tokens[300], stopwords)

['stats', 'day', 'arrive', '2', 'new', 'follower', 'unfollowers', ':)', 'via']

In [15]:
tweet_tokens[300]

['Stats',
 'for',
 'the',
 'day',
 'have',
 'arrived',
 '.',
 '2',
 'new',
 'followers',
 'and',
 'NO',
 'unfollowers',
 ':)',
 'via',
 'http://t.co/xxlXs6xYwe',
 '.']

In [16]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

In [17]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for token in positive_tweet_tokens:

    positive_cleaned_tokens_list.append(remove_noise(token, stopwords))

for token in negative_tweet_tokens:
    
    negative_cleaned_tokens_list.append(remove_noise(token, stopwords))

In [18]:
positive_tweet_tokens[500], positive_cleaned_tokens_list[500]

(['Dang',
  'that',
  'is',
  'some',
  'rad',
  '@AbzuGame',
  '#fanart',
  '!',
  ':D',
  'https://t.co/bI8k8tb9ht'],
 ['dang', 'rad', '@abzugame', '#fanart', ':d'])

In [19]:
def get_all_words(cleaned_tokens_list):

    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_token = get_all_words(positive_cleaned_tokens_list)

In [20]:
all_pos_token

<generator object get_all_words at 0x000001D04EE60AD0>

In [21]:
feq_dist_pos = FreqDist(all_pos_token)

In [22]:
print(feq_dist_pos.most_common(20))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253), ('u', 245), ('day', 242), ('like', 229), ('see', 195), ('happy', 192), ("i'm", 183), ('great', 175), ('hi', 173), ('go', 167), ('back', 163)]


In [23]:
def get_tweets_for_model(cleaned_tokens_list):

    for tweet_tokens in cleaned_tokens_list:
        yield dict( [token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [24]:
positive_tokens_for_model

<generator object get_tweets_for_model at 0x000001D04F5A5620>

In [25]:
positive_dataset = [(tweet_dict, "Positive")
                   for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                   for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

In [26]:
dataset

[({'#followfriday': True,
   '@france_inte': True,
   '@pkuchly57': True,
   '@milipol_paris': True,
   'top': True,
   'engage': True,
   'member': True,
   'community': True,
   'week': True,
   ':)': True},
  'Positive'),
 ({'@lamb2ja': True,
   'hey': True,
   'james': True,
   'odd': True,
   ':/': True,
   'please': True,
   'call': True,
   'contact': True,
   'centre': True,
   '02392441234': True,
   'able': True,
   'assist': True,
   ':)': True,
   'many': True,
   'thanks': True},
  'Positive'),
 ({'@despiteofficial': True,
   'listen': True,
   'last': True,
   'night': True,
   ':)': True,
   'bleed': True,
   'amazing': True,
   'track': True,
   'scotland': True},
  'Positive'),
 ({'@97sides': True, 'congrats': True, ':)': True}, 'Positive'),
 ({'yeaaaah': True,
   'yippppy': True,
   'accnt': True,
   'verify': True,
   'rqst': True,
   'succeed': True,
   'get': True,
   'blue': True,
   'tick': True,
   'mark': True,
   'fb': True,
   'profile': True,
   ':)': True,


In [27]:
random.shuffle(dataset)

In [28]:
dataset

[({'@comicbookhes': True,
   'course': True,
   'would': True,
   'like': True,
   ':)': True,
   'x': True},
  'Positive'),
 ({'♛': True,
   '》': True,
   'love': True,
   'much': True,
   'beli̇eve': True,
   'wi̇ll': True,
   'follow': True,
   'please': True,
   'justi̇n': True,
   '@justinbieber': True,
   ':(': True,
   'x15': True,
   '339': True,
   'ｓｅｅ': True,
   'ｍｅ': True},
  'Negative'),
 ({'#ff': True,
   '#happyfriday': True,
   '@sagevatic': True,
   '@marchicristian': True,
   '@bounceroriginal': True,
   '@beenfingered': True,
   '@dj_mando_off': True,
   'great': True,
   '#friday': True,
   ':-)': True},
  'Positive'),
 ({'achieving': True,
   'excellence': True,
   'music': True,
   'producer': True,
   'mind': True,
   'boggling': True,
   'fatiguing': True,
   'job': True,
   '...': True,
   'baareeq': True,
   ':)': True},
  'Positive'),
 ({'@karinatuano': True,
   'shiiiit': True,
   'kamiss': True,
   'beh': True,
   '😭': True,
   'thank': True,
   'bby': True

In [29]:
train_data = dataset[:7000]
test_data = dataset[7000:]

# Bulding Model

In [30]:
classifir = NaiveBayesClassifier.train(train_data)

In [31]:
classify.accuracy(classifir, test_data)

0.9946666666666667

In [32]:
classifir.show_most_informative_features(20)

Most Informative Features
                      :( = True           Negati : Positi =   2056.2 : 1.0
                      :) = True           Positi : Negati =   1654.1 : 1.0
                follower = True           Positi : Negati =     39.6 : 1.0
                  arrive = True           Positi : Negati =     36.9 : 1.0
                     bam = True           Positi : Negati =     23.3 : 1.0
                followed = True           Negati : Positi =     23.3 : 1.0
                     sad = True           Negati : Positi =     22.9 : 1.0
                 awesome = True           Positi : Negati =     19.3 : 1.0
                    poor = True           Negati : Positi =     15.4 : 1.0
                     via = True           Positi : Negati =     15.2 : 1.0
                 welcome = True           Positi : Negati =     14.1 : 1.0
                    blog = True           Positi : Negati =     13.9 : 1.0
               community = True           Positi : Negati =     13.2 : 1.0

In [33]:
random_tweet = 'Thank you for sending my baggage to CityX and flying me to CityY at the same time. Brilliant service. #thanksGenericAirline'


cleaning = remove_noise(word_tokenize(random_tweet))

print(cleaning)

print(classifir.classify(dict([token, True] for token in cleaning)))


['thank', 'you', 'for', 'send', 'my', 'baggage', 'to', 'cityx', 'and', 'fly', 'me', 'to', 'cityy', 'at', 'the', 'same', 'time', 'brilliant', 'service', 'thanksgenericairline']
Positive


In [35]:
import pickle

# Save the trained Naive Bayes classifier
with open("sentiment_model.pkl", "wb") as file:
    pickle.dump(classifir, file)

print("Model saved successfully as 'sentiment_model.pkl'")



Model saved successfully as 'sentiment_model.pkl'


In [36]:
with open("sentiment_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)