In [None]:
# !pip install nltk
import nltk
import numpy as np
import pandas as pd

In [None]:
# Download the NLTK stopwords corpus; process_tweets reads its English list via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer


##### **Data Acquisition**

In [None]:
# Download the NLTK twitter_samples corpus, which supplies the positive/negative tweet files loaded below.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [None]:
from nltk.corpus import twitter_samples
# List the JSON files that ship with the twitter_samples corpus.
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

##### **Data Exploration**

In [None]:
# strings() returns only the text field of each tweet. Read the two sentiment
# files; 'tweets.20150430-223406.json' is deliberately left unused.
all_pos_tweets, all_neg_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

# Report how many tweets each class contains.
print('Number of Positive tweets', len(all_pos_tweets))
print('Number of Negative tweets', len(all_neg_tweets))


Number of Positive tweets 5000
Number of Negative tweets 5000


##### **Data Preprocessing**

In [None]:
def process_tweets(tweet):
  """Clean a raw tweet and return its stemmed tokens.

  Steps, in order: strip stock tickers (e.g. $GE), a leading old-style "RT"
  marker, hyperlinks and '#' signs; tokenize with TweetTokenizer
  (preserve_case=False, strip_handles=True, reduce_len=True); drop English
  stopwords and punctuation tokens; Porter-stem every remaining token.

  Args:
    tweet: Raw tweet text (str).

  Returns:
    List of cleaned, stemmed tokens in their original order.
  """
  # A set gives constant-time membership tests inside the token filter below.
  stopwords_english = set(stopwords.words('english'))
  stemmer = PorterStemmer()

  # remove stock market tickers like $GE
  tweet = re.sub(r'\$\w*', '', tweet)

  # remove old style retweet text "RT"
  tweet = re.sub(r'^RT[\s]+', '', tweet)

  # remove hyperlinks -- match only the URL itself, up to the next whitespace.
  # The earlier greedy pattern (".*" after the scheme) also deleted every word
  # that followed the link on the same line.
  tweet = re.sub(r'https?://\S*', '', tweet)

  # remove hashtags
  # only removing the hash # sign from the word
  tweet = re.sub(r'#', '', tweet)

  # tokenize tweets
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  tweet_tokens = tokenizer.tokenize(tweet)

  # Keep tokens that are neither stopwords nor punctuation, stemming each one.
  return [
      stemmer.stem(word)
      for word in tweet_tokens
      if word not in stopwords_english and word not in string.punctuation
  ]


In [None]:
# Two examples: a hand-written tweet and one taken from the positive corpus.
tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np Good"
sample_tweet = all_pos_tweets[5]

# Show the raw corpus tweet first ...
print(sample_tweet)

# ... then the cleaned tokens of both examples, hand-written one first.
for example in (tweet, sample_tweet):
  print(process_tweets(example))


@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM
['hello', 'great', 'day', ':)', 'good', 'morn']
['one', 'irresist', ':)', 'flipkartfashionfriday']


**Prepare the data for our model**

In [None]:
# The first 4000 tweets of each class are used for training, the rest for testing.
train_pos, test_pos = all_pos_tweets[:4000], all_pos_tweets[4000:]
train_neg, test_neg = all_neg_tweets[:4000], all_neg_tweets[4000:]

# Positives come first, then negatives, in both feature lists.
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# Column-vector targets in the same order as the features:
# 1 for every positive tweet followed by 0 for every negative tweet.
train_y = np.concatenate(
    (np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))), axis=0
)
test_y = np.concatenate(
    (np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))), axis=0
)

In [None]:
# Run every training tweet through process_tweets, keeping the original order.
cleaned_data = [process_tweets(raw_tweet) for raw_tweet in train_x]

##### **Sequential Models**

In [None]:
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from keras.layers import  Dense, Embedding, LSTM, GRU
from keras.preprocessing.text import Tokenizer

In [None]:
# Build the token -> integer-id vocabulary from the cleaned training tweets.
t = Tokenizer()
t.fit_on_texts(cleaned_data)

# Encode every cleaned training tweet as a list of vocabulary ids.
sequences = t.texts_to_sequences(cleaned_data)

# Show only a short preview: printing every encoded tweet floods the notebook output.
print("The sequences generated from text are : ", sequences[:5])

The sequences generated from text are :  [[345, 220, 934, 398, 285, 51, 2], [84, 716, 1403, 559, 13, 157, 654, 1734, 3486, 346, 3487, 2, 150, 3], [286, 73, 77, 2, 1735, 149, 1044, 1736], [451, 2], [3488, 3489, 2271, 2272, 3490, 3491, 42, 588, 3492, 1737, 849, 1738, 2, 589, 14], [25, 2273, 2, 475], [11, 80, 8, 1179, 70, 124, 26, 115, 32, 58, 3493, 2], [655, 158, 206, 233, 22, 2274, 2, 39, 528, 717, 1180, 2275, 307, 399], [1045, 15, 3494, 7, 656], [850, 3495, 157, 3496, 3497, 3498, 3499, 330, 3500, 52, 172, 318, 3501, 57], [345, 220, 476, 285, 51, 2], [8, 178, 6, 1739, 6, 477, 2], [4], [416, 125, 21, 296, 70, 2], [40, 39, 434, 2276, 3502, 718, 772, 935, 3503, 1046, 12, 1047, 2], [1181, 773, 590, 1048, 1740, 34, 8, 188, 5], [1741, 657, 6, 3504, 2, 3505, 3506, 3507], [560, 3508, 3, 83, 7, 400, 1049, 27], [31, 2277, 2], [134, 41, 21, 169, 16, 2278, 173, 197, 116, 71, 2, 141, 81], [345, 220, 39, 4, 285, 51, 2], [75, 529, 851, 591, 279, 2279, 3509, 3510, 3511, 109, 1182, 2], [110, 719, 85, 53

In [None]:
# Length of the longest encoded training tweet; used as the padding width.
max_len = max(map(len, sequences))
max_len

51

In [None]:
# Distinct token ids that occur anywhere in the encoded training tweets.
unique_words = {token_id for encoded in sequences for token_id in encoded}
n_unique_words = len(unique_words)
n_unique_words

9086

In [None]:
# Pad each training sequence at the end ('post') up to max_len so all rows have the same width.
X_train = sequence.pad_sequences(sequences, maxlen=max_len, padding='post')
# Display the padded matrix.
X_train

array([[ 345,  220,  934, ...,    0,    0,    0],
       [  84,  716, 1403, ...,    0,    0,    0],
       [ 286,   73,   77, ...,    0,    0,    0],
       ...,
       [  36,   33,  246, ...,    0,    0,    0],
       [  41,  163,   30, ...,    0,    0,    0],
       [  42,  962,  188, ...,    0,    0,    0]], dtype=int32)

In [None]:
# The test tweets must go through the same cleaning as the training tweets
# (process_tweets) before encoding. The tokenizer vocabulary was fitted on
# cleaned, stemmed tokens, so raw strings would map to different or unknown ids.
cleaned_test = [process_tweets(raw_tweet) for raw_tweet in test_x]
tokenized_test = t.texts_to_sequences(cleaned_test)

# Pad to the same width as the training matrix.
X_test = sequence.pad_sequences(tokenized_test, maxlen=max_len, padding='post')

In [None]:
# Embedding -> LSTM -> single sigmoid unit for binary sentiment classification.
model = Sequential([
    # One more row than there are distinct token ids, leaving room for the 0
    # used as padding (assumes ids run 1..n_unique_words -- TODO confirm).
    Embedding(n_unique_words+1, 128, input_length=max_len),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 51, 128)           1163136   
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,294,849
Trainable params: 1,294,849
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for two epochs on the padded training tweets; keep fit()'s return value in `history`.
history = model.fit(x=X_train, y=train_y, epochs=2)

Epoch 1/2
Epoch 2/2


In [None]:
# Evaluate on the test matrix; the model was compiled with one loss and one
# metric, so the result is [loss, accuracy].
scores = model.evaluate(x=X_test, y=test_y)
scores



[0.7518969178199768, 0.5874999761581421]

In [None]:
# Download the NLTK Gutenberg corpus whose file ids and raw text are inspected in the following cells.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [None]:
# Collect the file ids available in the Gutenberg corpus and display them.
books_names = nltk.corpus.gutenberg.fileids()
books_names

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [None]:
# Preview only the opening of the text: raw() returns the whole file as one
# string, which is far too long to display as cell output.
nltk.corpus.gutenberg.raw("austen-emma.txt")[:500]

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.\n\nShe was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister\'s marriage,\nbeen mistress of his house from a very early period.  Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.\n\nSixteen years had Miss Taylor been in Mr. Woodhouse\'s family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.  Between _them_ it was more the intimacy\nof sisters.  Even before Miss Taylor had ceased to hold the nominal\noffice o

In [None]:
# Gutenberg file ids grouped under the "fiction" genre.
fiction_genre = [
    'burgess-busterbrown.txt',
    'carroll-alice.txt',
    'chesterton-ball.txt',
    'chesterton-thursday.txt',
    'melville-moby_dick.txt',
]