<a href="https://colab.research.google.com/github/BenjaminDKLuong/Colab_Notes/blob/master/Detect_Mean_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DETECT MEAN TWEETS**

In [13]:
!git clone https://github.com/PacktPublishing/Real-World-Python-Deep-Learning-Projects.git

Cloning into 'Real-World-Python-Deep-Learning-Projects'...
remote: Enumerating objects: 13503, done.
remote: Total 13503 (delta 0), reused 0 (delta 0), pack-reused 13503
Receiving objects: 100% (13503/13503), 178.97 MiB | 31.05 MiB/s, done.
Resolving deltas: 100% (85/85), done.
Checking out files: 100% (13472/13472), done.


In [0]:
# Work from the cloned repository's source folder from here on,
# so that relative paths used by later cells start from there.
import os

_SOURCE_DIR = "/content/Real-World-Python-Deep-Learning-Projects/Section 3 Code/source"
os.chdir(_SOURCE_DIR)


In [15]:
# check current working directory
%pwd

'/content/Real-World-Python-Deep-Learning-Projects/Section 3 Code/source'

In [16]:
!ls

conf.py  mean_or_not.py  prep.py      stopwords.py
data	 models		 __pycache__  train.py


## Import Stopwords

In [0]:
import nltk
# Fetch NLTK's 'stopwords' corpus; it has to be available before
# the word list is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): this rebinds the name `stopwords` from the imported
# corpus object to a plain set of the English stop words, so the corpus
# object is no longer reachable under this name after this line.
stopwords=set(stopwords.words('english'))


## Process Data

In [0]:
from os import listdir
from os import path

from stopwords import stopwords as exclude

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from pprint import pprint

def gen_x(xtext, tokenizer, max_len=None, for_training=False, ):
    """
    Encode a list of texts as padded sequences of word numbers.
    xtext - a list of sentences to encode
    tokenizer - the tokenizer used for encoding; it is fitted on
                xtext first when for_training is set
    max_len - padding length; when it is missing (or 0) it is derived
              from xtext and also stored on the tokenizer as
              _max_padding_len
    for_training - fit the tokenizer on xtext and return
                   (encoded, max_len) instead of just encoded
    """
    print("Let's tokenize!")

    # Only the training set is allowed to define the vocabulary:
    # fitting assigns a unique number to every word.
    if for_training:
        tokenizer.fit_on_texts(xtext)

    # Replace every word (token) with its number.
    sequences = tokenizer.texts_to_sequences(xtext)

    # Without an explicit length, use the word count of the longest
    # text (counted by whitespace splitting) and remember it on the
    # tokenizer, so the same length can be reused later on.
    if not max_len:
        max_len = max(len(doc.split()) for doc in xtext)
        tokenizer._max_padding_len = max_len

    # The embedding layer needs inputs of equal length, so pad every
    # sequence at its end up to max_len.
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')

    return (padded, max_len) if for_training else padded

def cleanup(w, clean_sw=True):
    """
    Normalize a single word and decide whether to keep it.
    Returns the stripped, lower-cased word, or None when the word
    should be filtered out.
    w - the word to check
    clean_sw - should we filter out stop words?
    """
    token = w.strip().lower()
    # Checked in this order: anything that is not purely alphabetic,
    # then stop words (only when requested), then one-letter words.
    rejected = (
        not token.isalpha()
        or (clean_sw and token in exclude)
        or len(token) == 1
    )
    return None if rejected else token

def clean(data, clean_sw):
    """
    Strip insignificant words and characters from every sentence.
    Returns a new list with one cleaned, space-joined sentence
    per input sentence.
    data - a list of sentences to clean
    clean_sw - should we filter out stop words?
    """
    cleaned_docs = []
    for doc in data:
        # cleanup() hands back None for every word that should go.
        checked = (cleanup(token, clean_sw) for token in doc.split())
        cleaned_docs.append(' '.join(word for word in checked if word is not None))
    return cleaned_docs

def get_data(d='data/txt_sentoken', do_cleanup=True, filter_stopwords=True):
    """
    Load all our data into memory,
    split into training and test sets,
    clean up and encode, so we can use it
    with our neural network.
    d - data directory, it has to contain a 'neg' and a 'pos'
        subdirectory with one text file per review
    do_cleanup - should we remove insignificant characters and words?
    filter_stopwords - should we remove common words?

    Returns a tuple
    (train_x, train_y, test_x, test_y, inputs, max_len, tokenizer):
    train_x/test_x are the encoded and padded texts, train_y/test_y
    are lists of labels (0 means negative, 1 positive), inputs is the
    vocabulary size plus one, max_len is the padding length and
    tokenizer is the tokenizer fitted on the training texts.
    """
    # Share of every class that goes into the training set,
    # the remaining part of the class becomes the test set.
    train_share = 0.90

    train_x = []
    train_y = []

    test_x = []
    test_y = []

    # First, load all of the data, one list of texts per class.
    # Keeping the classes apart means the split below does not rely
    # on a hard-coded number of files per class.
    print('Loading data...')
    docs_by_class = []
    for p in ['neg', 'pos']:
        class_docs = []
        # listdir() returns names in arbitrary order; sort them so
        # the train/test split is the same on every run.
        for filename in sorted(listdir(path.join(d, p))):
            dfile = path.join(d, p, filename)
            # Close every file as soon as it has been read.
            with open(dfile) as f:
                class_docs.append(f.read())
        docs_by_class.append(class_docs)

    if do_cleanup:
        print('Doing cleanup...')
        docs_by_class = [clean(class_docs, filter_stopwords)
                         for class_docs in docs_by_class]

    # Split every class on its own: the first 90% of its texts go to
    # the training set, the rest to the test set. Negative comes
    # first, so the labels are 0 for negative and 1 for positive.
    # Every label list is sized from the texts it belongs to, so
    # texts and labels always stay aligned.
    for label, class_docs in enumerate(docs_by_class):
        trainl = int(len(class_docs) * train_share)
        train_part = class_docs[:trainl]
        test_part = class_docs[trainl:]

        train_x += train_part
        train_y += [label] * len(train_part)

        test_x += test_part
        test_y += [label] * len(test_part)

    # Create a new tokenizer, we will use it for both
    # training and test data.
    tokenizer = Tokenizer()
    # Encode and pad our train and test data.
    # Keep the raw training texts around for the sample output below.
    input_train_x = train_x
    train_x, max_len = gen_x(train_x, tokenizer, for_training=True)
    test_x = gen_x(test_x, tokenizer, max_len=max_len)

    # Just show a sample of input text and encoded text.
    print('Output from tokenizer:')
    pprint(input_train_x[0][:50])
    pprint(train_x[0][:9])
    for w in input_train_x[0][:50].replace(':', '').split():
        if w in tokenizer.word_index:
            print(w, '=', tokenizer.word_index[w])
    print()

    # Get a vocabulary size (a number of unique words).
    # We will later have to use it for our Embedding layer.
    inputs = len(tokenizer.word_index) + 1
    print('Vocab size:')
    print(inputs)
    return train_x, train_y, test_x, test_y, inputs, max_len, tokenizer


if __name__ == '__main__':
    # Quick manual check: build the data set with the default settings
    # and show the first encoded training sample with its label.
    dataset = get_data()
    train_x, train_y, test_x, test_y, inputs, max_len, t = dataset
    print('X[0]', train_x[0])
    print('Y[0]', train_y[0])

## Set Up Model

In [0]:
import conf

from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D

import math
import os
import sys
import pandas
import pickle

from prep import get_data

def get_text_cnn(inputs, max_length, dim=25):
    """
    Build (but do not compile) a small 1D convolutional text classifier.
    inputs - vocabulary size, a number of unique words in
             our data set
    max_length - the length of every (padded) input sequence
    dim - word embedding dimension, the length of the word vector
          that will be produced by the embedding layer
    """
    print('CNN: inputs: %d, word embeddings dimesions: %d, input_length: %d' % (inputs, dim, max_length))
    layers = [
        # Turn every word number into a vector of `dim` values.
        Embedding(inputs, dim, input_length=max_length),
        # Extract feature maps/most common "phrases".
        Conv1D(filters=32, kernel_size=5, activation='relu', padding='same'),
        # Keep the strongest responses only, pooling reduces the size.
        MaxPooling1D(pool_size=4),
        # Put everything together into one vector.
        Flatten(),
        # A single sigmoid unit: the standard output for
        # our two classes 0 and 1.
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

# Available model configurations by name; 'model' is the function
# that builds the network.
confs = {'default': {'model': get_text_cnn}}

def train_model(name, train_x, train_y, epochs, batches, inputs, max_lenght, test_x, test_y):
    """
    Build, compile and train the model configured under `name`.
    name - a key of `confs` selecting the model to build
    train_x, train_y - training inputs and labels
    epochs, batches - number of epochs and the batch size for fitting
    inputs, max_lenght - passed on to the model building function
    test_x, test_y - used as validation data during training
    Returns (model, name, parameters of the chosen configuration).
    """
    mparams = confs[name]
    net = mparams['model'](inputs, max_lenght)
    # Binary classification: one output, cross-entropy loss.
    net.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    # Train on the training data and validate on the test data
    # while training.
    net.fit(
        train_x,
        train_y,
        validation_data=(test_x, test_y),
        epochs=epochs,
        batch_size=batches,
        verbose=2,
    )
    return net, name, mparams

def get_params(script='train.py'):
    """
    Get command line parameters.
    Reads model_name, epochs and batch_size from sys.argv[1:4] and
    returns them as (name, epochs, batches) with the two numbers
    converted to int. When an argument is missing or a number is not
    an integer, the usage message is printed and the program exits
    with status 1.
    script - not used, kept so existing callers keep working
    """
    try:
        name, epochs, batches = sys.argv[1:4]
        # Convert inside the try block, so a non-numeric value ends in
        # the usage message as well instead of a traceback.
        epochs, batches = int(epochs), int(batches)
    except ValueError:
        print('Usage: %s model_name epochs batch_size' % sys.argv[0])
        # sys.exit is always available, the builtin exit() is only
        # there when the site module has been loaded.
        sys.exit(1)
    return name, epochs, batches

if __name__ == '__main__':
    # Model name, number of epochs and batch size come from
    # the command line.
    name, epochs, batches = get_params()
    # Load, clean and encode the data; `t` is the fitted tokenizer.
    train_x, train_y, test_x, test_y, inputs, max_length, t = get_data(do_cleanup=True, filter_stopwords=True)
    print('Train/Test Data lenght', len(train_x), len(test_x))
    model, name, mp = train_model(name, train_x, train_y, epochs, batches, inputs, max_length, test_x, test_y)
    title = '%s (epochs=%d, batch_size=%d)' % (name, epochs, batches)
    # Store the model together with its tokenizer, both are needed
    # for classification later on.
    mname = 'models/model-%s-%d-%d' % (name, epochs, batches)
    model.save(mname+'.h5')
    with open(mname+'-tokenizer.pickle', 'wb') as ts:
        pickle.dump(t, ts)
    # Evaluate on data the model has seen (training set)
    # and on data it has not seen (test set).
    print('Evaluation for %s' % title)
    for report, xs, ys in (
        ('Train Accuracy: %.2f%%', train_x, train_y),
        ('Test Accuracy: %.2f%%', test_x, test_y),
    ):
        loss, acc = model.evaluate(xs, ys, verbose=2)
        print(report % (acc*100))

## Predict

In [0]:
from train import get_params, confs
from prep import clean

from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import os
import pickle
import sys

# Classify tweets typed on standard input as mean / not mean, using a
# model and tokenizer that were saved earlier by train.py.
if __name__ == '__main__':
    # Same parameters as for training: model_name epochs batch_size.
    name, epochs, batches=get_params()
    # Refuse unknown model names right away.
    if name not in confs:
        print("Unknown model '%s', available models: %s" % (name, ', '.join(sorted(confs))))
        sys.exit(1)
    mname='models/model-%s-%d-%d' % (name, epochs, batches)
    model_file=mname+'.h5'
    tokenizer_file=mname+'-tokenizer.pickle'
    # Loading the model.
    # Without a model there is nothing to predict with, so stop here
    # instead of failing later on the first tweet.
    if not os.path.exists(model_file):
        print("Can't find %s model, train it first using 'train.py %s %d %d'" % (mname, name, epochs, batches))
        sys.exit(1)
    model=load_model(model_file)
    print('Model loaded!')
    # Loading tokenizer.
    # We need to use the same tokenizer that we've used
    # for training and testing to get the same encoding
    # for known words in our vocabulary and also in
    # the word embedding that we've created during the training.
    if not os.path.exists(tokenizer_file):
        print("Can't find tokenizer for %s model, train it first using 'train.py %s %d %d'" % (mname, name, epochs, batches))
        sys.exit(1)
    # NOTE(review): unpickling runs code stored in the file, only load
    # tokenizer files that you have created yourself.
    with open(tokenizer_file, "rb") as tf:
        tokenizer=pickle.load(tf)
    print('Tokenizer loaded!')
    # Get the tweets.
    print("Type in one tweet per line and hit CTRL-D when you're done:")
    for tweet in sys.stdin.readlines():
        # Cleanup the tweet before we use our model.
        t=clean([tweet], True)
        # Encode and pad our tweet with the same tokenizer
        # that we've used for training and testing.
        # The padding length is read from tokenizer._max_padding_len,
        # our own attribute that is expected to be set on the
        # tokenizer during training.
        t=tokenizer.texts_to_sequences(t)
        t=pad_sequences(t, maxlen=tokenizer._max_padding_len, padding='post')
        # The single sigmoid output is the probability of class 1,
        # which in our case is the probability of a tweet being positive.
        # One predict() call is enough for both values we print, and it
        # also works with Keras versions that no longer provide
        # predict_classes() and predict_proba().
        prob=model.predict(t)
        prob=prob[0][0]
        # Predicted class: 0 for a negative (mean) tweet, 1 for a
        # positive one, using the usual 0.5 threshold.
        pc=1 if prob > 0.5 else 0
        # The probability of a tweet being mean is just 1-prob.
        print('%s -%smean (%.2f%%)' % (tweet.rstrip(), (' ' if pc==0 else ' not '),(1-prob)*100))