<a href="https://colab.research.google.com/github/BrendaGilisho/ml-predict/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so the files under '/content/drive' used
# by the later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np # for math and arrays
import pandas as pd # DataFrame support for loading the CSV data

In [None]:
# Read the tweets CSV from the mounted Drive into a DataFrame.
filename = '/content/drive/My Drive/ML/test.csv'
data = pd.read_csv(filename)

In [None]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [None]:
import re
import string
from os import listdir
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from numpy import array

In [None]:
def load_doc(filename):
    '''Load a text file and return its whole content as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The text of the file, decoded with the platform default encoding.

    Raises:
        OSError: if the file cannot be opened.
    '''
    # The context manager closes the handle even when read() raises, which
    # a separate open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [None]:
def clean_doc(doc):
    '''Tokenise raw text and keep only the informative words.

    The text is split on whitespace and punctuation characters are removed
    from every token. A token survives only if it is then purely
    alphabetic, is not in NLTK's English stop word list and is longer than
    one character. Letter case is left untouched.

    Returns the surviving tokens as a list, in their original order.
    '''
    # character class matching any single character of string.punctuation
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = (punct_pattern.sub('', raw) for raw in doc.split())
    alphabetic = [tok for tok in stripped if tok.isalpha()]
    english_stops = set(stopwords.words('english'))
    return [
        tok for tok in alphabetic
        if tok not in english_stops and len(tok) > 1
    ]

In [None]:
def doc_to_line(filename, vocab):
    '''Read one document and render it as a single space-separated line.

    The file is loaded with load_doc and tokenised with clean_doc; only
    the tokens that are members of vocab are kept, in their original order.
    '''
    kept = (token for token in clean_doc(load_doc(filename)) if token in vocab)
    return ' '.join(kept)

In [None]:
def process_docs(directory, vocab, is_train):
    '''Convert the selected files of a directory into cleaned text lines.

    When is_train is truthy, files whose name starts with 'tweet' are
    skipped; otherwise only those files are processed. Every selected file
    is turned into one line by doc_to_line.

    Returns a list with one line per selected file, in listdir order.
    '''
    # a file is kept when its "starts with 'tweet'" flag matches the split
    want_tweet_files = not is_train
    return [
        doc_to_line(directory + '/' + entry, vocab)
        for entry in listdir(directory)
        if entry.startswith('tweet') == want_tweet_files
    ]

In [None]:
def load_clean_dataset(vocab, is_train,
                       neg_dir='/content/drive/My Drive/ML/twitter',
                       pos_dir='/content/drive/My Drive/ML/twitter'):
    '''Load, clean and label the negative and positive documents.

    Args:
        vocab: collection of allowed tokens, forwarded to process_docs.
        is_train: forwarded to process_docs to pick the train or test files.
        neg_dir: directory whose documents receive label 0.
        pos_dir: directory whose documents receive label 1.

    Returns:
        (docs, labels): the negative lines followed by the positive lines,
        and a numpy array holding a 0 per negative and a 1 per positive line.

    Both directories default to the same folder so existing two-argument
    calls keep working. With identical directories every document is loaded
    twice, once per label, so a warning is emitted in that case.
    '''
    if neg_dir == pos_dir:
        import warnings
        warnings.warn(
            'neg_dir and pos_dir are the same directory: every document is '
            'loaded with both label 0 and label 1',
            stacklevel=2,
        )
    neg = process_docs(neg_dir, vocab, is_train)
    pos = process_docs(pos_dir, vocab, is_train)
    docs = neg + pos
    # prepare labels: zeros for the negatives, then ones for the positives
    labels = array([0] * len(neg) + [1] * len(pos))
    return docs, labels

In [None]:
def define_model(n_words):
    '''Build and compile the classifier network.

    The network accepts vectors of length n_words, feeds them through a
    50-unit ReLU layer and ends in one sigmoid output unit. It is compiled
    with binary cross-entropy loss, the 'adam' optimizer and accuracy as
    the reported metric.
    '''
    network = Sequential()
    hidden_layer = Dense(50, input_shape=(n_words,), activation='relu')
    output_layer = Dense(1, activation='sigmoid')
    for layer in (hidden_layer, output_layer):
        network.add(layer)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [None]:
def evaluate_model(Xtrain, ytrain, Xtest, ytest, n_repeats=10, epochs=10):
    '''Train and score a freshly defined network several times.

    Args:
        Xtrain, ytrain: training matrix and labels passed to model.fit.
        Xtest, ytest: test matrix and labels passed to model.evaluate.
        n_repeats: number of independent train/evaluate runs.
        epochs: number of training epochs per run.

    Returns:
        A list with the test accuracy of every run, in run order. The
        accuracy of each run is also printed as it completes.
    '''
    # the input width of the network is the number of columns of the matrix
    n_words = Xtest.shape[1]
    scores = []
    for run in range(1, n_repeats + 1):
        # define network
        model = define_model(n_words)
        # fit network
        model.fit(Xtrain, ytrain, epochs=epochs, verbose=0)
        # evaluate
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % (run, acc))
    # hand the scores back; a bare return would discard them
    return scores

In [None]:
def prepare_data(train_docs, test_docs, mode):
    '''Encode the train and test documents as bag-of-words matrices.

    A new Tokenizer is fitted on train_docs only and then used to encode
    both document lists with texts_to_matrix in the requested mode.

    Returns the pair (Xtrain, Xtest).
    '''
    bow_tokenizer = Tokenizer()
    # the vocabulary is learned from the training documents only
    bow_tokenizer.fit_on_texts(train_docs)
    # encode the training set first, then the test set
    encoded_train, encoded_test = (
        bow_tokenizer.texts_to_matrix(docs, mode=mode)
        for docs in (train_docs, test_docs)
    )
    return encoded_train, encoded_test

In [None]:
def predict_sentiment(review, vocab, tokenizer, model):
    '''Classify one piece of text with a fitted tokenizer and model.

    The text is cleaned with clean_doc, reduced to the tokens found in
    vocab, encoded as a single 'binary' matrix row and passed to
    model.predict. The score is read from position [0, 0] of the result.

    Returns a (confidence, label) pair: when the score rounds to 0 the
    label is 'Depression' with confidence 1 - score, otherwise the label is
    'No Depression' with the score itself as confidence.
    '''
    known_words = ' '.join(tok for tok in clean_doc(review) if tok in vocab)
    features = tokenizer.texts_to_matrix([known_words], mode='binary')
    prediction = model.predict(features, verbose=0)
    positive_score = prediction[0, 0]
    if round(positive_score) != 0:
        return positive_score, 'No Depression'
    return (1 - positive_score), 'Depression'

In [None]:
def create_tokenizer(lines):
    '''Return a new Tokenizer that has been fitted on the given lines.'''
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
def add_doc_to_vocab(filename, vocab):
    '''Add the cleaned tokens of one file to vocab, in place.

    The file is read with load_doc and tokenised with clean_doc; the
    resulting tokens are handed to vocab.update(). Nothing is returned.
    '''
    vocab.update(clean_doc(load_doc(filename)))

In [None]:
def save_list(lines, filename):
    '''Write the given strings to a file, one per line.

    Args:
        lines: iterable of strings; they are joined with newline characters
            and no trailing newline is added.
        filename: path of the file to create or overwrite.

    Raises:
        OSError: if the file cannot be opened for writing.
    '''
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even when write()
    # raises, so a failed write cannot leak the file descriptor.
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
def process_docs_to_vocab(directory, vocab):
    '''Add every ".txt" file found directly in directory to vocab.

    Entries returned by listdir whose name does not end with ".txt" are
    ignored; each remaining file is passed to add_doc_to_vocab together
    with vocab. Nothing is returned.
    '''
    text_files = (name for name in listdir(directory) if name.endswith(".txt"))
    for name in text_files:
        # build the full path of the file before handing it over
        add_doc_to_vocab(directory + '/' + name, vocab)

In [None]:
# Read the vocabulary file and keep its whitespace-separated tokens as a set
vocab_filename = '/content/drive/My Drive/ML/twitter/test.txt'
vocab = set(load_doc(vocab_filename).split())

In [None]:
# Fetch the NLTK stop word corpus; clean_doc reads it through
# stopwords.words('english'), so this has to run before any document is cleaned.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Build the cleaned documents and labels for the training and the test split
train_docs, ytrain = load_clean_dataset(vocab, is_train=True)
test_docs, ytest = load_clean_dataset(vocab, is_train=False)

In [None]:
# Fit the tokenizer on the training documents only.
tokenizer = create_tokenizer(train_docs)


In [None]:
import pickle

In [None]:
# saving the tokenizer for predict function later; the with-block closes the
# file so the pickle is completely written before the cell finishes
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Preview the first ten word indices of the first training document.
# texts_to_sequences takes the list of texts itself: wrapping train_docs in
# another list makes every document count as one unknown token, which is why
# the preview came back empty.
sequence_data = tokenizer.texts_to_sequences(train_docs)[0]
sequence_data[:10]

[]

In [None]:
#encode data
Xtrain = tokenizer.texts_to_matrix(train_docs, mode='binary')
Xtest = tokenizer.texts_to_matrix(test_docs, mode='binary')

In [None]:
#define network
n_words = Xtrain.shape[1]
model = define_model(n_words)

In [None]:
#fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

Epoch 1/10
1/1 - 0s - loss: 0.7527 - accuracy: 0.5000 - 462ms/epoch - 462ms/step
Epoch 2/10
1/1 - 0s - loss: 5.5806 - accuracy: 0.5000 - 11ms/epoch - 11ms/step
Epoch 3/10
1/1 - 0s - loss: 2.4841 - accuracy: 0.5000 - 10ms/epoch - 10ms/step
Epoch 4/10
1/1 - 0s - loss: 1.1041 - accuracy: 0.5000 - 11ms/epoch - 11ms/step
Epoch 5/10
1/1 - 0s - loss: 0.8080 - accuracy: 0.5000 - 15ms/epoch - 15ms/step
Epoch 6/10
1/1 - 0s - loss: 1.9269 - accuracy: 0.5000 - 14ms/epoch - 14ms/step
Epoch 7/10
1/1 - 0s - loss: 1.3830 - accuracy: 0.5000 - 12ms/epoch - 12ms/step
Epoch 8/10
1/1 - 0s - loss: 1.0263 - accuracy: 0.5000 - 11ms/epoch - 11ms/step
Epoch 9/10
1/1 - 0s - loss: 1.5420 - accuracy: 0.5000 - 11ms/epoch - 11ms/step
Epoch 10/10
1/1 - 0s - loss: 0.7198 - accuracy: 0.5000 - 12ms/epoch - 12ms/step


<keras.callbacks.History at 0x7f426e600a60>

In [None]:
# Persist the trained model to 'model.h5' in the current working directory.
model.save( 'model.h5' )

In [None]:
# Smoke-test the trained model on two short example texts
for text in ('Feeling sad', 'Joy'):
    percent, sentiment = predict_sentiment(text, vocab, tokenizer, model)
    print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

Review: [Feeling sad]
Sentiment: Depression (50.062%)
Review: [Joy]
Sentiment: Depression (50.537%)
