In [312]:
# run the commands below in terminal to install dependencies
# pip install spacy
# python -m spacy download en
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string

nlp_sw = English()
nlp_n = spacy.load("en_core_web_sm")

removable_char = list(string.punctuation) + ["\n", "\r"]

# text arguement in string
# remove stop words of the provided text
# return a list of words in small letters in the text without stop words and punctuations
def remove_stop_words(text):
    text = text.lower()
    token_doc = nlp_sw(text)
    clean_text = []
    for token in token_doc:
        token_text = token.text
        token_text = token_text.replace(" ", "")
        lexeme = nlp_sw.vocab[token_text]
        print (lexeme.is_stop)
        if not(lexeme.is_stop or token_text in removable_char):
            if token_text and not(token_text.isspace()):
                tmp_text = token_text[1:] if token_text[0] in removable_char else token_text
                if tmp_text and not(tmp_text.isspace()):
                    clean_text.append(tmp_text)
    return clean_text

# text arguement in string
# normalise the text
# return a list of words in small letters in the normalised text
def normalise_text(text):
    text = text.lower()
    token_doc = nlp_n(text)
    normalised_text = []
    for token in token_doc:
        lemma = token.lemma_
        if not(lemma == "-PRON-"):
            normalised_text.append(lemma)
    return normalised_text

In [313]:
text = "Social media platforms are becoming an integral part of people’s life. They reflect the user’s personal life. People like to share happiness, joy, and sadness on social media. These platforms are used for researchers to identify the causes of depression and detect it."

normalise_text(" ".join(remove_stop_words("not happy")))

True
False


['happy']

In [314]:
remove_stop_words("not happy")

True
False


['happy']

In [210]:
# reading the csv file
import pandas as pd
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding = "ISO-8859-1")
df

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [211]:
#checking the strucuture of the dataset
for key in df :
    print(key)


0
1467810369
Mon Apr 06 22:19:45 PDT 2009
NO_QUERY
_TheSpecialOne_
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D


In [212]:
# removing unwanted columns of dataset
df = df[['0',"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"]]
df

Unnamed: 0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [213]:
#change name of Columns
df.columns=['Label','Sentences']

In [214]:
#save file to csv
df.to_csv('training_data.csv',index=False)

In [215]:
import pandas as pd
df = pd.read_csv('training_data.csv')


In [216]:
df

Unnamed: 0,Label,Sentences
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [217]:
# 0 = negative, 2 = neutral, 4 = positive
data_negative = df[df.Label == 0][:10000]
data_positive = df[df.Label == 4][:10000]
data_positive["Label"] = 1
df_row_merged = pd.concat([data_negative, data_positive], ignore_index=True)
df_row_merged

Unnamed: 0,Label,Sentences
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
19995,1,Morning! I have slacked for two days in twitte...
19996,1,@bensummers Isn't that sweet of them.... Altru...
19997,1,"@jakrose Um, milk *fathers* don't have udders...."
19998,1,@zenaweist They could also tweet @BeccaRoberts


In [228]:
df_row_merged = df_row_merged.sample(frac=1).reset_index(drop=True)
df_row_merged

Unnamed: 0,Label,Sentences
0,0,itch gamble
1,0,want home
2,0,nt know m uk nt jealous
3,1,omg miley cyrus say good morning america wakes...
4,1,sound cool find let knowplease
...,...,...
19995,1,excited finally website run
19996,0,End of the night and work week. Wahoo!!! Long...
19997,1,sweet begun break news world go end aha oh shi...
19998,1,@jonconnelly What can I say?!


In [219]:
training_data = df_row_merged[:int(len(df_row_merged)*0.7)]
test_data = df_row_merged[int(len(df_row_merged)*0.7):]
training_data


Unnamed: 0,Label,Sentences
0,1,@cactopus I always eat :S It's the high metabo...
1,1,"More tears, just watched Tru Confessions. It'..."
2,1,I like saving money
3,0,Just woke up. I need more sleep. Funeral at 12
4,0,Ecg and doctor appointment this morning. 9 hou...
...,...,...
13995,1,im glad you had a good time i wntd to do some...
13996,1,@sk8mate It's today!! See you soon
13997,0,@Midgley LOL yeah it has. It's now raining ju...
13998,1,@philbolsta nice to see someone in the TC is u...


In [220]:
test_data

Unnamed: 0,Label,Sentences
14000,1,Cool! Now I have my own leaping theme music! ...
14001,1,Graham Coxon on Britpop: &quot;once the Guardi...
14002,1,"@ragsmadison Ongina, perhaps? (contestant in R..."
14003,0,@Steinsgrrl Sarah and I think it looks like so...
14004,0,Someone gave me a biscotti that tastes like it...
...,...,...
19995,0,i need a pick me up. probably should be tackli...
19996,0,@greggrunberg hey you said matt was gonna go a...
19997,1,"well in that case, you know what i mean, hahah..."
19998,1,@Hayvock good luck man i hope you get the jop


In [221]:
# remove url in tweets
# remove tag (@) in tweets
# remove hashtags (#) in tweets
# remove stopwords
# normalise text
# this cell might take some time to run, be patient
import re
import string

url_pattern = re.compile(".*https?:\/\/")
hashtag_pattern = re.compile("#[a-z\d-]+")
tag_pattern = re.compile("^@")
punc = string.punctuation
punc = punc.replace("-", "")
punc += "’“”"

def process_text(text):
    tmp_no_url_hashtag = []
    for word in re.split(' |\r|\n', text):
        if url_pattern.match(word) or tag_pattern.match(word) or hashtag_pattern.match(word):
            continue
        else:
            clean_word = word.translate(str.maketrans("","", punc))
            tmp_no_url_hashtag.append(clean_word)
    tmp = " ".join(tmp_no_url_hashtag)
    without_stopwords = " ".join(remove_stop_words(tmp))
    processed = " ".join(normalise_text(without_stopwords))
    return processed

for i in range (len(training_data["Sentences"])):
    training_data["Sentences"][i] = process_text(training_data["Sentences"][i])

In [222]:
training_data

Unnamed: 0,Label,Sentences
0,1,eat s high metabolism oh haha
1,1,tears watch tru confession probably favourite ...
2,1,like save money
3,0,woke need sleep funeral 12
4,0,ecg doctor appointment morning 9 hour sleep sick
...,...,...
13995,1,m glad good time wntd nice u r u
13996,1,today soon
13997,0,lol yeah rain think u like know btw nt ur ipho...
13998,1,nice tc


In [226]:
validation_data = training_data[:2000]
training_data = training_data[2000:]

In [232]:
train_x_val = np.asarray(training_data["Sentences"])
train_y_val = np.asarray(training_data["Label"])
valid_x_val = np.asarray(validation_data["Sentences"])
valid_y_val = np.asarray(validation_data["Label"])

In [245]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
# hub_layer(training_data["Sentences"][:3])

In [267]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_7 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense_28 (Dense)             (None, 16)                336       
_________________________________________________________________
dense_29 (Dense)             (None, 10)                170       
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 11        
Total params: 400,537
Trainable params: 400,537
Non-trainable params: 0
_________________________________________________________________


In [268]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [269]:
history = model.fit(train_x_val,
                    train_y_val,
                    epochs=20,
                    validation_data=(valid_x_val, valid_y_val),
                    verbose=1)

Train on 12000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [270]:
test_x_val = np.asarray(test_data["Sentences"])
test_y_val = np.asarray(test_data["Label"])

In [271]:
results = model.evaluate(test_x_val, test_y_val)



In [243]:
print(results)

[1.2196894877751667, 0.5975]


In [307]:
predict = model.predict(["not happy"])

In [308]:
predict[0]

array([0.99999964], dtype=float32)

In [302]:
test_data["Sentences"][14003]

'@Steinsgrrl Sarah and I think it looks like something from a horror movie--SO creepy   Im emailing you right now!'

In [303]:
test_data["Label"][14003]

0

In [304]:
process_text("I am not happy")

'happy'