In [1]:
import pandas as pd
import re

In [2]:
#first let's take a look at a general trend in the average score of reviews over time:
df=pd.read_csv("./data/preprocessed_data.csv")

#convert the dates into datetime:
df["Dates"]= pd.to_datetime(df["Dates"])

In [3]:
df.drop(["Dates", "Unnamed: 0"], axis=1, inplace=True)

In [4]:
import re #to filter symbols out
from gensim.parsing.preprocessing import remove_stopwords #to remove common words
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

lemmatizer = WordNetLemmatizer()

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
remove_chars=re.compile('/[!@#$%^&*]/g')
def clean_text(text: str) -> str:
    """
        text: a string
        
        return: modified initial string with non ascii characters, other special characters and stop words removed.
        Words are also converted to lower case and lemmatized.
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text=remove_chars.sub(" ", text)
    text = remove_stopwords(text)
    lemmatizer.lemmatize("rocks")
    return text

In [5]:
df['Comments'] = df['Comments'].apply(clean_text)

In [6]:
def remove_non_ascii(string: str) -> str:
    return ''.join(char for char in string if ord(char) < 128)

In [7]:
df['Comments'] = df['Comments'].apply(remove_non_ascii)

In [8]:
df.head()

Unnamed: 0,Comments,Customer Service,Satisfaction Reviews,Speed Reviews,Reliability Reviews,Average Score
0,moved uk end august got virgin media broadband...,1.0,1.0,1.0,1.0,1.0
1,truly attrocious service terms broadband custo...,1.0,1.0,1.0,1.0,1.0
2,hard cancel contract. phone 2 hours t o spend ...,1.0,1.0,2.0,2.0,1.5
3,pay 350mbps package managed 250mbps upload 34 ...,1.0,1.0,3.0,2.0,1.75
4,worst customer service: -the bots ask irreleva...,1.0,1.0,3.0,2.0,1.75


In [9]:
df.drop(list(df.columns)[1:5], axis=1, inplace=True)
df['Average Score']=df['Average Score'].round()
df.head()


Unnamed: 0,Comments,Average Score
0,moved uk end august got virgin media broadband...,1.0
1,truly attrocious service terms broadband custo...,1.0
2,hard cancel contract. phone 2 hours t o spend ...,2.0
3,pay 350mbps package managed 250mbps upload 34 ...,2.0
4,worst customer service: -the bots ask irreleva...,2.0


In [10]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np

In [11]:
from sklearn.utils import shuffle
df = shuffle(df)

In [12]:
df["Average Score"].value_counts()

2.0    2819
1.0    1227
3.0     275
4.0      21
Name: Average Score, dtype: int64

In [13]:
train, val, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9 * len(df))])

In [14]:


def df_to_dataset(data_df, shuffle=True, batch_size=10):
    df=data_df.copy()
    labels=df.pop("Average Score")
    df = df["Comments"]
    data_set = tf.data.Dataset.from_tensor_slices((df, labels))#create a tf data.dataset object with data and its labels
    if shuffle:
        data_set=data_set.shuffle(buffer_size=len(data_df))#shuffle data
    data_set = data_set.batch(batch_size) #split dataset into batches of 10 and repeat process for num of epochs
    data_set = data_set.prefetch(tf.data.AUTOTUNE)
    return data_set



In [15]:


train= df_to_dataset(train)
valid= df_to_dataset(val)
test= df_to_dataset(test)



In [16]:
list(train)[0]  #tensor object converted to a list

(<tf.Tensor: shape=(10,), dtype=string, numpy=
 array([b'called virgin media problem broadband speeds i\'m paying vip bundle package 80 pound month told there\'s wrong broadband wasn\'t willing help fix problem kept telling thing wouldn\'t let speak kept talking telling don\'t understand wifi works. i\'m young adult know wifi works exactly. said said "well know wifi fix problem" i\'m running speed tests reading 40mbps download speed upload speeds 20mbps speed tried telling kept talking naturally frustrated phone listening!! spoke manager took attitude start simply trying explain. asked speak boss went phone previous woman speaking said "can speak boss please"she said "there boss" lies obviously. wanted fix speed kept saying need lan cable shouldn\'t lan cable. speeds reading 100mbps average lan cable considering package i\'m on. friend virgin pay 30-50 pounds month getting 100mbps speed lan cable. overall virgin bad customer service. talk talks customer service better that\'s saying so

In [17]:
import tensorflow_hub as hub
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"  #Token based text embedding trained on English Google News 7B corpus.


In [18]:


embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"  #Token based text embedding trained on English Google News 7B corpus.
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)





In [19]:
callback = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5),
             tf.keras.callbacks.ModelCheckpoint(filepath='saved_model/hub_model.h5', monitor='val_loss', save_best_only=True)]

In [20]:
model = tf.keras.Sequential()
model.add(hub_layer) #embeds our words
model.add(tf.keras.layers.Dense(16, activation="relu")) #dense takes input from every preceding neuron
model.add(tf.keras.layers.Dense(16, activation="relu"))       
model.add(tf.keras.layers.Dense(5, activation="softmax"))

In [21]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
             loss = tf.keras.losses.SparseCategoricalCrossentropy(), #binarycross entropy as binary classification problem
                metrics=["accuracy"])

In [22]:
model.evaluate(train) #evaluate performance of model without training it first
#accuracy is around 58%, loss is 0.



[1.5800484418869019, 0.2176792472600937]

In [24]:
history=model.fit(train, epochs= 50, validation_data=valid, callbacks=[callback] )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [None]:
import pickle