# Classification of Google Play ratings

## Libraries

In [1]:
import pandas
import tensorflow as tf
import re
import nltk
import numpy

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

nltk.download('stopwords')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\320\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load the data


* To be able to load the data, the `csv` file must be uploaded in the same execution environment (Sidebar> Files> Upload)

In [2]:
dataframe = pandas.read_csv('https://raw.githubusercontent.com/Ciroye/sentiment-analysis-google-play-reviews/master/googleplaystore_user_reviews.csv')

In [3]:
dataframe.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [4]:
dataframe.tail()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
64290,Houzz Interior Design Ideas,,,,
64291,Houzz Interior Design Ideas,,,,
64292,Houzz Interior Design Ideas,,,,
64293,Houzz Interior Design Ideas,,,,
64294,Houzz Interior Design Ideas,,,,


## Pre-process the data

Before preparing the data to be suitable for the input of the neural network, those rows containing `NaN` will be deleted.

In [4]:
dataframe = dataframe.dropna()
dataframe = dataframe[['Translated_Review','Sentiment']]
dataframe.head()

Unnamed: 0,Translated_Review,Sentiment
0,I like eat delicious food. That's I'm cooking ...,Positive
1,This help eating healthy exercise regular basis,Positive
3,Works great especially going grocery store,Positive
4,Best idea us,Positive
5,Best way,Positive


From the columns that interest us, the following steps must be performed to be able to unify all the sentences:
- Put all the letters in lower case.
- Remove punctuation marks, converting all the words that were together into separate words.
- Delete * stop words *: Short words that have no meaning in themselves, such as conjunctions or prepositions.

In [8]:
stopwords = stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
def preprocess_text(sen):
    # Eliminar símbolos de puntuación y números
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Eliminar carácteres sueltos
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Eliminar espacios excesivos
    sentence = re.sub(r'\s+', ' ', sentence)

    # Convertir a minúscula
    sentence = sentence.lower()
  
    # Eliminar las stopwords.
    words = sentence.split()
    filtered_words = [word for word in words if word not in stopwords]
    
    return ' '.join(filtered_words)

In [10]:
dataframe['Translated_Review'] = dataframe['Translated_Review'].apply(lambda sen: preprocess_text(sen))
dataframe

Unnamed: 0,Translated_Review,Sentiment
0,like eat delicious food cooking food case best...,Positive
1,help eating healthy exercise regular basis,Positive
3,works great especially going grocery store,Positive
4,best idea us,Positive
5,best way,Positive
...,...,...
64222,ads older many agents much owner posted detail...,Positive
64223,photos posted portal load fit purpose sure sto...,Positive
64226,dumb app wanted post property rent give option...,Negative
64227,property business got link sms happy performan...,Positive


## Vectorize the data:

This section will aim to convert a text string into a vector, so that it can be treated by the neural network. First, in order to generate the training and validation sets, we will separate the dataframe into two independent variables so that we can treat them separately:

In [11]:
def determine_class(label):
    if label == 'Positive':
        return 0
    elif label == 'Neutral':
        return 1
    elif label == 'Negative':
        return 2

# Poner la variable a True si se quieren eliminar los comentarios neutrales.
# En caso contrario, los neutrales se convertirán a negativos
REMOVE_NEUTRAL = False

# Poner la variable a True (siempre que la anterior valga False) para considerar
# las clases 'Negative' y 'Neutral' iguales.
MERGE_NEGATIVE_NEUTRAL = False

if REMOVE_NEUTRAL:
    indexNames = dataframe[dataframe['Sentiment'] == 'Neutral'].index
    dataframe.drop(indexNames , inplace=True)
    y = dataframe['Sentiment'].apply(lambda x: 1 if x == 'Positive' else 0).to_numpy()
else:
    if MERGE_NEGATIVE_NEUTRAL:
        y = dataframe['Sentiment'].apply(lambda x: 1 if x == 'Positive' else 0).to_numpy()
    else: 
        y = dataframe['Sentiment'].apply(lambda x: determine_class(x)).to_numpy()

X = dataframe['Translated_Review']
y = y.astype(numpy.uint8)

From the `sklearn` library we can separate into two disjoint sets, containing 80% of the samples for the training set and the rest for the validation set.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [18]:
y_train

array([0, 0, 2, ..., 0, 2, 0], dtype=uint8)

The next step is to determine the maximum number of words to be used (that is, the first most frequent $ n $ words) and the maximum length of each vector.

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'keras'

In [16]:
from tensorflow.preprocessing.text import Tokenizer
from tensorflow.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'tensorflow.preprocessing'

In [None]:
NUMBER_OF_WORDS = 20000
MAX_LEN = 100

tokenizer = Tokenizer(num_words = NUMBER_OF_WORDS)
tokenizer.fit_on_texts(X_train)

X_train_original = X_train
X_test_original = X_test
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, padding='post', maxlen=MAX_LEN)
X_test = pad_sequences(X_test, padding='post', maxlen=MAX_LEN)

Then, for those phrases that have words that are not the most frequent, the padding technique is performed by filling in those words that are not as frequent and do not appear with zeros.


## Create the RNN

In [None]:
VOCABULARY_SIZE = NUMBER_OF_WORDS
EMBEDDING_SIZE = 128

In [None]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCABULARY_SIZE, 
                                    EMBEDDING_SIZE, 
                                    input_shape=(X_train.shape[1],)))

model.add(tf.keras.layers.LSTM(units=128, activation='tanh'))


#En units, se debe especificar cuantas clases tenemos. 
model.add(tf.keras.layers.Dense(units=numpy.unique(y_train).shape[0], 
                                activation='sigmoid'))

model.compile(optimizer='rmsprop', 
                loss='sparse_categorical_crossentropy', 
                metrics=['sparse_categorical_accuracy'])
  
model.summary()

model.fit(X_train, y_train, epochs=30, batch_size=128)

In [77]:
test_loss, test_acurracy = model.evaluate(X_test, y_test)
print("Test accuracy: {}".format(test_acurracy))

Test accuracy: 0.914373517036438


In [81]:
X_test_original.head()

31799    great game heats phone short time please recti...
5707                                   maths formulas want
62076    suggestions improvement change throttle adjust...
18833          notifications work cellphone otherwise like
11216                           helps speak polish friends
Name: Translated_Review, dtype: object

In [78]:
model.predict(X_test[0].reshape(1, X_test[0].shape[0]))

array([[0.5797347 , 0.00081079, 0.00173335]], dtype=float32)

In [85]:
model.predict_classes(X_test[0].reshape(1, X_test[0].shape[0]))

array([0])

In [86]:
test = 'This app is really bad'
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, padding='post', maxlen=MAX_LEN)
print(model.predict(test[1].reshape(1, test[1].shape[0])))
model.predict_classes(test[1].reshape(1, test[1].shape[0]))

[[0.01242092 0.39561883 0.00515131]]


array([1])