# Steam Reviews Classifier with BERT
This notebook shows how BERT handles the classification of Steam reviews (recommended vs. not recommended). The BERT part is based on code by [xhlulu](https://www.kaggle.com/xhlulu).

In [None]:
# Download the BERT tokenization helper that is imported below as `tokenization`.
# NOTE(review): this fetches from the moving `master` branch; pin a commit hash for reproducibility.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

In [None]:
import os
# Make /kaggle the working directory; the other cells address data and outputs
# through absolute /kaggle/input and /kaggle/working paths.
os.chdir('/kaggle')

In [None]:
# Load the labelled training reviews and preview the first rows.
data = pd.read_csv("/kaggle/input/steam-reviews/train.csv")
data.head()

In [None]:
# Summary statistics of the training frame.
data.describe()

In [None]:
# Number of reviews per user_suggestion value (the class balance of the target).
data.user_suggestion.value_counts()

Most reviews are positive

In [None]:
# Pie chart of the share of each user_suggestion class.
suggestion_counts = data.user_suggestion.value_counts()
labels = [0, 1]
sizes = [suggestion_counts[label] for label in labels]

explode = (0, 0.1)  # pull the second wedge (label 1) slightly out of the pie
fig1, ax1 = plt.subplots()
ax1.set_title('Games recommendation')
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)

ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.tight_layout()
plt.show()

In [None]:
#data['hour_played_reviews'] = data.groupby('hour_played')['hour_played'].transform('count')
#x = data.hour_played
#y = data['hour_played_reviews']
#fig = plt.figure(figsize = (13,8))
#ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
#ax.scatter(x,y)
#ax.set_title('Dependence of the number of ratings on the duration of the game')
#ax.set_xlabel('Hours played')
#ax.set_ylabel('Number of reviews')

The graph looks ugly, but it shows that most reviews are written by players who have played for only a few hours, and very few by players who have played for a really long time. (The plotting code in the cell above is currently commented out, so the graph is not rendered.)

In [None]:
#top_reviewed_games = data.title.value_counts()
#print('Top 10 reviewed games:\n\n{}'.format(data.title.value_counts()[:10]))

The data contains reviews of Steam's best-selling games as of February 2019.

In [None]:
# Binary target column y: 1 where user_suggestion equals 1, otherwise 0.
is_suggested = data.user_suggestion.eq(1)
data = data.assign(y=is_suggested.astype(int))
data.head(3)

In [None]:
# Half the number of rows, printed for reference only (the value is not stored).
print(len(data)/2)
# NOTE: despite its name, data_cut is a full copy of data -- no rows are dropped here,
# so BERT is trained on the whole dataset, which can take a very long time.
data_cut = data.copy()
data_cut.tail(1)  # show the last row

In [None]:
# Coerce every review to str so that there are no problems in the tokenizer.
data_cut.user_review = list(map(str, data_cut.user_review.values))

In [None]:
import re
def clean_tweets(lst):
    """Strip Twitter-style noise from a collection of texts.

    Removes retweet markers (``RT @user:``), ``@user`` handles and http(s)
    URLs from every element, then rewrites the HTML entity ``&amp;`` as
    ``and``.

    Parameters
    ----------
    lst : array-like of str
        Texts to clean (the notebook passes a pandas Series of reviews).

    Returns
    -------
    numpy.ndarray
        Array of cleaned strings, one per input element.
    """
    strip = np.vectorize(remove_pattern)
    # Raw strings keep "\w" a regex escape instead of an invalid Python string escape.
    # remove twitter Return handles (RT @xxx:)
    lst = strip(lst, r"RT @[\w]*:")
    # remove twitter handles (@xxx)
    lst = strip(lst, r"@[\w]*")
    # remove URL links (httpxxx)
    lst = strip(lst, r"https?://[A-Za-z0-9./]*")
    # NOTE: no "[^a-zA-Z#]" stripping is done here. The np.core.defchararray.replace
    # calls this function used to make perform *literal* substring replacement, not
    # regex replacement, so they never removed digits or punctuation. A real regex
    # strip is deliberately not enabled: decontracted() is applied to this output
    # later in the notebook and relies on apostrophes still being present.
    # Rewrite only the HTML entity "&amp;". Matching the bare text "amp" would also
    # corrupt ordinary words (e.g. "campaign" -> "candaign", "example" -> "exandle").
    lst = np.vectorize(replace_pattern)(lst, "&amp;", "and")
    return lst
def remove_pattern(input_txt, pattern):
    """Delete every match of the regular expression ``pattern`` from ``input_txt``.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression whose matches are removed.

    Returns
    -------
    str
        ``input_txt`` with all matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Re-using each *matched text* as a new
    # regex (findall + sub per match) is wrong: a short match such as "@ab" also
    # eats the prefix of a longer one ("@abc" -> "c"), and regex metacharacters
    # inside a match ("(", "+", ".") are misinterpreted or raise re.error.
    return re.sub(pattern, '', input_txt)
def replace_pattern(input_txt, pattern, replace_text):
    """Replace every match of the regular expression ``pattern`` in ``input_txt``.

    Parameters
    ----------
    input_txt : str
        Text to process.
    pattern : str
        Regular expression to search for.
    replace_text : str
        Replacement, interpreted by ``re.sub`` (backslash escapes are processed).

    Returns
    -------
    str
        ``input_txt`` with all matches of ``pattern`` replaced by ``replace_text``.
    """
    # Substitute with the pattern itself; treating each matched text as a new
    # regex would mis-handle metacharacters in the match (e.g. "." or "+") and
    # could rewrite text the original pattern never matched.
    return re.sub(pattern, replace_text, input_txt)
def clean_hashtags(lst):
    """Remove hashtags from every text in ``lst``.

    Parameters
    ----------
    lst : array-like of str
        Texts to clean.

    Returns
    -------
    numpy.ndarray
        Array of strings with ``#tag`` tokens removed.
    """
    strip = np.vectorize(remove_pattern)
    # First pass: alphanumeric hashtags such as "#gaming2019".
    lst = strip(lst, "#[A-Za-z0-9]+")
    # Second pass: whatever is left, including a bare "#" or tags with underscores.
    # The raw string keeps "\w" a regex escape rather than an invalid string escape.
    lst = strip(lst, r"#[\w]*")
    return lst

In [None]:
import re
def decontracted(phrase):
    """Expand English contractions and pad a few review-specific tokens.

    The substitutions are applied one after another in the order listed, so
    the specific cases ("won't", "can't") are handled before the generic
    "n't" rule. Note that "'s" is always expanded to " is", so possessives
    are rewritten as well.

    Parameters
    ----------
    phrase : str
        Text to normalise.

    Returns
    -------
    str
        The text after all substitutions.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        # review-specific spacing so these tokens do not stick to their neighbours
        (r"early access review", "early access review "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"/10", "/10 "),
        (r"10/", " 10/"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase


# Quick sanity check of decontracted() on a sample sentence.
trial = "Hey I'm Yann, how're you and how's it going ? That's interesting: I'd love to hear more about it+info"
print(decontracted(trial))

In [None]:
from textblob import TextBlob
# Define function to lemmatize each word with its POS tag
def lemmatize_with_postag(sentence):
    """Lemmatize each word of ``sentence`` using its part-of-speech tag.

    The first letter of every tag reported by TextBlob is mapped to the
    single-letter code handed to ``Word.lemmatize`` (J -> 'a', N -> 'n',
    V -> 'v', R -> 'r'); any other tag falls back to 'n'.

    Parameters
    ----------
    sentence : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    pos_codes = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    lemmas = []
    for word, pos in TextBlob(sentence).tags:
        lemmas.append(word.lemmatize(pos_codes.get(pos[0], 'n')))
    return " ".join(lemmas)

# Lemmatize
# Example: lemmatize a sample sentence.
sentence = "The striped bats are hanging on their feet for best"
lemmatize_with_postag(sentence)
# Expected output: 'The striped bat be hang on their foot for best'

In [None]:
# Text preprocessing pipeline for the training reviews.
text2 = clean_tweets(data_cut.user_review)   # strip handles and URLs
text3 = [ta.lower() for ta in text2]         # lowercase
# replace every non-ASCII character with a space
text4 = [''.join([i if ord(i) < 128 else ' ' for i in t]) for t in text3]
text5 = [decontracted(u) for u in text4]     # expand contractions
# Show a short preview only; displaying the whole corpus floods the cell output.
text5[:5]

In [None]:
# Preview a few cleaned reviews instead of dumping the entire list into the output.
text5[:5]

In [None]:
# Store the cleaned texts back on the frame, coerced to str
# so that there are no problems in the tokenizer.
data_cut.user_review = list(map(str, text5))

In [None]:
# Preview the cleaned review texts.
data_cut.user_review.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X = data_cut.user_review
y = data_cut.y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Compare the share of y == 1 in both parts.
for each in (y_train, y_test):
    print(f"y fraction = {each.mean():.4f}")

Here we check how similar the share of positive labels is in the train and test parts. If the difference is large, the data should be split another way (for example, with a stratified split). In this case the two ratios are very close.

In [None]:
# Sizes of the train and test parts.
print('Train : {}, Test: {}'.format(len(X_train),len(X_test)))

In [None]:
# Optional manual trimming of the splits (disabled):
#X_test = X_test[:-2] # if it's not equal
#y_test = y_test[:-2]
#X_train = X_train[:-1]
#y_train = y_train[:-1]
# Sanity check: features and labels of each part must have the same length.
# Note: the "Val" entries in the message are the held-out test split (X_test / y_test).
print('\n train X: {} \n train y: {} \n Val X: {} \n val y: {}'.format(len(X_train),len(y_train),len(X_test),len(y_test)))

In [None]:
%%time
# TF Hub handle of the BERT encoder used as the model backbone.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/2"
# trainable=True: the BERT weights are updated during fine-tuning, not only the classifier head.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Build the tokenizer from the vocabulary file and casing flag stored with the hub module,
# so tokenization matches what the loaded BERT layer provides.
resolved = bert_layer.resolved_object
vocab_file = resolved.vocab_file.asset_path.numpy()
do_lower_case = resolved.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
def bert_encode(input_text, tokenizer, max_len = 512):
    """Convert raw texts into the three fixed-length integer arrays fed to BERT.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Parameters
    ----------
    input_text : iterable of str
        Texts to encode.
    tokenizer : object
        Must provide ``tokenize(text)`` returning a list of tokens and
        ``convert_tokens_to_ids(tokens)`` returning a list of ints.
    max_len : int, default 512
        Length of every encoded row, including the two special tokens.

    Returns
    -------
    tuple of numpy.ndarray
        ``(token_ids, mask, segment_ids)``, each of shape
        ``(number_of_texts, max_len)``. ``mask`` is 1 on real tokens and 0 on
        padding; ``segment_ids`` is all zeros.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2, which leaves no room for
        ``[CLS]`` and ``[SEP]``.
    """
    if max_len < 2:
        # A smaller value would turn the truncation slice negative and yield
        # rows longer than max_len (ragged output).
        raise ValueError("max_len must be at least 2 to hold [CLS] and [SEP]")

    token_input = []
    mask_input = []
    seg_input = []

    for text in input_text:
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        token_input.append(tokens + [0] * pad_len)
        mask_input.append([1] * len(input_sequence) + [0] * pad_len)
        seg_input.append([0] * max_len)

    def _as_matrix(rows):
        # An explicit integer dtype plus reshape keeps the result 2-D and integral
        # even for empty input, which would otherwise give a float array of shape (0,).
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(token_input), _as_matrix(mask_input), _as_matrix(seg_input)

In [None]:
def build_model(bert_layer, max_len = 512):
    """Build and compile a binary classifier on top of a BERT hub layer.

    Parameters
    ----------
    bert_layer : callable Keras layer
        Called with ``[word_ids, mask, segment_ids]``; its second output is
        used as the per-token sequence output.
    max_len : int, default 512
        Sequence length of the three integer inputs. It must match the
        ``max_len`` used when encoding the texts.

    Returns
    -------
    tensorflow.keras.Model
        Model with a single sigmoid output, compiled with Adam
        (learning rate 2e-6), binary cross-entropy loss and accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_words_ids')
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 of every sequence is the [CLS] token added by bert_encode;
    # its vector is used as the representation of the whole review.
    clf_output = sequence_output[:, 0, :]
    hidden = Dense(8, activation='relu')(clf_output)
    # Feed the hidden layer into the output layer. The earlier code applied this
    # layer to `out` before `out` was assigned, which raised an error.
    out = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=2e-6), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
# Encode both splits; max_len=160 must match the max_len passed to build_model.
train_input = bert_encode(X_train.values, tokenizer, max_len=160)
test_input = bert_encode(X_test.values, tokenizer, max_len=160)
train_labels = y_train.values

In [None]:
# Build the classifier with the same sequence length (160) used when encoding the inputs.
model = build_model(bert_layer, max_len=160)
model.summary()

In [None]:
%%time
# Fine-tune for 5 epochs, holding out 20% of the training inputs for validation.
# NOTE(review): batch_size=4 is presumably chosen so the large BERT model fits in memory -- confirm.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=5,
    batch_size=4
)

In [None]:
# Persist the fine-tuned model to the Kaggle working directory as an .h5 file.
model.save('/kaggle/working/model2.h5')

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Predict probabilities for the held-out split and round them to hard 0/1 labels.
# Rounding the whole array at once avoids calling int() on one-element arrays,
# which NumPy has deprecated.
prediction = model.predict(test_input)
preds = prediction.round().astype(int).ravel().tolist()

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is symmetric,
# but the weighted F1 weights each class by its support in the first argument, so the
# true labels must come first for the class weights to reflect the real distribution.
print("Accuracy: ", accuracy_score(y_test.values, preds))
print("F1_Score: ", f1_score(y_test.values, preds, average = 'weighted'))

In [None]:
# Load the test reviews and coerce every review to str before text cleaning.
data2 = pd.read_csv("/kaggle/input/steam-reviews-test-dataset/test.csv")
data2.user_review = list(map(str, data2.user_review.values))

In [None]:
# Same text preprocessing pipeline as for the training reviews, applied to the test set.
text_test2 = clean_tweets(data2.user_review)   # strip handles and URLs
text_test3 = [ta.lower() for ta in text_test2] # lowercase
# replace every non-ASCII character with a space
text_test4 = [''.join([i if ord(i) < 128 else ' ' for i in t]) for t in text_test3]
text_test5 = [decontracted(u) for u in text_test4]  # expand contractions
# Show a short preview only; displaying the whole corpus floods the cell output.
text_test5[:5]

In [None]:
# Store the cleaned test texts back on the frame, coerced to str.
data2.user_review = list(map(str, text_test5))

In [None]:
# Encode the reviews from test.csv with the same max_len (160) the model was built with.
final_test_input = bert_encode(data2.user_review.values, tokenizer, max_len=160)

In [None]:
# Predict probabilities for the test set and round them to hard 0/1 labels.
# Rounding the whole array at once replaces the per-row int(x.round()) loop,
# which relied on NumPy's deprecated conversion of one-element arrays to scalars.
prediction_final = model.predict(final_test_input)
preds_final = prediction_final.round().astype(int).ravel().tolist()

In [None]:
#preds_final.shape

In [None]:
# Assemble the submission file: one predicted label per review_id.
submission = pd.DataFrame(
    {
        'review_id': data2.review_id,
        'user_suggestion': preds_final,
    }
)
submission.to_csv("/kaggle/working/submission_bert2.csv", index=False)

Not bad for a lazy model with only minimal data cleaning.