In [25]:
import json
import pandas as pd
import re

In [2]:
with open('data/review_553850.json') as f:
    data = json.load(f)



In [8]:
# print few keys
print(list(data.keys()))

# print few values in reviews
print(list(data['reviews'])[:2])
print(list(data['query_summary'])[:2])
print(list(data['cursors'])[:2])


['reviews', 'query_summary', 'cursors']
['160675747', '160675705']
['num_reviews', 'review_score']
['AoJwoLGp1Y0DcpqF3AQ=']


In [9]:
# Extract 'review' and 'votes_funny' from each review
reviews = []
votes_funny = []
for review_id, review_data in data['reviews'].items():
    reviews.append(review_data['review'])
    votes_funny.append(review_data['votes_funny'])

In [22]:

# Make pandas dataframe
df = pd.DataFrame({'review': reviews, 'votes_funny': votes_funny})
df


Unnamed: 0,review,votes_funny
0,↑→↓↓↓,0
1,"Good fun Co Op, worth every penny.",0
2,"Was a little skeptical at first, once i starte...",0
3,Very democratic,0
4,ALL FOR DEMOCRACY!!!!,0
...,...,...
189595,Can't even get past main loading screen anymor...,0
189596,"After multiple patches, the servers seem much ...",0
189597,Greattt,0
189598,everything is perfect,0


# **Cleaning**

In [23]:
def clean_text(text):
    # Remove emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Convert to lower case
    text = text.lower()

    return text

In [26]:
# Apply the cleaning function to each review
df['review'] = df['review'].apply(clean_text)

In [32]:
df

Unnamed: 0,review,votes_funny
0,,0
1,good fun co op worth every penny,0
2,was a little skeptical at first once i started...,0
3,very democratic,0
4,all for democracy,0
...,...,...
189595,cant even get past main loading screen anymore...,0
189596,after multiple patches the servers seem much m...,0
189597,greattt,0
189598,everything is perfect,0


# Preprocessing

In [33]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenize the text
tokenizer = Tokenizer(num_words=10000)  # Use the 10,000 most common words
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])

# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=100)  # Make all reviews 100 words long

# Now 'padded_sequences' is a 2D array containing your preprocessed text data




In [34]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['votes_funny'], test_size=0.2, random_state=42)

# Build the model
model = Sequential()
model.add(Embedding(10000, 32, input_length=100))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9656118154525757


In [39]:
# check if the model is overfitting
import matplotlib.pyplot as plt

history = model.history
history

# history is empty because we did not save the history of the model


<keras.src.callbacks.History at 0x1f4d5296550>