# Load and clean data from JSON

In [49]:
# Load the data in from Json file
import json
with open("sample.json", "r+") as file:
#   for line in file:
#     print(line)
  text_data = json.load(file)

In [None]:
# Create a dataframe of the text
import pandas as pd

orig_text_dataFrame = pd.DataFrame(text_data)
orig_text_dataFrame.head()

In [None]:
# get rid of unecessary values, Keep only content and author
text_data_frame = orig_text_dataFrame[["content", "author"]]
text_data_frame.head()

In [52]:
# Make author only the name

for row in text_data_frame.itertuples():
  x, content, author = row
  text_data_frame.at[x, "author"] = author.get("username")

In [None]:
text_data_frame.head()

In [54]:
# Check out how much data of each person we have
text_data_frame["author"].value_counts()

author
Whale5152     26733
El Pablo      10831
CocoADTM      10551
Bombard       10292
Incognito      1639
script         1426
DETOXEDEGG      803
sKy             581
zootron         332
xela            232
Name: count, dtype: int64

In [55]:
# Drop all the data for users who we are not training on: We are only trying for: Whale, El Pablo, CocoADTM, Bombard!

text_data_frame = text_data_frame.loc[text_data_frame['author'].isin(["Whale5152", "El Pablo", "CocoADTM", "Bombard"])]
text_data_frame
text_data_frame["author"].value_counts()

author
Whale5152    26733
El Pablo     10831
CocoADTM     10551
Bombard      10292
Name: count, dtype: int64

In [56]:
# Stratify the sample to make it have equal examples of all data
text_data_frame = text_data_frame.groupby('author', group_keys=False).apply(lambda x: x.sample(min(len(x), text_data_frame["author"].value_counts().min()))) # Equal to the lowest amount of data for a person
text_data_frame['author'].value_counts()

author
Bombard      10292
CocoADTM     10292
El Pablo     10292
Whale5152    10292
Name: count, dtype: int64

In [57]:
# Convert the authors / target labels to numbers
text_data_frame["target"] = text_data_frame.groupby("author", sort=False).ngroup()
text_data_frame["author"][20000:20001], text_data_frame["target"][20000:20001]

(22907    CocoADTM
 Name: author, dtype: object,
 22907    1
 Name: target, dtype: int64)

## KEY:
- 0 = Bombard
- 1 = CocoADTM
- 2 = El Pablo
- 3 = Whale

In [58]:
classes = ["Bombard", "Coco", "El Pablo", "Whale"] # edit this list with the names of users in data

In [59]:
# Finalize the text_data
text_data_frame = text_data_frame[["content", "target"]]
text_data_frame.head()

Unnamed: 0,content,target
39347,<@727735200221495306>,0
32164,Or 1,0
55477,https://tenor.com/view/bruh-wth-tom-cat-cat-gi...,0
4508,Why,0
33593,lol,0


# Create training and validation splits

In [60]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_labels, test_labels = train_test_split(text_data_frame["content"], text_data_frame["target"], test_size=0.2, random_state=42)

len(train_sentences), len(test_sentences), len(train_labels), len(test_labels)

(32934, 8234, 32934, 8234)

In [61]:
import tensorflow
from sklearn import metrics
# Create function to evaluate all of our models
def evaluate(y_labels, y_preds):
  """
  Calculates model accuracy.
  """


  accuracy_score = metrics.accuracy_score(y_labels, y_preds) * 100

  return {"accuracy": accuracy_score}

In [62]:
# Predict func to view multiple predictions at once
def predict(model, sentences):
  pred_probs = model.predict(sentences)
  preds = tf.squeeze(tf.argmax(pred_probs, axis=1))

  for x, pred in enumerate(preds):
    print(f"Sentence:{sentences[x]} \nPrediction: {classes[pred]}\n-----------\n")

# Utilize USE model to create a model to train on

## model_1:
- USE LAYER
- 4 Dense layers

In [63]:
# Import USE layer from tensorflow Hub
import tensorflow as tf
import tensorflow_hub as hub

sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable = False,
                                        input_shape = [],
                                        dtype= tf.string)

In [64]:
# Create a model using LSTM's
from tensorflow.keras import layers
import tensorflow as tf

model_1 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(4, activation="softmax")
])

In [65]:
# Compile the model
model_1.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [66]:
model_1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense_4 (Dense)             (None, 128)               65664     
                                                                 
 dense_5 (Dense)             (None, 128)               16512     
                                                                 
 dense_6 (Dense)             (None, 128)               16512     
                                                                 
 dense_7 (Dense)             (None, 128)               16512     
                                                                 
 dense_8 (Dense)             (None, 4)                 516       
                                                                 
Total params: 256913540 (980.05 MB)
Trainable params: 

In [None]:
# Fit the model
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                 patience=4)

model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              epochs = 100,
                              validation_data = (test_sentences, test_labels),
                              callbacks=[earlyStopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

In [68]:
# Evaluate the model's result
model_1_pred_probs = model_1.predict(["word"])
model_1_pred_probs[:10]



array([[0.01190012, 0.0137224 , 0.96831495, 0.00606259]], dtype=float32)

In [69]:
# Predictions
model_1_preds = tf.argmax(model_1_pred_probs, axis=1).numpy()
#model_1_preds[:10]

In [70]:
# Evaluate the predictions
print(classes[tf.squeeze(model_1_preds)])

El Pablo


In [71]:
model_1.save("model_1")

In [None]:
# Create text vectorizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
length = 0

for sentence in train_sentences:
  length+=len(sentence)

max_length = round(length / len(train_sentences) )

text_vectorizer = TextVectorization(max_tokens=None, #how many words in the vocab are in our train sentences (auto add <OOV> if cap is small)
                                    output_sequence_length = max_length,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None, #Create groups of n words (1 if None)
                                    output_mode="int", # How to map tokens to numbers
                                    )

# Fit it to the training text
text_vectorizer.adapt(train_sentences)

In [73]:
# Embedding layer
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=len(text_vectorizer.get_vocabulary()), # set input shape
                             output_dim=512, # Output shape
                             input_length=max_length # how long is each shape
                             )
embedding

<keras.src.layers.core.embedding.Embedding at 0x7c0aa6671e70>

In [None]:
model_2 = tf.keras.Sequential([
    layers.Input(shape=(1,), dtype=tf.string),
    text_vectorizer,
    embedding,
    layers.LSTM(32, activation="relu", return_sequences=True),
    layers.Dropout(0.3),
    layers.LSTM(32, activation="relu"),
    layers.Dense(4, activation="softmax")
])

# Compile the model
model_2.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(lr=0.001),
                metrics=["accuracy"])

# Fit the model
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                 patience=4)
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs = 100,
                              validation_data=(test_sentences,test_labels),
                              validation_steps = len(test_sentences),
                              callbacks=[earlyStopping])

In [75]:
# evaluate model
model_2_pred_probs = model_2.predict(test_sentences)
model_2_preds = tf.argmax(model_2_pred_probs, axis=1).numpy()

evaluate(model_2_preds, test_labels)



{'accuracy': 46.514452271071164}

In [None]:
embedding(text_vectorizer(["mexico on top"]))


## Model 3

In [None]:
# Create a model using sentence encoder and Dense layers
from tensorflow.keras import layers
import tensorflow as tf

model_3 = tf.keras.Sequential([
    layers.Input(shape=(1,), dtype=tf.string),
    text_vectorizer,
    embedding,
    layers.LSTM(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.25),
    layers.Dense(16, activation="relu"),
    layers.Dense(4, activation="softmax")
])

In [None]:
model_3.summary()

In [None]:
# Compile the model
model_3.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(lr=0.001),
                metrics=["accuracy"])

# Fit the model
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                 patience=4)

model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs = 100,
                              validation_data = (test_sentences, test_labels),
                              callbacks=[earlyStopping])

In [80]:
model_3.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (Text  (None, 19)                0         
 Vectorization)                                                  
                                                                 
 embedding_1 (Embedding)     (None, 19, 512)           6558208   
                                                                 
 lstm_3 (LSTM)               (None, 128)               328192    
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 32)                2080      
                                                      

In [None]:
# Predict
model_3_pred_probs = model_3.predict(["i hate coding"])
model_3_preds = tf.argmax(model_3_pred_probs, axis=1).numpy()
print(classes[model_3_preds.tolist()[0]])

In [82]:

model_3.save("model_3")