In [None]:
import os

import pandas as pd

# Prefer the environment so the Mockaroo key is not baked into the notebook.
# TODO(security): rotate this key and remove the literal fallback once
# MOCKAROO_API_KEY is configured wherever the notebook runs.
MOCKAROO_API_KEY = os.environ.get("MOCKAROO_API_KEY", "9aadb790")
MOCKAROO_URL = f"https://api.mockaroo.com/api/3f045400?count=1000&key={MOCKAROO_API_KEY}"

# Download the mock events and keep only the raw tag strings (one string per event)
events_df = pd.read_csv(MOCKAROO_URL)
data = events_df['tags'].tolist()

In [None]:
# Remove the curly braces that wrap each raw tag string
data = [tag_string.strip("{}") for tag_string in data]

In [None]:
# Preview only the first few tag strings instead of dumping the whole list
data[:10]

['Nature  Pop_Culture Beauty Music Literature Religion',
 'Photography  Religion Finance Environment Finance Social_Media',
 'Education  Pop_Culture Literature Entertainment Art',
 'Charity  Health Science Education Parenting Food Literature Human_Rights',
 'Environment  Human_Rights Gaming Fashion Charity Fitness Business Marketing Social_Media',
 'Gardening ',
 'Religion  Technology Education Gaming Nature Social_Media Education Travel Fitness',
 'Environment  Travel DIY Fashion Fashion Fashion Education Social_Media',
 'Environment  Nature Literature Music Photography DIY Health',
 'Religion  Sports',
 'Pop_Culture  Business Religion Environment Food Marketing',
 'Fashion  Parenting Gaming Science Education Entertainment Entertainment',
 'Photography  Gardening Literature Nature Health Politics Health Health Politics Social_Media Pop_Culture',
 'DIY  Pop_Culture Charity Parenting Entertainment',
 'Sports  Entertainment Gardening',
 'Health  Fitness Social_Media Education Beauty Fash

Things to tweak:

- min_df
  - minimum document frequency: only keeps terms that occur in at least n documents.
- ngram_range
  - tuple (min_n, max_n) of n-gram sizes to extract; an n-gram is a run of n consecutive words. (ex: (1,10))
- stop_words
  - words to ignore. Specify a list of strings, or a language name as a string. (ex: 'english')
- min_df_multiplier
  - min_df starts at the length of the data and is multiplied by this factor (so it shrinks) each time fitting fails.

## Train Data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Rank the existing documents by TF-IDF cosine similarity to a new query document
def get_recommendations_tfidf(new_data, data, top_n=1, min_df_multiplier=.4):
    """Return the ``top_n`` entries of ``data`` most similar to ``new_data``.

    A ``TfidfVectorizer`` (English stop words, 1- to 10-grams) is fitted on
    ``data`` plus the query. ``min_df`` starts at the corpus size and is shrunk
    by ``min_df_multiplier`` every time fitting fails, until a vocabulary
    survives.

    Args:
        new_data: Query document (a string of tags).
        data: List of existing documents; it is copied, not modified.
        top_n: Number of recommendations to return.
        min_df_multiplier: Factor applied to ``min_df`` after each failed fit.

    Returns:
        A list of ``[document, index_in_data, similarity]`` lists, best match
        first. Empty when ``data`` is empty.

    Raises:
        ValueError: If the vectorizer cannot be fitted even with ``min_df=1``.
    """
    copied_data = data.copy()
    copied_data.append(new_data)
    if len(copied_data) == 1:
        # Nothing to compare the query against
        return []

    try_min_df = len(copied_data)
    while True:
        try:
            vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 10), min_df=try_min_df, analyzer='word')
            # Fit and transform the data
            X = vectorizer.fit_transform(copied_data)
            break
        except ValueError:
            # min_df=1 is the loosest threshold; failing there means retrying cannot help
            if try_min_df <= 1:
                raise
            # Always move strictly downwards so the loop terminates for any multiplier
            try_min_df = max(1, min(int(try_min_df * min_df_multiplier), try_min_df - 1))

    # Only the query row is needed, so compare it against the corpus rows directly
    # instead of building the full pairwise matrix
    query_scores = cosine_similarity(X[-1:], X[:-1])[0]
    # Sort the documents by similarity score in descending order
    sim_scores = sorted(enumerate(query_scores), key=lambda x: x[1], reverse=True)
    # The query itself is already excluded, so exactly top_n entries are taken
    top_docs = sim_scores[:top_n]
    return [[copied_data[i], i, j] for i, j in top_docs]

# Demo: closest events for a sample tag set
recs = get_recommendations_tfidf(
    '{Gaming  Art Charity Art Politics Art Business}',
    data,
    top_n=5,
)
recs

[['{Art  Sports}', 165, 0.8661109319480946],
 ['{Art }', 280, 0.8661109319480946],
 ['{Art  Education}', 987, 0.8661109319480946],
 ['{Art  Gaming Education}', 240, 0.8165844069778043],
 ['{Art  Education Art Fitness}', 952, 0.7786823247416116],
 ['{Environment  Art Charity Art Health}', 908, 0.763730826113787]]

In [None]:
import tensorflow as tf

# Rank the existing documents against a new query using a Keras TextVectorization layer
def get_recommendations_tfidf(new_data, data, top_n=1, min_df_multiplier=0.4, output_mode='tf-idf'):
    """Return the ``top_n`` entries of ``data`` most similar to ``new_data``.

    The corpus plus the query is vectorized with a ``TextVectorization`` layer
    and every corpus row is scored against the query row by cosine similarity.

    Args:
        new_data: Query document (a string of tags).
        data: List of existing documents; it is copied, not modified.
        top_n: Number of recommendations to return.
        min_df_multiplier: Unused here; kept so the signature matches the
            scikit-learn based variant.
        output_mode: ``output_mode`` passed to ``TextVectorization``.
            NOTE(review): with ``'int'`` the rows are token-id sequences, so the
            similarity is presumably not meaningful -- confirm before relying on it.

    Returns:
        A list of ``[document, index_in_data, similarity]`` lists, best match
        first. Empty when ``data`` is empty.
    """
    copied_data = data.copy()
    copied_data.append(new_data)
    if len(copied_data) == 1:
        # Nothing to compare the query against
        return []

    # Newer TensorFlow exposes the layer as tf.keras.layers.TextVectorization;
    # fall back to the experimental path for older installs
    text_vectorization = getattr(tf.keras.layers, 'TextVectorization', None)
    if text_vectorization is None:
        text_vectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

    vectorizer = text_vectorization(
        max_tokens=None,  # No maximum limit on vocabulary size
        output_mode=output_mode,
        output_sequence_length=None,  # Variable length sequences
        pad_to_max_tokens=False,  # Don't pad the feature axis to max_tokens
        standardize='lower_and_strip_punctuation',  # Lowercase and strip punctuation
        split='whitespace',  # Split text by whitespace
    )

    # Fit and transform the data
    vectorizer.adapt(copied_data)
    X = tf.cast(vectorizer(copied_data), tf.float32)

    # L2-normalize each row so the dot product below is a true cosine similarity
    X = tf.math.l2_normalize(X, axis=1)

    # Score only the query row (last) against the corpus rows
    query_scores = tf.linalg.matmul(X[-1:], X[:-1], transpose_b=True)[0]
    sim_scores = enumerate(query_scores.numpy().tolist())

    # Sort the documents by similarity score in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top n recommendations
    top_docs = sim_scores[:top_n]

    # Return the documents, their indices and their scores
    return [[copied_data[i], i, float(j)] for i, j in top_docs]

def loop_print(arr, contains):
    """Print every row of ``arr``, then how many distinct first fields are in ``contains``.

    Args:
        arr: Sequence of rows; the first item of each row is the document text.
        contains: Set of documents to compare the first fields against.
    """
    for row in arr:
        print(row)
    first_fields = {row[0] for row in arr}
    overlap = first_fields & contains
    print(len(overlap))
    # Blank line separates consecutive reports
    print()

def check_contains(raw_string):
    """Collect and print the entries of the global ``data`` that contain every query tag.

    Braces are removed from ``raw_string`` and it is split on whitespace; an
    entry matches when each resulting word occurs in it as a substring.

    Args:
        raw_string: Query tags, optionally wrapped in ``{}``.

    Returns:
        The set of matching entries from ``data``.
    """
    needles = raw_string.replace("{", '').replace("}", '').split()
    output = {datum for datum in data if all(needle in datum for needle in needles)}
    for match in output:
        print(match)
    print()
    return output

query = '{Politics  Social_Media}'

# Entries containing every query tag; used to judge each ranking below
output = check_contains(query)

# Run the same query once per vectorizer output mode and report each ranking
for mode in ('tf-idf', 'int', 'count', 'multi_hot'):
    recs = get_recommendations_tfidf(query, data, top_n=10, output_mode=mode)
    loop_print(recs, output)

{Nature  Social_Media Politics DIY Fitness Parenting Environment Crafts Environment Technology Parenting}
{Politics  DIY Marketing Beauty Social_Media Entertainment Crafts Parenting Politics Travel}
{Social_Media  Politics Food Photography Beauty Science Beauty Gaming Sports}
{Charity  Environment Politics Crafts Social_Media}
{Politics  Music Social_Media Health Health Charity Fitness}
{DIY  Literature Gardening Art Politics Technology Social_Media Gaming Travel Gardening Marketing}
{Environment  Politics Business Fashion Crafts Social_Media Food Food}
{Gardening  Human_Rights Politics Technology Social_Media Art Marketing Marketing Technology}
{Technology  Gardening Business Politics Food Nature Fashion Social_Media}
{Art  Social_Media Fashion Human_Rights Education Photography Technology Charity Gaming Entertainment Politics}
{Entertainment  Politics Environment Sports Photography Social_Media Fitness}
{Parenting  Art Art Social_Media Travel Politics Beauty}
{Nature  Politics Social

# Advantages of Using TFIDF
- Accounts for tag combinations that do not appear in the data
- Penalizes each tag miss. If all tags are matched but the matched event also has many more general tags, it will score/match lower.


In [None]:
# Baseline without TF-IDF: pick the events that share the most tags with the query.
# NOTE(review): this cell splits on commas, so it presumably expects comma-separated
# tag strings in `data` -- confirm against the dataset actually loaded.
newdata = [i.replace('{','').replace('}','').split(',') for i in data]
predict = '{Music,Business,Art,Education,Art,Crafts,Social Media}'
predict = predict.replace('{','').replace('}','').split(',')
# Number of query tags (repeats included) present in each event's tag list
count = [len([i for i in predict if i in j]) for j in newdata]
# Compute the best score once instead of once per element
best_count = max(count, default=0)
match_list = [i for i, x in enumerate(count) if x == best_count]
[data[i] for i in match_list]

['{Music,Business,Art,Education,Art,Crafts,Social Media,Marketing,Art,Sports,Health}']

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TfidfVectorizer on the corpus and persist it together with the similarity matrix
def train_tfidf(data, save_path, ngram_range=(1, 1), stop_words=None, min_df_multiplier=.4):
    """Fit a TF-IDF vectorizer on ``data`` and pickle it with the pairwise similarities.

    ``min_df`` starts at ``len(data)`` and is shrunk by ``min_df_multiplier``
    every time fitting fails, until a vocabulary survives.

    Args:
        data: List of documents to fit on.
        save_path: File the ``(vectorizer, cosine_sim)`` tuple is pickled to.
        ngram_range: Passed to ``TfidfVectorizer``.
        stop_words: Passed to ``TfidfVectorizer``.
        min_df_multiplier: Factor applied to ``min_df`` after each failed fit.

    Raises:
        ValueError: If the vectorizer cannot be fitted even with ``min_df=1``.
    """
    try_min_df = len(data)
    while True:
        try:
            vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=ngram_range, min_df=try_min_df, analyzer='word')
            # Fit and transform the data
            X = vectorizer.fit_transform(data)
            break
        except ValueError:
            # min_df=1 is the loosest threshold; failing there means retrying cannot help
            if try_min_df <= 1:
                raise
            # Always move strictly downwards so the loop terminates for any multiplier
            try_min_df = max(1, min(int(try_min_df * min_df_multiplier), try_min_df - 1))
    # Compute the pairwise cosine similarity
    cosine_sim = cosine_similarity(X)
    # Save the vectorizer and cosine similarity matrix to disk
    with open(save_path, 'wb') as f:
        pickle.dump((vectorizer, cosine_sim), f)

# Read back the (vectorizer, similarity matrix) pair written by train_tfidf
def load_tfidf(save_path):
    """Unpickle and return the ``(vectorizer, cosine_sim)`` pair stored at ``save_path``.

    Args:
        save_path: Path of the pickle file to read.

    Returns:
        A ``(vectorizer, cosine_sim)`` tuple.
    """
    # SECURITY: pickle.load can execute arbitrary code; only open files you created yourself
    with open(save_path, 'rb') as handle:
        fitted_vectorizer, similarity = pickle.load(handle)
        return fitted_vectorizer, similarity

# Rank the existing documents against a new query using an already fitted vectorizer
def get_recommendations_tfidf(new_data, vectorizer, cosine_sim, data, top_n=1):
    """Return the ``top_n`` entries of ``data`` most similar to ``new_data``.

    Args:
        new_data: Query document (a string of tags).
        vectorizer: Fitted vectorizer exposing ``transform``.
        cosine_sim: Not used by this function; kept so existing callers keep working.
        data: Documents to rank; not modified.
        top_n: Number of recommendations to return.

    Returns:
        A list of ``(document, index_in_data, similarity)`` tuples, best match
        first. Empty when ``data`` is empty.
    """
    docs = list(data)
    if not docs:
        # Nothing to compare the query against
        return []
    # Transform the corpus and the query with the pre-trained vectorizer
    doc_vectors = vectorizer.transform(docs)
    query_vector = vectorizer.transform([new_data])
    # Score the query against every document directly; the full pairwise
    # matrix over the corpus is not needed for a single query
    query_scores = cosine_similarity(query_vector, doc_vectors)[0]
    # Sort the documents by similarity score in descending order
    sim_scores = sorted(enumerate(query_scores), key=lambda x: x[1], reverse=True)
    # Get the top n recommendations
    top_docs = sim_scores[:top_n]
    return [(docs[i], i, j) for i, j in top_docs]

# Fit on the full corpus and write the artefacts to disk
train_tfidf(data, 'tfidf_model.pkl', stop_words='english', ngram_range=(1, 10))

# Read the persisted artefacts back
vectorizer, cosine_sim = load_tfidf('tfidf_model.pkl')

# Query the reloaded model and show each hit on its own line
recs = get_recommendations_tfidf(
    'Business Music', vectorizer, cosine_sim, data, top_n=5
)
for rec in recs:
    print(rec)


('{Business}', 52, 1.0)
('{Business}', 59, 1.0)
('{Business}', 356, 1.0)
('{Business}', 469, 1.0)
('{Business}', 788, 1.0)


## Export Model

In [None]:
import numpy as np  # needed for np.arange below; not imported by any earlier cell
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize the train and test data (vocabulary is learned from the train split only)
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='- ',lower=True)
tokenizer.fit_on_texts(train_data)
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# Pad the train and test sequences to the longest sequence in either split
max_len = max(len(seq) for seq in train_sequences + test_sequences)
train_padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Define the TensorFlow model: one output unit per document in `data`
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=1, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dense(len(data), activation='softmax')
])

optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.5, initial_accumulator_value=0.1)

# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model; each training document's label is its own position in train_data
model.fit(train_padded_sequences, np.arange(len(train_data)), epochs=10)

# Evaluate the model on the training set first, then on the held-out test set
train_loss, train_accuracy = model.evaluate(train_padded_sequences, np.arange(len(train_data)))
print("Train loss:", train_loss)
print("Train accuracy:", train_accuracy)
# NOTE(review): test labels are positions within test_data, which presumably do not
# correspond to the classes learned from train_data -- confirm the test metric is meaningful.
test_loss, test_accuracy = model.evaluate(test_padded_sequences, np.arange(len(test_data)))
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)


In [None]:
# NOTE(review): this relies on `get_recommendations_tf` being defined by an earlier cell.
# The call shape (query, data, top_n) matches the recommenders that take the corpus
# directly, not the one that takes a pre-trained vectorizer -- confirm which is intended.
recs = get_recommendations_tf('Beauty DIY Education', data, top_n=5)
# Print one recommendation per line
for i in recs:
  print(i)