In [None]:
# libraries for topic modeling
import pandas as pd
import numpy as np
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import re
from collections import defaultdict 
from numpy import dot
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# connecting to SQL database
import os
from getpass import getpass

import mysql.connector
from mysql.connector import Error
pd.set_option('display.max_colwidth', None)

# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Host, schema and user fall back to the
# project's usual values; the password has no default and is prompted for
# when CARTOON_DB_PASSWORD is not set.
# NOTE(review): a password was previously committed in this cell and should
# be rotated on the server.
DB_HOST = os.environ.get('CARTOON_DB_HOST', 'dbnewyorkcartoon.cgyqzvdc98df.us-east-2.rds.amazonaws.com')
DB_NAME = os.environ.get('CARTOON_DB_NAME', 'new_york_cartoon')
DB_USER = os.environ.get('CARTOON_DB_USER', 'dbuser')
DB_PASSWORD = os.environ.get('CARTOON_DB_PASSWORD') or getpass('MySQL password: ')

try:
    connection = mysql.connector.connect(host=DB_HOST,
                                         database=DB_NAME,
                                         user=DB_USER,
                                         password=DB_PASSWORD)
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        # `cursor` is reused by the query cell that follows.
        cursor = connection.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("You succeed to connect to database: ", record)

except Error as e:
    print("Error while connecting to MySQL", e)
    # Fail here rather than letting the next cell die on an undefined `cursor`.
    raise

In [None]:
# Run the selection query against the database opened above.
sql_select_Query = "select caption,ranking from result;"  # edit this query to target different data
cursor.execute(sql_select_Query)

# The first entry of each cursor.description item is taken as the column name.
attr_names = [column_name for column_name, *_ in cursor.description]
num_attr = len(attr_names)
print(attr_names)

# Fetch every row of the result set and load it into a DataFrame.
records = cursor.fetchall()
print("Total number of rows in table: ", cursor.rowcount)
df = pd.DataFrame(data=records, columns=attr_names)
df

In [None]:
# Remove the unnecessary 'ranking' column; only the caption text is used below.
# errors='ignore' keeps this cell re-runnable: once 'ranking' has been dropped,
# running the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['ranking'], errors='ignore')

df.head()

In [None]:
# Create the "caption_processed" column: captions with punctuation removed,
# hyphens replaced by spaces, and all text lowercased.
df['caption'] = df['caption'].astype(str)


def _strip_punctuation(text):
    """Delete , . ! ? ' and " from `text`, then lowercase it."""
    return re.sub(r'[,\.\!\?\'\"]', '', text).lower()


def _hyphens_to_spaces(text):
    """Replace every hyphen in `text` with a space, then lowercase it."""
    return re.sub(r'[--]', ' ', text).lower()


df['caption_processed'] = df['caption'].map(_strip_punctuation).map(_hyphens_to_spaces)

# Preview the first rows, including the new column
df.head()

In [None]:
# Tokenizing and cleaning up text: start from the processed captions as a plain list
data = df['caption_processed'].values.tolist()

def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences`.

    Every item is converted with str() and passed to gensim's
    simple_preprocess with deacc=True; one token list is yielded per item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialise the token lists, one per caption
data_words = [tokens for tokens in sent_to_words(data)]

In [None]:
# Build the phrase models: bigrams are learned from the raw token lists, and
# trigrams from the corpus after the bigram model has been applied to it.
# A higher threshold means fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers: a faster way to club a sentence into bigrams/trigrams
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (bigram, trigram)
)

In [None]:
# Load spaCy's small English pipeline and take its default stop-word collection
en = spacy.load(name='en_core_web_sm')
stop_words = en.Defaults.stop_words

In [None]:
# Define functions for stopwords, bigrams, trigrams
def remove_stopwords(texts):
    """Remove stop words from every document in `texts`.

    Each document may be an already-tokenized list/tuple of words or any other
    object (typically a raw string). Token sequences are filtered as they are;
    anything else is converted with str() and tokenized with
    `simple_preprocess` first.

    Returns a list holding one list of kept tokens per input document.
    """
    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenized: filter directly. Stringifying the list and
            # re-tokenizing its repr would only reproduce the same tokens by
            # accident and could drop or alter tokens on the second pass.
            tokens = doc
        else:
            tokens = simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in stop_words])
    return cleaned

def make_bigrams(texts):
    """Return each document in `texts` after applying `bigram_mod` to it."""
    with_bigrams = []
    for tokens in texts:
        with_bigrams.append(bigram_mod[tokens])
    return with_bigrams

def make_trigrams(texts):
    """Return each document in `texts` after applying `bigram_mod`, then `trigram_mod`."""
    with_trigrams = []
    for tokens in texts:
        bigrammed = bigram_mod[tokens]
        with_trigrams.append(trigram_mod[bigrammed])
    return with_trigrams

In [None]:
# Step 1: filter stop words out of the tokenized captions
data_words_nostops = remove_stopwords(texts=data_words)

# Step 2: apply the bigram model to the filtered tokens
data_words_bigrams = make_bigrams(texts=data_words_nostops)

In [None]:
# Alias: the stop-word-filtered, bigram-applied token lists are the corpus
# handed to the document-tagging step below.
documents = data_words_bigrams

In [None]:
# creating document ids
def tagged_document(documents):
    """Yield one TaggedDocument per item of `documents`.

    Each item's words are tagged with a single-element list holding the
    item's zero-based position in `documents`.
    """
    for position, tokens in enumerate(documents):
        yield gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])

# Materialise the generator into a list; it is passed to both build_vocab and train below
tagged_documents = [tagged for tagged in tagged_document(documents)]

# Sanity check: show the first TaggedDocument
print(tagged_documents[0])

In [None]:
# for i, tagged_doc in enumerate(tagged_documents):
    # words = tagged_doc.words
    # print(f"Words in Document {i}: {words}")

In [None]:
# Doc2Vec model: hyper-parameters are gathered in one dict so they are easy to scan and tweak
doc2vec_params = dict(
    vector_size=200,
    window=10,
    min_count=5,
    dm=1,
    dbow_words=0,
    epochs=15,
    workers=6,
)
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)

In [None]:
# building the model vocabulary from the tagged captions; this has to run
# before the training cell below, which reads model.corpus_count
model.build_vocab(tagged_documents)

In [None]:
# Train on the tagged captions, reusing the example count and epoch setting stored on the model
model.train(
    tagged_documents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# saving the caption vectors to a NumPy .npz archive (np.savez does not compress; np.savez_compressed would)
def get_document_vectors_with_ids(model, tagged_docs):
    """Pair every tagged document with its trained vector.

    Args:
        model: object exposing `dv`, an indexable store of document vectors
            (here, the trained Doc2Vec model).
        tagged_docs: iterable of documents with a `words` attribute and,
            normally, a `tags` sequence.

    Returns:
        A list of `(label, words, vector)` tuples in input order, where
        `label` is "Document N" with N counted from 1.
    """
    document_vectors = []
    for position, tagged_doc in enumerate(tagged_docs):
        # Look the vector up by the document's own first tag instead of
        # iterating `model.dv` in parallel: that relied on implicit
        # sequence-style iteration and on both sides sharing the same order.
        # Documents without tags fall back to their position.
        tags = getattr(tagged_doc, "tags", None)
        key = tags[0] if tags else position
        document_vectors.append(
            (f"Document {position + 1}", tagged_doc.words, model.dv[key])
        )
    return document_vectors

# Gather one (label, words, vector) record per caption
document_vectors_with_ids = get_document_vectors_with_ids(
    model=model,
    tagged_docs=tagged_documents,
)

# Structured record layout: a short text label, the token list stored as a
# Python object, and a float32 vector of length model.vector_size
dtype = [
    ('doc_id', 'U20'),
    ('words', object),
    ('doc_vector', np.float32, (model.vector_size,)),
]
data = np.array(document_vectors_with_ids, dtype=dtype)

# Write the records to an NPZ archive under the key "data".
# NOTE: the 'words' field has object dtype, so reading the file back requires
# np.load(..., allow_pickle=True).
np.savez("caption_vectors.npz", data=data)

print("Document IDs, vectors, and words saved to 'caption_vectors.npz'.")