### Using TF-IDF for embedding and KMeans for clustering

In [86]:
import pandas as pd

# Load the cleansed query file. Per the original note, the leading rows are
# kept out of training, so training queries are taken by position from row 200.
raw_df = pd.read_csv('frameIo_dataset_cleansed.csv')

# Slice by position first so the hold-out boundary does not shift, then drop
# missing queries. A bare `DataFrame.dropna()` returns a new frame and leaves
# the original untouched, so NaN rows used to leak through as the string "nan".
train_queries = raw_df["Query"].iloc[200:].dropna()

# Keep only queries longer than one character once rendered as text.
data = [str(query) for query in train_queries if len(str(query)) > 1]
len(data)

2051

In [75]:
import spacy
spacy_en = spacy.load("en_core_web_sm")


def tokenize_en(text):
    """Return the token strings produced by the loaded pipeline's tokenizer for ``text``."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens


# Flatten the tokens of every query into one list.
wordlist = []
for query in data:
    wordlist.extend(tokenize_en(query))

len(wordlist)

19321

In [89]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Two-stage TF-IDF: raw term counts first, then the TF-IDF re-weighting.
vectorizer = CountVectorizer()
tf = TfidfTransformer()

term_counts = vectorizer.fit_transform(data)
X = tf.fit_transform(term_counts)
X

<2051x2295 sparse matrix of type '<class 'numpy.float64'>'
	with 14601 stored elements in Compressed Sparse Row format>

In [90]:
# Rank vocabulary terms by their TF-IDF weight summed over all documents.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

data1 = {'word': vocabulary,
         'tf-idf': X.toarray().sum(axis=0).tolist()}
df1 = pd.DataFrame(data1).sort_values(by="tf-idf", ascending=False, ignore_index=True)
df1.head(10)

Unnamed: 0,word,tf-idf
0,the,115.493576
1,to,79.185662
2,add,65.19431
3,text,55.542097
4,change,53.431842
5,this,53.077377
6,in,50.988048
7,it,50.006069
8,and,49.93792
9,music,49.898824


In [91]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the TF-IDF rows for evaluating the clustering.
# `X` is the TF-IDF matrix produced by the vectorizer cell; the name `TFIDF`
# used here before is not defined in the visible notebook and raises NameError
# on a fresh kernel.
X_train, X_test = train_test_split(X, test_size=0.1, random_state=42)

In [94]:
from sklearn.cluster import KMeans

# Cluster the TF-IDF training rows into a fixed number of groups with a fixed seed.
tfidf_kmeans_params = {"n_clusters": 8, "random_state": 42}
model = KMeans(**tfidf_kmeans_params)
model.fit(X_train)

KMeans(random_state=42)

In [103]:
# Score the held-out rows. KMeans.score returns a single float (the negative
# inertia), which matches the single float recorded as this cell's output;
# `model.predict(X_test)` would instead return one cluster label per row.
# NOTE(review): the original statement was commented out — confirm that
# `score` is what produced the recorded value.
model.score(X_test)

-1326.6812927693368

### Using SentenceTransformer for embeddings

In [44]:
import pandas as pd

# Load the cleansed query file. Training queries start at row 300: earlier
# rows are kept out of training (rows 200-299 are read as the validation set
# in a later cell).
raw_df = pd.read_csv('frameIo_dataset_cleansed.csv')

# Slice by position first so the split boundary does not shift, then drop
# missing queries. A bare `DataFrame.dropna()` returns a new frame and leaves
# the original untouched, so NaN rows used to leak through as the string "nan".
train_queries = raw_df["Query"].iloc[300:].dropna()

# Keep only queries longer than one character once rendered as text.
data = [str(query) for query in train_queries if len(str(query)) > 1]
data[0]

':34 can you show a better b-roll shot than this one. maybe us eating?'

In [45]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Embed every training query with the sentence-transformer model.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = embedder.encode(data)

num_clusters = 12
# Seed KMeans so that cluster ids (and therefore the cluster numbers printed
# below and in later cells) are the same on every run; unseeded, the
# initialisation is random and the numbering changes between executions.
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group the original sentences by their assigned cluster id.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence, cluster_id in zip(data, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)

# Show up to ten sentences per cluster (cluster numbers are printed 1-based).
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster[:10])
    print("")

Cluster  1
['start the video here and put on screen as well "New York\'s BEST Free View?"', 'To times square, put this on screen as well', ' flash frame ', 'in the end also flash frame (', 'flash frame', 'again flash frame', 'and again flash frame ', 'Can you Fix / Re-Edit - there is a pop-in image glitch that looks bad (text on screen for a few frames) at 00:33 - please check and edit this out.  The music also ends abruptly as well at the end.', 'Outstanding work!   I love the idea of the previous 6 edits then being re-used in this one!  Great idea, way to go.  ONE re-edit - there is a ghost frame @ 00:46 - one quick little shot that should not be there.  Can you fix?', 'He is saying 4 but on the screen is 5 ']

Cluster  2
['Zoom out to show this frame as the name is coming on screen', 'Please zoom in the above part for it to be visible properly.', 'Use some dynamic framing.. zoom into the grand ma\r\nzoom into the ma also ', 'Zoom th escreen in ', 'Zoom the above screen in and show w

In [63]:
from sklearn.metrics.pairwise import cosine_similarity

# Assign one ad-hoc input string to a cluster and report its cosine similarity
# to that cluster's centre.
input_text = "PELOTON"
cluster_centers = clustering_model.cluster_centers_

input_embedding = embedder.encode([input_text])
cluster_similarities = cosine_similarity(input_embedding, cluster_centers)[0]
predicted_cluster = clustering_model.predict(input_embedding)[0]

# Cluster ids are 0-based internally; the message shows them 1-based.
print(f"The input text '{input_text}' belongs to cluster {predicted_cluster+1} with similarity {cluster_similarities[predicted_cluster]}")

The input text 'PELOTON' belongs to cluster 4 with similarity 0.40356576442718506


In [53]:
# validation dataset: rows 200-299 of the cleansed query file
# NOTE(review): the other cells read 'frameIo_dataset_cleansed.csv' without the
# 'data/' prefix — confirm which location is correct.
valid_df = pd.read_csv('data/frameIo_dataset_cleansed.csv')

# Slice by position first, then drop missing queries. A bare
# `DataFrame.dropna()` returns a new frame and leaves the original untouched,
# so NaN rows used to leak through as the string "nan".
valid_queries = valid_df["Query"].iloc[200:300].dropna()

# Keep only queries longer than one character once rendered as text.
valid_data = [str(query) for query in valid_queries if len(str(query)) > 1]
valid_data[0]

'Divide this caption into two parts'

In [54]:
# Assign every validation query to a cluster and record its cosine similarity
# to that cluster's centre.
sim_list = []

# The centres do not change between queries, so look them up once.
cluster_centers = clustering_model.cluster_centers_

# The loop variable is `query`, not `data`: naming it `data` would overwrite
# the training-corpus list held in `data` with the last validation string.
for query in valid_data:
    input_embedding = embedder.encode([query])
    predicted_cluster = clustering_model.predict(input_embedding)[0]

    cluster_similarities = cosine_similarity(input_embedding, cluster_centers)[0]
    sim_list.append(cluster_similarities[predicted_cluster])

    # Cluster ids are 0-based internally; the message shows them 1-based.
    print(f"'{query}' belongs to cluster {predicted_cluster+1} with similarity {cluster_similarities[predicted_cluster]}")

'Divide this caption into two parts' belongs to cluster 11 with similarity 0.676762580871582
'Make sure all the captions are coming here in the red box throughout so they are visible.
Remove the black band behind it and only 5/6 words should come at one point of time.' belongs to cluster 11 with similarity 0.7327397465705872
'check the frame here' belongs to cluster 1 with similarity 0.6763469576835632
'Make sure all the captions are coming here in the red box throughout so they are visible.
Remove the black band behind it and only 5/6 words should come at one point of time.
Also make the subtitle font size smaller. ' belongs to cluster 11 with similarity 0.6632164716720581
'and the I O is ' belongs to cluster 8 with similarity 0.38930636644363403
'The text on screen behind it is getting cut ' belongs to cluster 5 with similarity 0.4778822958469391
'Of course, you did! ' belongs to cluster 7 with similarity 0.32758215069770813
'car purchase. ' belongs to cluster 8 with similarity 0.403

'Asked you to put captions above the red line, not below, it won't be visible.' belongs to cluster 11 with similarity 0.7463710308074951
'Captions missing here also. Put 1/2 lines only not 3/4 in one frame.
Sane as Video 1' belongs to cluster 1 with similarity 0.6150084733963013
'Put captions above red line ' belongs to cluster 11 with similarity 0.7564390897750854
'Maestre ' belongs to cluster 4 with similarity 0.3876780569553375
'Hi Aditya, font for the captions can they be like this please? https://www.instagram.com/p/CjlFXLDB4O5/ this is a great reference for what i am looking for. thank you ' belongs to cluster 11 with similarity 0.5322926044464111
'For this clip we should focus on one question that has the most engagement. In the video it is time stamp 4:43-6:52. It can be condensed to share the most interesting parts to keep it at 60 seconds.' belongs to cluster 1 with similarity 0.4293155372142792
'captions like this : https://www.instagram.com/p/CjlFXLDB4O5/' belongs to cluste

In [55]:
# Lowest similarity between any validation query and its assigned cluster centre.
lowest_similarity = min(sim_list)
lowest_similarity

0.21051472

In [None]:
import random

# Pick up to DEMOS_PER_CLUSTER example sentences from each cluster
# (presumably to serve as few-shot demonstrations, going by the variable name).
DEMOS_PER_CLUSTER = 3
demo_rng = random.Random(42)  # dedicated seeded generator so the picks are reproducible

demo_for_few_shot = []

for cluster in clustered_sentences:
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size for small (or empty) clusters.
    demo = demo_rng.sample(cluster, min(DEMOS_PER_CLUSTER, len(cluster)))
    demo_for_few_shot.append(demo)