In [1]:
import weaviate
client = weaviate.Client("http://localhost:8080")
client.schema.get()

{'classes': []}

In [2]:
!pip install -U sentence-transformers




In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_embedding(text):
    embeddings = model.encode(sentences)
    return embeddings


In [14]:
class_obj = {
    "class": "Video_text"
}
client.schema.create_class(class_obj)

In [15]:
class_obj1 = {
    "class": "Video_text_description"
}
client.schema.create_class(class_obj1)

In [18]:
class_obj2 = {
    "class": "Video_description"
}
client.schema.create_class(class_obj2)

In [19]:
import os
import json
input_directory = 'output_data'

In [20]:
json_files = [f for f in os.listdir(input_directory) if f.endswith('.json')]


In [21]:
json_files

['2016-02-02_2000_US_KNBC_4_News_at_Noon.v4.json',
 '2016-02-02_1800_US_HLN_The_Daily_Share.v4.json',
 '2016-02-02_1500_US_KABC_Good_Morning_America.v4.json',
 '2016-02-02_2000_US_CNN_Newsroom.v4.json',
 '2016-02-02_1800_US_CNN_Wolf.v4.json']

In [34]:
for file_name in json_files:
    input_file = os.path.join(input_directory, file_name)
    with open(input_file) as f:
        data = json.load(f)  
        metadata = data['metadata']['file']
        video_id = data['metadata']['text_id'][3:]
        with client.batch(batch_size=100) as batch:
            
            for sent in data['sentences']:
                
                embedding_video_text = model.encode(sent['sentence'])
                properties_text = {
                   "text": sent['sentence'],
                   "starttime" : sent['starttime'],
                   "endtime" : sent['endtime'],
                   "metadata" : metadata,
                   "video_id" : video_id
                }
                
                client.batch.add_data_object(
                    properties_text,
                    "Video_text",
                    vector = embedding_video_text
                )
                                
                combined_text = "In the video you can hear: " + sent['sentence'] + " In the video you can see: " + ", ".join([sentence.strip(" .") for sentence in sent['frame_data']]) + '.'
                embedding_video_text_desc = model.encode(combined_text)
                properties_video_text_desc = {
                   "text": combined_text,
                   "starttime" : sent['starttime'],
                   "endtime" : sent['endtime'],
                   "metadata" : metadata,
                   "video_id" : video_id
                }
                client.batch.add_data_object(
                    properties_video_text_desc,
                    "Video_text_description",
                     vector = embedding_video_text_desc
                )
                
                video_desc = ", ".join([sentence.strip(" .") for sentence in sent['frame_data']]) + '.'
                embedding_video_desc = model.encode(video_desc)
                properties_video_desc = {
                   "text": video_desc,
                   "starttime" : sent['starttime'],
                   "endtime" : sent['endtime'],
                   "metadata" : metadata,
                   "video_id" : video_id
                }
                client.batch.add_data_object(
                    properties_video_desc,
                    "Video_description",
                     vector = embedding_video_desc
                )
    print("file done")

file done
file done
file done
file done
file done


In [36]:
client.query.aggregate("Video_text_description").with_meta_count().do()

{'data': {'Aggregate': {'Video_text_description': [{'meta': {'count': 3057}}]}}}

In [37]:
## Vector Similarity Search

text_search_input = "Ted Cruz scores a huge victory"
image_search_input = "a group of people taking photos"
combined_text = "In the video you can hear: " + text_search_input + "In the video you can see: " + image_search_input
vector = model.encode(combined_text)

response = (
    client.query
    .get("Video_text_description", ["text", "starttime", "endtime", "metadata","video_id"])
    .with_near_vector({
        "vector" : vector
    })
    .with_limit(5)
    .with_additional(["distance"])
    .do()
)
print(json.dumps(response, indent=4))


{
    "data": {
        "Get": {
            "Video_text_description": [
                {
                    "_additional": {
                        "distance": 0.11348927
                    },
                    "endtime": "8.66",
                    "metadata": "/db/tv/2016/2016-02/2016-02-02/2016-02-02_1500_US_KABC_Good_Morning_America.txt",
                    "starttime": "5.96",
                    "text": "In the video you can hear: Ted Cruz scores a huge victory over Donald Trump . In the video you can see: a bunch of people that are talking to each other, a crowd of people taking pictures of a man in a suit, a group of people taking pictures with cell phones.",
                    "video_id": "4b9b3cd8_c9c6_11e5_b5ec_089e01ba0335"
                },
                {
                    "_additional": {
                        "distance": 0.22137702
                    },
                    "endtime": "214.87",
                    "metadata": "/db/tv/2016/2016-02/2016-02