In [1]:
import pandas as pd

In [65]:
movies = pd.read_csv('../data/MovieSummaries/movie.metadata.tsv', sep='\t', on_bad_lines='skip')
movies.columns = ['movie_id', 'meta1', 'movie_name', 'release_date', 'meta2', 'meta3', 'language', 'country', 'genres']
movies = movies.drop(columns = ['meta1', 'meta2', 'meta3'])

In [66]:
import ast

def extract_genres(genre_string):
    try:
        # Parse the string to a dictionary
        genre_dict = ast.literal_eval(genre_string)
        # Extract and join the genre names
        genres = ', '.join(genre_dict.values())
        return genres
    except:
        # Return an empty string or some default value if parsing fails
        return ''

movies['language'] = movies['language'].apply(extract_genres)
movies['country'] = movies['country'].apply(extract_genres)
movies['genres'] = movies['genres'].apply(extract_genres)

movies['language'] = movies['language'].str.replace('Language', '', regex=False)

In [67]:
movies

Unnamed: 0,movie_id,movie_name,release_date,language,country,genres
0,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama"
1,28463795,Brun bitter,1988,Norwegian,Norway,"Crime Fiction, Drama"
2,9363483,White Of The Eye,1987,English,United Kingdom,"Thriller, Erotic thriller, Psychological thriller"
3,261236,A Woman in Flames,1983,German,Germany,Drama
4,13696889,The Gangsters,1913-05-29,"Silent film, English",United States of America,"Short Film, Silent film, Indie, Black-and-whit..."
...,...,...,...,...,...,...
81735,35228177,Mermaids: The Body Found,2011-03-19,English,United States of America,Drama
81736,34980460,Knuckle,2011-01-21,English,"Ireland, United Kingdom","Biographical film, Drama, Documentary"
81737,9971909,Another Nice Mess,1972-09-22,English,United States of America,"Satire, Comedy"
81738,913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,Japanese,Japan,"Science Fiction, Japanese Movies, Adventure, A..."


In [68]:
plot_sum = pd.read_csv('../data/MovieSummaries/plot_summaries.txt', sep='\t', encoding='utf-8')
plot_sum.columns = ['movie_id', 'summaries']
plot_sum

Unnamed: 0,movie_id,summaries
0,31186339,The nation of Panem consists of a wealthy Capi...
1,20663735,Poovalli Induchoodan is sentenced for six yea...
2,2231378,"The Lemon Drop Kid , a New York City swindler,..."
3,595909,Seventh-day Adventist Church pastor Michael Ch...
4,5272176,The president is on his way to give a speech. ...
...,...,...
42297,34808485,"The story is about Reema , a young Muslim scho..."
42298,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42299,35102018,American Luthier focuses on Randy Parsons’ tra...
42300,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [69]:
# name_clus = pd.read_csv('../data/MovieSummaries/name.clusters.txt', sep='\t', encoding='utf-8')
# name_clus.columns = ['cluster', 'meta1']
# name_clus

In [70]:
# character = pd.read_csv('../data/MovieSummaries/character.metadata.tsv', sep='\t', encoding='utf-8')
# character.columns = ['movie_id', 'meta1', 'date', '']
# character

In [71]:
df = movies.merge(plot_sum)
df

Unnamed: 0,movie_id,movie_name,release_date,language,country,genres,summaries
0,9363483,White Of The Eye,1987,English,United Kingdom,"Thriller, Erotic thriller, Psychological thriller",A series of murders of rich young women throug...
1,261236,A Woman in Flames,1983,German,Germany,Drama,"Eva, an upper class housewife, becomes frustra..."
2,18998739,The Sorcerer's Apprentice,2002,English,South Africa,"Family Film, Fantasy, Adventure, World cinema","Every hundred years, the evil Morgana returns..."
3,6631279,Little city,1997-04-04,English,United States of America,"Romantic comedy, Ensemble Film, Comedy-drama, ...","Adam, a San Francisco-based artist who works a..."
4,171005,Henry V,1989-11-08,English,United Kingdom,"Costume drama, War film, Epic, Period piece, D...",{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...
...,...,...,...,...,...,...,...
42197,23851782,The Ghost Train,1941-05-03,English,United Kingdom,"Crime Fiction, Thriller, Comedy, Supernatural",{{plot}} The film opens with a Great Western e...
42198,35228177,Mermaids: The Body Found,2011-03-19,English,United States of America,Drama,Two former National Oceanic Atmospheric Admini...
42199,34980460,Knuckle,2011-01-21,English,"Ireland, United Kingdom","Biographical film, Drama, Documentary",{{No plot}} This film follows 12 years in the ...
42200,913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,Japanese,Japan,"Science Fiction, Japanese Movies, Adventure, A...","The story takes place in the year 2092,The Sup..."


In [72]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct
from tqdm import tqdm

df = df.set_index('movie_id')

def clean_text(text):
    # Add any necessary text cleaning steps here
    return text.lower().strip()

# Clean the summaries
df['clean_summary'] = df['summaries'].apply(clean_text)


In [75]:
def extract_year(date_str):
    if pd.isna(date_str):
        return pd.NaT
    
    # If it's already a year (4-digit number)
    if isinstance(date_str, str) and date_str.isdigit() and len(date_str) == 4:
        return int(date_str)
    
    # Try to parse as a date
    try:
        return pd.to_datetime(date_str).year
    except:
        return pd.NaT

# Apply the function to the DataFrame
df['release_year'] = df['release_date'].apply(extract_year)

In [81]:
# Create the text field
df['text'] = df.apply(lambda row: 
    f"movie name: {row['movie_name']}, " +
    (f"release year: {row['release_year']}, " if pd.notna(row['release_year']) else "") +
    f"movie language: {row['language']}, " +
    f"movie country: {row['country']}, " +
    f"movie genres: ({row['genres']}), " +
    f"movie summary: {row['clean_summary']}", 
    axis=1
)
df

Unnamed: 0_level_0,movie_name,release_date,language,country,genres,summaries,clean_summary,release_year,text
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9363483,White Of The Eye,1987,English,United Kingdom,"Thriller, Erotic thriller, Psychological thriller",A series of murders of rich young women throug...,a series of murders of rich young women throug...,1987,"movie name: White Of The Eye, release year: 19..."
261236,A Woman in Flames,1983,German,Germany,Drama,"Eva, an upper class housewife, becomes frustra...","eva, an upper class housewife, becomes frustra...",1983,"movie name: A Woman in Flames, release year: 1..."
18998739,The Sorcerer's Apprentice,2002,English,South Africa,"Family Film, Fantasy, Adventure, World cinema","Every hundred years, the evil Morgana returns...","every hundred years, the evil morgana returns...",2002,"movie name: The Sorcerer's Apprentice, release..."
6631279,Little city,1997-04-04,English,United States of America,"Romantic comedy, Ensemble Film, Comedy-drama, ...","Adam, a San Francisco-based artist who works a...","adam, a san francisco-based artist who works a...",1997,"movie name: Little city, release year: 1997, m..."
171005,Henry V,1989-11-08,English,United Kingdom,"Costume drama, War film, Epic, Period piece, D...",{{Plot|dateAct 1Act 2Act 3Act 4Act 5 Finally n...,{{plot|dateact 1act 2act 3act 4act 5 finally n...,1989,"movie name: Henry V, release year: 1989, movie..."
...,...,...,...,...,...,...,...,...,...
23851782,The Ghost Train,1941-05-03,English,United Kingdom,"Crime Fiction, Thriller, Comedy, Supernatural",{{plot}} The film opens with a Great Western e...,{{plot}} the film opens with a great western e...,1941,"movie name: The Ghost Train, release year: 194..."
35228177,Mermaids: The Body Found,2011-03-19,English,United States of America,Drama,Two former National Oceanic Atmospheric Admini...,two former national oceanic atmospheric admini...,2011,"movie name: Mermaids: The Body Found, release ..."
34980460,Knuckle,2011-01-21,English,"Ireland, United Kingdom","Biographical film, Drama, Documentary",{{No plot}} This film follows 12 years in the ...,{{no plot}} this film follows 12 years in the ...,2011,"movie name: Knuckle, release year: 2011, movie..."
913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,Japanese,Japan,"Science Fiction, Japanese Movies, Adventure, A...","The story takes place in the year 2092,The Sup...","the story takes place in the year 2092,the sup...",1992,movie name: The Super Dimension Fortress Macro...


In [83]:
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pandas as pd

# Load the model
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def generate_embedding(text):
    return encoder.encode(text)

# Enable tqdm for pandas
tqdm.pandas()

print("Generating embeddings...")

df['embedding'] = df['text'].head(1000).progress_apply(generate_embedding)

print("Embeddings generation complete.")

Generating embeddings...


100%|██████████| 1000/1000 [00:36<00:00, 27.77it/s]

Embeddings generation complete.





In [84]:
df = df.dropna()

In [None]:
# Initialize Qdrant client
host="localhost", port=6333
client = QdrantClient(host=host, port=port)

In [87]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

def create_and_upload_to_qdrant(df, client, collection_name="movies", encoder_model="all-MiniLM-L6-v2", batch_size=100):
    
    
    # Initialize SentenceTransformer
    encoder = SentenceTransformer(encoder_model)
    
    # Create collection
    vector_size = len(encoder.encode("Sample text"))
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
    )
    
    # Prepare data
    input_data = df[['movie_name', 'release_year', 'genres', 'language', 'country', 'text']].to_dict(orient='records')
    
    # Upload points in batches
    for i in range(0, len(input_data), batch_size):
        batch = input_data[i:i+batch_size]
        
        points = [
            models.PointStruct(
                id=idx + i,
                vector=encoder.encode(doc["text"]).tolist(),
                payload=doc
            )
            for idx, doc in enumerate(batch)
        ]
        
        client.upsert(
            collection_name=collection_name,
            points=points
        )
        
        print(f"Uploaded batch {i//batch_size + 1}/{(len(input_data)-1)//batch_size + 1}")
    
    print(f"Upload complete. Total points: {len(input_data)}")

create_and_upload_to_qdrant(df.head(200))

Uploaded batch 1/2
Uploaded batch 2/2
Upload complete. Total points: 200


In [99]:
def semantic_search(query, client, top_k=5):
    query_vector = generate_embedding(query)
    results = client.search(
        collection_name="movies",
        query_vector=query_vector,
        limit=top_k
    )
    return [
        {
            'movie_name': result.payload['movie_name'],
            'release_year': result.payload['release_year'],
            'genres': result.payload['genres'],
            'summary': result.payload['text'].split('movie summary: ')[1],
            'score': result.score
        } for result in results
    ]

In [105]:
semantic_search('tradition', client)

[{'movie_name': 'Tradition is a Temple',
  'release_year': 2011,
  'genres': 'Documentary',
  'summary': 'contemporary new orleans jazz musicians discuss their childhood introductions to music in baptist churches and through local traditions like second line and jazz funerals, and the role of danny barker in keeping traditional new orleans jazz alive through the 70’s and 80’s. asking the artists point blank, director darren hoffman explores the potential “death” of traditional jazz through modernization and marginalization and its preservation through mentorship and the continuation of traditions that intrigue and inspire young people to play the music of previous generations.',
  'score': 0.3117259},
 {'movie_name': 'Rudo y Cursi',
  'release_year': 2008,
  'genres': 'Sports, Drama, Family Drama, Comedy-drama, Comedy',
  'summary': "in the fictional farming village of tlachtlán, in the cihuatlán valley of jalisco, mexico, young men dream of escaping the drudgery of the banana plantati