In [43]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

In [45]:
actors = pd.read_csv('archive//actors.csv')
actors

Unnamed: 0,id,name,role
0,1000001,Margot Robbie,Barbie
1,1000001,Ryan Gosling,Ken
2,1000001,America Ferrera,Gloria
3,1000001,Ariana Greenblatt,Sasha
4,1000001,Issa Rae,Barbie
...,...,...,...
5798445,1941596,Marc Ma,Ba Cai/巴莱
5798446,1941596,线雨轩,Tata/塔塔
5798447,1941596,Jiang Yixuan,Zuo Yila（Zoila）/佐伊拉
5798448,1941597,Hiroshi Mikami,


In [46]:
actors = actors.groupby('id')['name'].agg(
    lambda x: '|'.join(
        set(', '.join(x.dropna().astype(str)).split(', '))
    )
).reset_index()
actors.rename(columns={'name': 'actors'}, inplace=True)
actors

Unnamed: 0,id,actors
0,1000001,Alex Sturman|Sia Dauda|George Basil|Cristian L...
1,1000002,Jung Ji-so|Jeong Ik-han|Kim Geon|Jeon Eun-mi|S...
2,1000003,Anthony Molinari|Daniel Scheinert|Tallie Medel...
3,1000004,Christie Cronenweth|Robby Robinson|Bennie Moor...
4,1000005,Morgan Larson|Emma Stone|Brandon O'Neal|Tommy ...
...,...,...
634297,1941559,黑灯
634298,1941563,Pan Binlong|黄子弘凡|Li Jian|Zhang Youhao|Lan Xiya...
634299,1941589,王军锋
634300,1941596,线雨轩|Marc Ma|Sandrine Pinna|Jiang Yixuan|Ethan ...


In [47]:
countries = pd.read_csv('archive//countries.csv')
countries

Unnamed: 0,id,country
0,1000001,UK
1,1000001,USA
2,1000002,South Korea
3,1000003,USA
4,1000004,Germany
...,...,...
693471,1941593,China
693472,1941594,USA
693473,1941595,USA
693474,1941596,China


In [48]:
countries = countries.groupby('id')['country'].agg(
    lambda x: '|'.join(
        set(', '.join(x.dropna().astype(str)).split(', '))
    )
).reset_index()
countries.rename(columns={'country': 'countries'}, inplace=True)
countries

Unnamed: 0,id,countries
0,1000001,UK|USA
1,1000002,South Korea
2,1000003,USA
3,1000004,Germany|USA
4,1000005,Hong Kong|USA
...,...,...
617187,1941593,China
617188,1941594,USA
617189,1941595,USA
617190,1941596,China


In [49]:
crew = pd.read_csv('archive//crew.csv')
crew

Unnamed: 0,id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
...,...,...,...
4720178,1941596,Casting,线雨轩
4720179,1941596,Editor,Eric Kwong Chi-Leung
4720180,1941596,Cinematography,Kenny Tse
4720181,1941596,Composer,胡小欧


In [50]:
crew = (
    crew.groupby(['id', 'role'])['name']
    .agg(lambda x: '|'.join(x.dropna().astype(str)))
    .unstack()
    .reset_index()
)
crew

role,id,Additional directing,Additional photography,Art direction,Assistant director,Camera operator,Casting,Choreography,Cinematography,Co-director,...,Production design,Set decoration,Songs,Sound,Special effects,Story,Stunts,Title design,Visual effects,Writer
0,1000001,George Cottle|John Sorapure,John Sorapure,David Doran|Jordana Finkel|Clara Gomez del Mor...,David Keadell|Danni Lizaitis|Josh Robertson|Ma...,Chris Bain|Simon Finney,Lucy Bevan|Allison Jones|Olivia Grant|Emily Bu...,Jennifer White,Rodrigo Prieto,,...,Sarah Greenwood,Katie Spencer|Ashley Swanson|Trixie Gardner|An...,Dua Lipa|Charli XCX|Billie Eilish|Finneas O'Co...,Robert Sharman|Chelsea Body|Dan Kenyon|Ai-Ling...,Lucy Thompson|Alex Robinson|Peter Treece|Sam P...,,Ingrid Kleinig|Roy Taylor|Donny Bailey|Hannah ...,,Nick Irving Allen|François Dumoulin|Erin Hewit...,Noah Baumbach|Greta Gerwig
1,1000002,Lee Jung-hoon,Lee Hyun,Mo So-ra,Kim Seong-sik|Yoon Young-woo,Yang Hyeon-seok,,,Hong Kyung-pyo,,...,Lee Ha-jun,Cho Won-woo|Noh Seung-goog|Song Suk-ki,Choi Woo-shik|Park Hyo-shin|Lee Ji-hye,Choi Tae-young|Eun Hee-soo|Shin i Na|Kim Byung...,,Bong Joon-ho,Yoo Sang-seob|Oh Se-yeong|Yoo Mi-jin|Kwon Ji-h...,,Hong Jeong-ho|Chan-jin Chris Kim|Jang Mi-jin|K...,Kim Dae-hwan|Bong Joon-ho|Han Jin-won
2,1000003,,Kanamé Onoyama,Amelia Brooke,Rod Smith|John Nasraway,Aaron M. Smith|Alex Kornreich|Mario Contini|Ar...,Sarah Halley Finn|Djinous Rowling,,Larkin Seiple,,...,Jason Kisvarday,Kelsi Ephraim|Lia Burton|Hunter Gorton|Sierra ...,Mitski|David Byrne,Jason Charbonneau|Ian Chase|Alexandra Fehrman|...,Charlie Bruchez|Jonathan Kombrinck,,Narayana Cabral|Elisabeth P. Carpenter|Timothy...,,Evan Halleck|Zak Stoltz|Jeff Desom|Ethan Feldb...,Daniel Kwan|Daniel Scheinert
3,1000004,,Steve Wolfe,Chris Gorak|Seth Reed|Josue Clotaire Fleurimond,Bob Wagner|Mike Topoozian,Conrad W. Hall|Chris Haarhoff,Laray Mayfield|Karen Meisels,,Jeff Cronenweth,,...,Alex McDowell,Jay Hart|Luis G. Hoyos|Hugo Santiago|Peter J. ...,,Ren Klyce|Richard Hymns|Steve Boeddeker|Jeff W...,Cliff Wenger|Wayne Burnes,,Michael Runyard|Jeff Imada|Richard Cetrone|Dav...,,Carlos Saldanha|Joshua I. Kolden|Andrea D'Amic...,Jim Uhls|Andrew Kevin Walker
4,1000005,,,Austin Gorg|Steven Light-Orr,Paula Case|Peter Kohn,Ari Robbins|Davon Slininger,Deborah Aquila|Tricia Wood,Mandy Moore,Linus Sandgren,,...,David Wasco,Sandy Reynolds-Wasco|Jennie Harris|Daniel Brad...,Benj Pasek|Justin Paul|Justin Hurwitz|Damien C...,Kevin Becker|James Ashwill|Blake Collins|John ...,Jeremy Hays,,Nicolas Bosc|Brandon Cornell|Olivia Courtney|D...,Michael Riley,Chris LeDoux|Tim LeDoux|Jason Sanford,Damien Chazelle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807057,1941552,,,,,,,,,,...,,,,,,,,,,
807058,1941560,,,,,,,,,,...,,,,,,,,,,
807059,1941563,,,,,,,,,,...,,,,,,,,,,
807060,1941589,,,,,,,,,,...,,,,,,,,,,


In [51]:
genres = pd.read_csv('archive//genres.csv')
genres

Unnamed: 0,id,genre
0,1000001,Comedy
1,1000001,Adventure
2,1000002,Comedy
3,1000002,Thriller
4,1000002,Drama
...,...,...
1046844,1941563,Drama
1046845,1941566,Crime
1046846,1941569,Crime
1046847,1941596,Action


In [52]:
genres = genres.groupby('id')['genre'].agg(
    lambda x: '|'.join(
        set(', '.join(x.dropna().astype(str)).split(', '))
    )
).reset_index()
genres.rename(columns={'genre': 'genres'}, inplace=True)
genres

Unnamed: 0,id,genres
0,1000001,Comedy|Adventure
1,1000002,Comedy|Drama|Thriller
2,1000003,Comedy|Action|Science Fiction|Adventure
3,1000004,Drama
4,1000005,Comedy|Drama|Romance|Music
...,...,...
676486,1941559,Comedy
676487,1941563,Drama
676488,1941566,Crime
676489,1941569,Crime


In [53]:
languages = pd.read_csv('archive//languages.csv')
languages

Unnamed: 0,id,type,language
0,1000001,Language,English
1,1000002,Primary language,Korean
2,1000002,Spoken language,English
3,1000002,Spoken language,German
4,1000002,Spoken language,Korean
...,...,...,...
1038757,1941593,Language,Chinese
1038758,1941594,Language,English
1038759,1941595,Language,English
1038760,1941596,Language,Chinese


In [54]:
languages = (
    languages.groupby(['id', 'type'])['language']
    .agg(lambda x: '|'.join(x.dropna().astype(str)))
    .unstack()
    .reset_index()
)
languages

type,id,Language,Primary language,Spoken language
0,1000001,English,,
1,1000002,,Korean,English|German|Korean
2,1000003,,English,Cantonese|Chinese|English
3,1000004,English,,
4,1000005,English,,
...,...,...,...,...
905573,1941593,Chinese,,
905574,1941594,English,,
905575,1941595,English,,
905576,1941596,Chinese,,


In [55]:
movies = pd.read_csv('archive//movies.csv')
movies

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [65]:
posters = pd.read_csv('archive//posters.csv')
posters.rename(columns={'link': 'poster'}, inplace=True)
posters

Unnamed: 0,id,poster
0,1000001,https://a.ltrbxd.com/resized/film-poster/2/7/7...
1,1000002,https://a.ltrbxd.com/resized/film-poster/4/2/6...
2,1000003,https://a.ltrbxd.com/resized/film-poster/4/7/4...
3,1000004,https://a.ltrbxd.com/resized/film-poster/5/1/5...
4,1000005,https://a.ltrbxd.com/resized/film-poster/2/4/0...
...,...,...
941592,1941593,
941593,1941594,
941594,1941595,https://a.ltrbxd.com/resized/film-poster/1/1/8...
941595,1941596,https://a.ltrbxd.com/resized/film-poster/1/1/8...


In [57]:
releases = pd.read_csv('archive//releases.csv')
releases

Unnamed: 0,id,country,date,type,rating
0,1000001,Andorra,2023-07-21,Theatrical,
1,1000001,Argentina,2023-07-20,Theatrical,ATP
2,1000001,Australia,2023-07-19,Theatrical,PG
3,1000001,Australia,2023-10-01,Digital,PG
4,1000001,Austria,2023-07-20,Theatrical,
...,...,...,...,...,...
1332777,1940967,USA,1909-01-01,Theatrical,
1332778,1940968,Sweden,1908-11-11,Theatrical,
1332779,1940969,France,1902-01-01,Theatrical,
1332780,1940970,France,1902-01-01,Theatrical,


In [58]:
studios = pd.read_csv('archive//studios.csv')
studios

Unnamed: 0,id,studio
0,1000001,LuckyChap Entertainment
1,1000001,Heyday Films
2,1000001,NB/GG Pictures
3,1000001,Mattel
4,1000001,Warner Bros. Pictures
...,...,...
679278,1941596,上海猫眼影业有限公司
679279,1941596,坏小子（北京）传媒有限公司
679280,1941596,亚太国影（重庆）文化传媒有限公司
679281,1941596,凤凰传奇影业有限公司


In [59]:
studios = studios.groupby('id')['studio'].agg(
    lambda x: '|'.join(
        set(', '.join(x.dropna().astype(str)).split(', '))
    )
).reset_index()
studios.rename(columns={'studio': 'studios'}, inplace=True)
studios

Unnamed: 0,id,studios
0,1000001,NB/GG Pictures|Warner Bros. Pictures|Heyday Fi...
1,1000002,Barunson E&A
2,1000003,Year of the Rat|IAC Films|Ley Line Entertainme...
3,1000004,Regency Enterprises|20th Century Fox|Taurus Fi...
4,1000005,Summit Entertainment|Gilbert Films|Marc Platt ...
...,...,...
438192,1941538,Helsinki-filmi
438193,1941539,Onza Entertainment
438194,1941541,Enlight Media
438195,1941557,滚石


In [60]:
themes = pd.read_csv('archive//themes.csv')
themes

Unnamed: 0,id,theme
0,1000001,Humanity and the world around us
1,1000001,Crude humor and satire
2,1000001,Moving relationship stories
3,1000001,Emotional and captivating fantasy storytelling
4,1000001,Surreal and thought-provoking visions of life ...
...,...,...
125636,1835643,Noir and dark crime dramas
125637,1835643,Intriguing and suspenseful murder mysteries
125638,1849827,Faith and religion
125639,1849827,Faith and spiritual journeys


In [61]:
themes = themes.groupby('id')['theme'].agg(
    lambda x: '|'.join(
        set(', '.join(x.dropna().astype(str)).split(', '))
    )
).reset_index()
themes.rename(columns={'theme': 'themes'}, inplace=True)
themes

Unnamed: 0,id,themes
0,1000001,Quirky and endearing relationships|Laugh-out-l...
1,1000002,Intense violence and sexual transgression|Endu...
2,1000003,Quirky and endearing relationships|Humanity an...
3,1000004,Intense violence and sexual transgression|Chal...
4,1000005,Captivating relationships and charming romance...
...,...,...
24503,1762425,Erotic relationships and desire|Song and dance
24504,1819348,Intense violence and sexual transgression|Twis...
24505,1826155,and slasher horror|gruesome|and terrifying hor...
24506,1835643,Intriguing and suspenseful murder mysteries|Tw...


In [66]:
movie = pd.merge(movies, genres, on = 'id')
movie = pd.merge(movie, countries, on = 'id')
movie = pd.merge(movie, languages, on = 'id')
movie = pd.merge(movie, actors, on = 'id')
movie = pd.merge(movie, crew, on = 'id')
movie = pd.merge(movie, studios, on = 'id')
movie = pd.merge(movie, themes, on = 'id')
movie = pd.merge(movie, posters, on = 'id')
movie

Unnamed: 0,id,name,date,tagline,description,minute,rating,genres,countries,Language,...,Sound,Special effects,Story,Stunts,Title design,Visual effects,Writer,studios,themes,poster
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86,Comedy|Adventure,UK|USA,English,...,Robert Sharman|Chelsea Body|Dan Kenyon|Ai-Ling...,Lucy Thompson|Alex Robinson|Peter Treece|Sam P...,,Ingrid Kleinig|Roy Taylor|Donny Bailey|Hannah ...,,Nick Irving Allen|François Dumoulin|Erin Hewit...,Noah Baumbach|Greta Gerwig,NB/GG Pictures|Warner Bros. Pictures|Heyday Fi...,Quirky and endearing relationships|Laugh-out-l...,https://a.ltrbxd.com/resized/film-poster/2/7/7...
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56,Comedy|Drama|Thriller,South Korea,,...,Choi Tae-young|Eun Hee-soo|Shin i Na|Kim Byung...,,Bong Joon-ho,Yoo Sang-seob|Oh Se-yeong|Yoo Mi-jin|Kwon Ji-h...,,Hong Jeong-ho|Chan-jin Chris Kim|Jang Mi-jin|K...,Kim Dae-hwan|Bong Joon-ho|Han Jin-won,Barunson E&A,Intense violence and sexual transgression|Endu...,https://a.ltrbxd.com/resized/film-poster/4/2/6...
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30,Comedy|Action|Science Fiction|Adventure,USA,,...,Jason Charbonneau|Ian Chase|Alexandra Fehrman|...,Charlie Bruchez|Jonathan Kombrinck,,Narayana Cabral|Elisabeth P. Carpenter|Timothy...,,Evan Halleck|Zak Stoltz|Jeff Desom|Ethan Feldb...,Daniel Kwan|Daniel Scheinert,Year of the Rat|IAC Films|Ley Line Entertainme...,Quirky and endearing relationships|Humanity an...,https://a.ltrbxd.com/resized/film-poster/4/7/4...
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27,Drama,Germany|USA,English,...,Ren Klyce|Richard Hymns|Steve Boeddeker|Jeff W...,Cliff Wenger|Wayne Burnes,,Michael Runyard|Jeff Imada|Richard Cetrone|Dav...,,Carlos Saldanha|Joshua I. Kolden|Andrea D'Amic...,Jim Uhls|Andrew Kevin Walker,Regency Enterprises|20th Century Fox|Taurus Fi...,Intense violence and sexual transgression|Chal...,https://a.ltrbxd.com/resized/film-poster/5/1/5...
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09,Comedy|Drama|Romance|Music,Hong Kong|USA,English,...,Kevin Becker|James Ashwill|Blake Collins|John ...,Jeremy Hays,,Nicolas Bosc|Brandon Cornell|Olivia Courtney|D...,Michael Riley,Chris LeDoux|Tim LeDoux|Jason Sanford,Damien Chazelle,Summit Entertainment|Gilbert Films|Marc Platt ...,Captivating relationships and charming romance...,https://a.ltrbxd.com/resized/film-poster/2/4/0...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23259,1628265,Coldd Lassi Aur Chicken Masala,2019.0,,"Fate brings ex-lovers Nitya, Indie Spice's hea...",336.0,,Drama,India,Hindi,...,,,,,,,Doris Dey|Jaya Mishra,Linga Bhairavi Devi Productions,Captivating relationships and charming romance...,https://a.ltrbxd.com/resized/film-poster/9/8/2...
23260,1659939,Marriage,2022.0,,Marriage sees married couple Ian and Emma nego...,240.0,,Drama,UK,English,...,Wayne Brooks|Nigel Heath|Richard Munns|Stuart ...,,,,,,,The Forge Entertainment|All3Media|The Money Men,Moving relationship stories|Emotional and touc...,https://a.ltrbxd.com/resized/film-poster/1/1/8...
23261,1664306,Fixerr,2019.0,,ATS Officer Jayveer Malik’s stands suspended a...,324.0,,Drama,India,Hindi,...,,,,,,,,Balaji Telefilms,High speed and special ops|Intense violence an...,https://a.ltrbxd.com/resized/film-poster/9/8/2...
23262,1762425,Pretty Things,2005.0,,A look into the world of 20th century burlesqu...,90.0,,Documentary,USA,English,...,,,,,,,Liz Goldwyn,HBO,Erotic relationships and desire|Song and dance,


In [67]:
movie.columns

Index(['id', 'name', 'date', 'tagline', 'description', 'minute', 'rating',
       'genres', 'countries', 'Language', 'Primary language',
       'Spoken language', 'actors', 'Additional directing',
       'Additional photography', 'Art direction', 'Assistant director',
       'Camera operator', 'Casting', 'Choreography', 'Cinematography',
       'Co-director', 'Composer', 'Costume design', 'Director', 'Editor',
       'Executive producer', 'Hairstyling', 'Lighting', 'Makeup',
       'Original writer', 'Producer', 'Production design', 'Set decoration',
       'Songs', 'Sound', 'Special effects', 'Story', 'Stunts', 'Title design',
       'Visual effects', 'Writer', 'studios', 'themes', 'poster'],
      dtype='object')

In [68]:
cols_to_combine = [
    'tagline', 'description', 'genres', 'countries', 'Language', 'Primary language', 'Spoken language',
    'actors', 'Additional directing', 'Additional photography', 'Art direction', 'Assistant director',
    'Camera operator', 'Casting', 'Choreography', 'Cinematography', 'Co-director', 'Composer',
    'Costume design', 'Director', 'Editor', 'Executive producer', 'Hairstyling', 'Lighting', 'Makeup',
    'Original writer', 'Producer', 'Production design', 'Set decoration', 'Songs', 'Sound',
    'Special effects', 'Story', 'Stunts', 'Title design', 'Visual effects', 'Writer', 'studios', 'themes'
]

movie['combined'] = movie[cols_to_combine].fillna('').agg('|'.join, axis=1)

In [85]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movie['combined'])

In [87]:
top_similarities = {}

for idx in range(tfidf_matrix.shape[0]):
    cosine_sim = linear_kernel(tfidf_matrix[idx], tfidf_matrix).flatten()
    
    # Get top 10 excluding itself
    top_indices = cosine_sim.argsort()[-11:-1][::-1]
    
    # Store as (movieId, similarity) tuples
    top_similarities[movie.iloc[idx]['movieId']] = [
        (movie.iloc[i]['movieId'], round(cosine_sim[i], 4)) for i in top_indices
    ]

In [97]:
similar_df = pd.DataFrame([
    {'movieId': key, 'similar_movies': value}
    for key, value in top_similarities.items()
])

In [98]:
with open("similar_movies.pkl", "wb") as f:
    pickle.dump(similar_df, f)

In [99]:
with open('movie.pkl', 'wb') as f:
    pickle.dump(movie, f)