The data is right now saved as a mongo db objects. If we want to use it we must parse it to json.

In [75]:
import json
import re
    
def to_json(string: str):
    # Load the data as a string and parse it to json.

    # Find all JSON-like structures in the string
    matches = re.findall(r"(\{.*?\})", string, re.DOTALL)
    result = []

    for match in matches:
        try:
            # Remove the ObjectId(*)
            movie = re.sub(r'ObjectId\("([^"]*)"\)', r'"\1"', match)
            movie_obj = json.loads(movie)
            result.append(movie_obj)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            continue
    
    return result

In [76]:
import os
import pandas as pd

folder = './data/Movie_Poster_Metadata'
filenames = os.listdir(folder)
movies = pd.DataFrame()

for filename in filenames:
    filepath = f"{folder}/{filename}"

    try:
        with open(file=filepath, mode='r', encoding='utf-16') as file:
            content = file.read() 
            data = to_json(content)
    except:
        with open(file=filepath, mode='r', encoding='utf-8') as file:
            content = file.read() 
            data = to_json(content)

    df = pd.DataFrame(data)
    movies = pd.concat([movies, pd.DataFrame(data)])
movies    

Unnamed: 0,_id,Language,Box_office,Country,Rated,imdbID,Awards,Poster,Director,Released,Writer,imdbVotes,Runtime,Response,imdbRating,Title,Genre,Actors,Type,Metascore
0,56fb860e71a0651b0c306f55,English,290158751,USA,PG,tt0080684,Won 1 Oscar. Another 15 wins & 18 nominations.,http://ia.media-imdb.com/images/M/MV5BMjE2MzQw...,Irvin Kershner,20 Jun 1980,"Leigh Brackett (screenplay), Lawrence Kasdan (...",799579,124 min,True,8.8,Star Wars: Episode V - The Empire Strikes Back,"Action, Adventure, Fantasy","Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",movie,79
1,56fb860e71a0651b0c306f56,English,101500000,USA,R,tt0081562,1 nomination.,http://ia.media-imdb.com/images/M/MV5BMTM4OTI3...,Sidney Poitier,12 Dec 1980,Bruce Jay Friedman,17705,111 min,True,6.8,Stir Crazy,"Comedy, Crime","Gene Wilder, Richard Pryor, Georg Stanford Bro...",movie,50
2,56fb860e71a0651b0c306f57,English,83400000,USA,PG,tt0080339,Nominated for 1 Golden Globe. Another 2 wins &...,http://ia.media-imdb.com/images/M/MV5BNDU2MjE4...,"Jim Abrahams, David Zucker, Jerry Zucker",02 Jul 1980,"Jim Abrahams (written for the screen by), Davi...",154994,88 min,True,7.8,Airplane!,Comedy,"Kareem Abdul-Jabbar, Lloyd Bridges, Peter Grav...",movie,
3,56fb860e71a0651b0c306f58,English,,USA,PG,tt0080377,1 win.,http://ia.media-imdb.com/images/M/MV5BMTc5MTc5...,Buddy Van Horn,17 Dec 1980,"Jeremy Joe Kronsberg (characters), Stanford Sh...",12639,116 min,True,6.0,Any Which Way You Can,"Action, Comedy","Clint Eastwood, Sondra Locke, Geoffrey Lewis, ...",movie,
4,56fb860e71a0651b0c306f59,English,69800000,USA,R,tt0081375,Nominated for 3 Oscars. Another 1 win & 3 nomi...,http://ia.media-imdb.com/images/M/MV5BMjIyNjkx...,Howard Zieff,10 Oct 1980,"Nancy Meyers, Charles Shyer, Harvey Miller",17591,109 min,True,6.1,Private Benjamin,"Comedy, War","Goldie Hawn, Eileen Brennan, Armand Assante, R...",movie,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,570780c771a0650b20736c69,English,1711,USA,,tt2912776,,http://ia.media-imdb.com/images/M/MV5BMTk4ODE4...,John D. Hancock,27 Aug 2015,"Dorothy Tristan, Dorothy Tristan",9,110 min,True,7.2,The Looking Glass,"Drama, Music","Dorothy Tristan, Trish Basinger, Ed Ernstes, E...",movie,
391,570780c771a0650b20736c6a,English,1631,Australia,UNRATED,tt2241750,8 wins & 3 nominations.,http://ia.media-imdb.com/images/M/MV5BMTcwNzc2...,Dean Francis,04 Mar 2015,"Stephen Davis, Dean Francis, Stephen Davis (play)",547,93 min,True,5.7,Drown,"Drama, Sport","Matt Levett, Maya Stange, Jack Matthews, Harry...",movie,
392,570780c771a0650b20736c6b,English,1056,USA,,tt4797082,,,Muffie Meyer,30 Oct 2015,,,64 min,True,,Making Rounds,Documentary,,movie,
393,570780c771a0650b20736c6c,English,,USA,,tt5350540,,,,25 Aug 2015,,,8 min,True,,"Skin Trade: Behind the Action, Fighting You Ca...","Documentary, Short","Craig Baumgarten, Tony Jaa, Dolph Lundgren, Ro...",movie,


Now we have all the metdata but not a link between the photos and the metadata. To create a connection between the metdata and the photos i created a dataframe for the photos and merged them.

In [77]:
from PIL import Image

photos = pd.DataFrame({
    'imdbID': [],
    'filename': [],
    'path': [],
    'extension': [],
    'width': [],
    'height': [],
    'file_size_kb': []
})

for folder in os.listdir('./data/Movie_Poster_Dataset'):

    for filename in os.listdir(f"./data/Movie_Poster_Dataset/{folder}"):
        full_path = f"./data/Movie_Poster_Dataset/{folder}/{filename}"

        img = Image.open(full_path)
        width, height = img.size
        
        # Get file size
        file_size = os.path.getsize(full_path) / 1024 


        photos = pd.concat([photos, pd.DataFrame({
            "imdbID": [filename.split('.')[0]], 
            "filename": [filename],
            "path": [f"Movie_Poster_Dataset/{folder}/{filename}"],
            "extension": [filename.split('.')[-1]],
            'width': [width],
            'height': [height],
            'file_size_kb': [file_size],
            'aspect_ratio': [(width / height
            )]
        })])  


dataset = pd.merge(left=movies, right=photos, left_on="imdbID", right_on="imdbID")
dataset = dataset.rename(columns=lambda x: x.lower())
dataset

Unnamed: 0,_id,language,box_office,country,rated,imdbid,awards,poster,director,released,...,actors,type,metascore,filename,path,extension,width,height,file_size_kb,aspect_ratio
0,56fb860e71a0651b0c306f55,English,290158751,USA,PG,tt0080684,Won 1 Oscar. Another 15 wins & 18 nominations.,http://ia.media-imdb.com/images/M/MV5BMjE2MzQw...,Irvin Kershner,20 Jun 1980,...,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",movie,79,tt0080684.jpg,Movie_Poster_Dataset/1980/tt0080684.jpg,jpg,300.0,444.0,32.535156,0.675676
1,56fb860e71a0651b0c306f56,English,101500000,USA,R,tt0081562,1 nomination.,http://ia.media-imdb.com/images/M/MV5BMTM4OTI3...,Sidney Poitier,12 Dec 1980,...,"Gene Wilder, Richard Pryor, Georg Stanford Bro...",movie,50,tt0081562.jpg,Movie_Poster_Dataset/1980/tt0081562.jpg,jpg,300.0,438.0,42.684570,0.684932
2,56fb860e71a0651b0c306f57,English,83400000,USA,PG,tt0080339,Nominated for 1 Golden Globe. Another 2 wins &...,http://ia.media-imdb.com/images/M/MV5BNDU2MjE4...,"Jim Abrahams, David Zucker, Jerry Zucker",02 Jul 1980,...,"Kareem Abdul-Jabbar, Lloyd Bridges, Peter Grav...",movie,,tt0080339.jpg,Movie_Poster_Dataset/1980/tt0080339.jpg,jpg,300.0,444.0,30.916016,0.675676
3,56fb860e71a0651b0c306f58,English,,USA,PG,tt0080377,1 win.,http://ia.media-imdb.com/images/M/MV5BMTc5MTc5...,Buddy Van Horn,17 Dec 1980,...,"Clint Eastwood, Sondra Locke, Geoffrey Lewis, ...",movie,,tt0080377.jpg,Movie_Poster_Dataset/1980/tt0080377.jpg,jpg,300.0,432.0,42.120117,0.694444
4,56fb860e71a0651b0c306f59,English,69800000,USA,R,tt0081375,Nominated for 3 Oscars. Another 1 win & 3 nomi...,http://ia.media-imdb.com/images/M/MV5BMjIyNjkx...,Howard Zieff,10 Oct 1980,...,"Goldie Hawn, Eileen Brennan, Armand Assante, R...",movie,,tt0081375.jpg,Movie_Poster_Dataset/1980/tt0081375.jpg,jpg,300.0,444.0,30.539062,0.675676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,570780c771a0650b20736c65,French,2033,"France, Belgium",,tt4159182,,http://ia.media-imdb.com/images/M/MV5BMTY1Njc5...,Joann Sfar,18 Dec 2015,...,"Freya Mavor, Benjamin Biolay, Elio Germano, St...",movie,,tt4159182.jpg,Movie_Poster_Dataset/2015/tt4159182.jpg,jpg,300.0,443.0,30.335938,0.677201
8060,570780c771a0650b20736c66,English,,USA,,tt2265431,,http://ia.media-imdb.com/images/M/MV5BMTgzODUx...,Jaime Zevallos,07 Jul 2015,...,"Angela Sarafyan, Jaime Zevallos, Sean Nateghi,...",movie,,tt2265431.jpg,Movie_Poster_Dataset/2015/tt2265431.jpg,jpg,300.0,419.0,39.723633,0.715990
8061,570780c771a0650b20736c67,English,1903,USA,,tt3240784,,http://ia.media-imdb.com/images/M/MV5BMTY4NjUy...,"David T. Friendly, Mick Partridge",07 Aug 2015,...,,movie,50,tt3240784.jpg,Movie_Poster_Dataset/2015/tt3240784.jpg,jpg,300.0,442.0,41.201172,0.678733
8062,570780c771a0650b20736c69,English,1711,USA,,tt2912776,,http://ia.media-imdb.com/images/M/MV5BMTk4ODE4...,John D. Hancock,27 Aug 2015,...,"Dorothy Tristan, Trish Basinger, Ed Ernstes, E...",movie,,tt2912776.jpg,Movie_Poster_Dataset/2015/tt2912776.jpg,jpg,300.0,444.0,21.246094,0.675676


In [78]:
dataset

Unnamed: 0,_id,language,box_office,country,rated,imdbid,awards,poster,director,released,...,actors,type,metascore,filename,path,extension,width,height,file_size_kb,aspect_ratio
0,56fb860e71a0651b0c306f55,English,290158751,USA,PG,tt0080684,Won 1 Oscar. Another 15 wins & 18 nominations.,http://ia.media-imdb.com/images/M/MV5BMjE2MzQw...,Irvin Kershner,20 Jun 1980,...,"Mark Hamill, Harrison Ford, Carrie Fisher, Bil...",movie,79,tt0080684.jpg,Movie_Poster_Dataset/1980/tt0080684.jpg,jpg,300.0,444.0,32.535156,0.675676
1,56fb860e71a0651b0c306f56,English,101500000,USA,R,tt0081562,1 nomination.,http://ia.media-imdb.com/images/M/MV5BMTM4OTI3...,Sidney Poitier,12 Dec 1980,...,"Gene Wilder, Richard Pryor, Georg Stanford Bro...",movie,50,tt0081562.jpg,Movie_Poster_Dataset/1980/tt0081562.jpg,jpg,300.0,438.0,42.684570,0.684932
2,56fb860e71a0651b0c306f57,English,83400000,USA,PG,tt0080339,Nominated for 1 Golden Globe. Another 2 wins &...,http://ia.media-imdb.com/images/M/MV5BNDU2MjE4...,"Jim Abrahams, David Zucker, Jerry Zucker",02 Jul 1980,...,"Kareem Abdul-Jabbar, Lloyd Bridges, Peter Grav...",movie,,tt0080339.jpg,Movie_Poster_Dataset/1980/tt0080339.jpg,jpg,300.0,444.0,30.916016,0.675676
3,56fb860e71a0651b0c306f58,English,,USA,PG,tt0080377,1 win.,http://ia.media-imdb.com/images/M/MV5BMTc5MTc5...,Buddy Van Horn,17 Dec 1980,...,"Clint Eastwood, Sondra Locke, Geoffrey Lewis, ...",movie,,tt0080377.jpg,Movie_Poster_Dataset/1980/tt0080377.jpg,jpg,300.0,432.0,42.120117,0.694444
4,56fb860e71a0651b0c306f59,English,69800000,USA,R,tt0081375,Nominated for 3 Oscars. Another 1 win & 3 nomi...,http://ia.media-imdb.com/images/M/MV5BMjIyNjkx...,Howard Zieff,10 Oct 1980,...,"Goldie Hawn, Eileen Brennan, Armand Assante, R...",movie,,tt0081375.jpg,Movie_Poster_Dataset/1980/tt0081375.jpg,jpg,300.0,444.0,30.539062,0.675676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,570780c771a0650b20736c65,French,2033,"France, Belgium",,tt4159182,,http://ia.media-imdb.com/images/M/MV5BMTY1Njc5...,Joann Sfar,18 Dec 2015,...,"Freya Mavor, Benjamin Biolay, Elio Germano, St...",movie,,tt4159182.jpg,Movie_Poster_Dataset/2015/tt4159182.jpg,jpg,300.0,443.0,30.335938,0.677201
8060,570780c771a0650b20736c66,English,,USA,,tt2265431,,http://ia.media-imdb.com/images/M/MV5BMTgzODUx...,Jaime Zevallos,07 Jul 2015,...,"Angela Sarafyan, Jaime Zevallos, Sean Nateghi,...",movie,,tt2265431.jpg,Movie_Poster_Dataset/2015/tt2265431.jpg,jpg,300.0,419.0,39.723633,0.715990
8061,570780c771a0650b20736c67,English,1903,USA,,tt3240784,,http://ia.media-imdb.com/images/M/MV5BMTY4NjUy...,"David T. Friendly, Mick Partridge",07 Aug 2015,...,,movie,50,tt3240784.jpg,Movie_Poster_Dataset/2015/tt3240784.jpg,jpg,300.0,442.0,41.201172,0.678733
8062,570780c771a0650b20736c69,English,1711,USA,,tt2912776,,http://ia.media-imdb.com/images/M/MV5BMTk4ODE4...,John D. Hancock,27 Aug 2015,...,"Dorothy Tristan, Trish Basinger, Ed Ernstes, E...",movie,,tt2912776.jpg,Movie_Poster_Dataset/2015/tt2912776.jpg,jpg,300.0,444.0,21.246094,0.675676


Year 1992 has metadata but no images so thats why the dataframe is different in size.

The dates are also not processable right now so im going to split the into different columns

In [79]:
for index, row in dataset.iterrows():
    date = row['released']

    if not date or date == 'N/A':
        continue

    
    datestring = date.split(' ')
    if len(datestring) == 3:
        day, month, year = datestring
        dataset.at[index, 'day'] = int(day.strip())
        dataset.at[index, 'month'] = month.strip()
        dataset.at[index, 'year'] = int(year.strip())

dataset

Unnamed: 0,_id,language,box_office,country,rated,imdbid,awards,poster,director,released,...,filename,path,extension,width,height,file_size_kb,aspect_ratio,day,month,year
0,56fb860e71a0651b0c306f55,English,290158751,USA,PG,tt0080684,Won 1 Oscar. Another 15 wins & 18 nominations.,http://ia.media-imdb.com/images/M/MV5BMjE2MzQw...,Irvin Kershner,20 Jun 1980,...,tt0080684.jpg,Movie_Poster_Dataset/1980/tt0080684.jpg,jpg,300.0,444.0,32.535156,0.675676,20.0,Jun,1980.0
1,56fb860e71a0651b0c306f56,English,101500000,USA,R,tt0081562,1 nomination.,http://ia.media-imdb.com/images/M/MV5BMTM4OTI3...,Sidney Poitier,12 Dec 1980,...,tt0081562.jpg,Movie_Poster_Dataset/1980/tt0081562.jpg,jpg,300.0,438.0,42.684570,0.684932,12.0,Dec,1980.0
2,56fb860e71a0651b0c306f57,English,83400000,USA,PG,tt0080339,Nominated for 1 Golden Globe. Another 2 wins &...,http://ia.media-imdb.com/images/M/MV5BNDU2MjE4...,"Jim Abrahams, David Zucker, Jerry Zucker",02 Jul 1980,...,tt0080339.jpg,Movie_Poster_Dataset/1980/tt0080339.jpg,jpg,300.0,444.0,30.916016,0.675676,2.0,Jul,1980.0
3,56fb860e71a0651b0c306f58,English,,USA,PG,tt0080377,1 win.,http://ia.media-imdb.com/images/M/MV5BMTc5MTc5...,Buddy Van Horn,17 Dec 1980,...,tt0080377.jpg,Movie_Poster_Dataset/1980/tt0080377.jpg,jpg,300.0,432.0,42.120117,0.694444,17.0,Dec,1980.0
4,56fb860e71a0651b0c306f59,English,69800000,USA,R,tt0081375,Nominated for 3 Oscars. Another 1 win & 3 nomi...,http://ia.media-imdb.com/images/M/MV5BMjIyNjkx...,Howard Zieff,10 Oct 1980,...,tt0081375.jpg,Movie_Poster_Dataset/1980/tt0081375.jpg,jpg,300.0,444.0,30.539062,0.675676,10.0,Oct,1980.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,570780c771a0650b20736c65,French,2033,"France, Belgium",,tt4159182,,http://ia.media-imdb.com/images/M/MV5BMTY1Njc5...,Joann Sfar,18 Dec 2015,...,tt4159182.jpg,Movie_Poster_Dataset/2015/tt4159182.jpg,jpg,300.0,443.0,30.335938,0.677201,18.0,Dec,2015.0
8060,570780c771a0650b20736c66,English,,USA,,tt2265431,,http://ia.media-imdb.com/images/M/MV5BMTgzODUx...,Jaime Zevallos,07 Jul 2015,...,tt2265431.jpg,Movie_Poster_Dataset/2015/tt2265431.jpg,jpg,300.0,419.0,39.723633,0.715990,7.0,Jul,2015.0
8061,570780c771a0650b20736c67,English,1903,USA,,tt3240784,,http://ia.media-imdb.com/images/M/MV5BMTY4NjUy...,"David T. Friendly, Mick Partridge",07 Aug 2015,...,tt3240784.jpg,Movie_Poster_Dataset/2015/tt3240784.jpg,jpg,300.0,442.0,41.201172,0.678733,7.0,Aug,2015.0
8062,570780c771a0650b20736c69,English,1711,USA,,tt2912776,,http://ia.media-imdb.com/images/M/MV5BMTk4ODE4...,John D. Hancock,27 Aug 2015,...,tt2912776.jpg,Movie_Poster_Dataset/2015/tt2912776.jpg,jpg,300.0,444.0,21.246094,0.675676,27.0,Aug,2015.0


In [80]:
print(dataset.columns.tolist())

['_id', 'language', 'box_office', 'country', 'rated', 'imdbid', 'awards', 'poster', 'director', 'released', 'writer', 'imdbvotes', 'runtime', 'response', 'imdbrating', 'title', 'genre', 'actors', 'type', 'metascore', 'filename', 'path', 'extension', 'width', 'height', 'file_size_kb', 'aspect_ratio', 'day', 'month', 'year']


I wont need all of these columsn for training so i dropped the columns i do not need.

In [81]:
dataset.drop(['_id', 'country', 'box_office', 'language', 'rated', 'awards', 
            'poster', 'director', 'released', 'actors', 'metascore', 'writer', 
            'imdbvotes', 'response', 'imdbrating', 'title', 'type', 'runtime', 'day', 'month'], 
            axis=1, inplace=True)

dataset


Unnamed: 0,imdbid,genre,filename,path,extension,width,height,file_size_kb,aspect_ratio,year
0,tt0080684,"Action, Adventure, Fantasy",tt0080684.jpg,Movie_Poster_Dataset/1980/tt0080684.jpg,jpg,300.0,444.0,32.535156,0.675676,1980.0
1,tt0081562,"Comedy, Crime",tt0081562.jpg,Movie_Poster_Dataset/1980/tt0081562.jpg,jpg,300.0,438.0,42.684570,0.684932,1980.0
2,tt0080339,Comedy,tt0080339.jpg,Movie_Poster_Dataset/1980/tt0080339.jpg,jpg,300.0,444.0,30.916016,0.675676,1980.0
3,tt0080377,"Action, Comedy",tt0080377.jpg,Movie_Poster_Dataset/1980/tt0080377.jpg,jpg,300.0,432.0,42.120117,0.694444,1980.0
4,tt0081375,"Comedy, War",tt0081375.jpg,Movie_Poster_Dataset/1980/tt0081375.jpg,jpg,300.0,444.0,30.539062,0.675676,1980.0
...,...,...,...,...,...,...,...,...,...,...
8059,tt4159182,Thriller,tt4159182.jpg,Movie_Poster_Dataset/2015/tt4159182.jpg,jpg,300.0,443.0,30.335938,0.677201,2015.0
8060,tt2265431,"Comedy, Drama, Romance",tt2265431.jpg,Movie_Poster_Dataset/2015/tt2265431.jpg,jpg,300.0,419.0,39.723633,0.715990,2015.0
8061,tt3240784,Documentary,tt3240784.jpg,Movie_Poster_Dataset/2015/tt3240784.jpg,jpg,300.0,442.0,41.201172,0.678733,2015.0
8062,tt2912776,"Drama, Music",tt2912776.jpg,Movie_Poster_Dataset/2015/tt2912776.jpg,jpg,300.0,444.0,21.246094,0.675676,2015.0


In [83]:
# Save to Excel
dataset.to_excel('./data/data.xlsx', index=False, engine='openpyxl')