In [2]:
import pandas as pd
import re

In [3]:
import os

def find_repo_root(start_path):
    """
    Walk upwards from ``start_path`` and return the closest directory that looks
    like a repository root, so data paths resolve the same way on every machine.

    A directory counts as a root when it contains a ``.git`` directory or a
    ``README.md`` file. The starting directory itself is checked first.

    Parameters
    ----------
    start_path : str
        Path to begin the search from; it is made absolute before searching.

    Returns
    -------
    str or None
        Absolute path of the first matching directory, or ``None`` when the
        filesystem root is reached without finding a marker.
    """
    def _has_root_marker(directory):
        # Either marker is sufficient; the closest directory carrying one wins.
        git_dir = os.path.join(directory, '.git')
        readme = os.path.join(directory, 'README.md')
        return os.path.isdir(git_dir) or os.path.isfile(readme)

    candidate = os.path.abspath(start_path)
    parent = os.path.dirname(candidate)

    while not _has_root_marker(candidate):
        # dirname() of the filesystem root is the root itself: nothing left to climb.
        if parent == candidate:
            return None
        candidate, parent = parent, os.path.dirname(parent)

    return candidate

# Resolve the repository root once; every data path in this notebook is built from it.
root = find_repo_root(os.getcwd())
if root is None:
    # find_repo_root returns None when no marker is found. Without this check the
    # f-string paths used later would silently become "None/Data/..." and fail
    # with a misleading "file not found" error far from the real cause.
    raise FileNotFoundError(
        f"Could not find a repository root (.git directory or README.md) "
        f"at or above {os.getcwd()}"
    )

In [6]:
# Load the Letterboxd link table from the 2020 trope data folder and display it.
letterboxd_csv_path = f"{root}/Data/2020_trope_data/HIT_letterboxd_link_movies.csv"
df = pd.read_csv(letterboxd_csv_path)
df

Unnamed: 0,letterboxd_search,Movie
0,https://letterboxd.com/search/ABCs+of+Death+2+...,"{\n ""title"": ""abcs-of-death-2"",\n ""url"":..."
1,https://letterboxd.com/search/A+Beautiful+Day+...,"{\n ""title"": ""a-beautiful-day-in-the-neighb..."
2,https://letterboxd.com/search/A+Beautiful+Mind...,"{\n ""title"": ""a-beautiful-mind"",\n ""url""..."
3,https://letterboxd.com/search/A+Better+Tomorro...,"{\n ""title"": ""a-better-tomorrow"",\n ""url..."
4,https://letterboxd.com/search/A+Birder's+Guide...,"{\n ""title"": ""a-birders-guide-to-everything..."
...,...,...
10585,https://letterboxd.com/search/The+Day+Time+End...,"{\n ""title"": ""the-day-time-ended/"",\n ""u..."
10586,https://letterboxd.com/search/The+Men+Who+Trea...,"{\n ""title"": ""the-men-who-tread-on-the-tige..."
10587,https://letterboxd.com/search/The+Opposite+of+...,"{\n ""title"": ""the-opposite-of-sex/"",\n ""..."
10588,https://letterboxd.com/search/Top+Gun:+Maveric...,"{\n ""title"": ""top-gun-maverick/"",\n ""url..."


In [None]:
# Looks good, so load a second file...

# Keep the path under its own name so `df_2` only ever refers to the DataFrame.
second_json_path = f"{root}/Data/international-movies-json/international-movies-json/0a5dba0c-d6e3-4b79-a699-ccefa442b86d.json"
df_2 = pd.read_json(second_json_path)
# Column names first, then (rows, columns).
print(df_2.columns, df_2.shape, sep="\n")
df_2.head()

Index(['ImdbId', '_id', 'name', 'poster_url', 'year', 'certificate', 'runtime',
       'genre', 'ratingValue', 'summary_text', 'ratingCount', 'director',
       'cast'],
      dtype='object')
(250, 13)


Unnamed: 0,ImdbId,_id,name,poster_url,year,certificate,runtime,genre,ratingValue,summary_text,ratingCount,director,cast
0,tt10521318,tt10521318,Music for Getting Married,https://m.media-amazon.com/images/S/sash/NapCx...,2018,,,[],,Add a Plot,,,
1,tt10521312,tt10521312,Mi Hijo,https://m.media-amazon.com/images/S/sash/NapCx...,2018,,,[],,Add a Plot,,,
2,tt10521310,tt10521310,Un Tal Eduardo,https://m.media-amazon.com/images/M/MV5BNWE0OW...,2018,,,[],,Add a Plot,,"{'name': 'Aldo Garay', 'name_id': 'nm1212968'}",[]
3,tt10521308,tt10521308,Jonas Kaufmann: My Italy,https://m.media-amazon.com/images/M/MV5BOWMwND...,2017,,85 min,[Music],,Add a Plot,,,
4,tt10521306,tt10521306,Colmena,https://m.media-amazon.com/images/S/sash/NapCx...,2018,,,[],,Add a Plot,,,


In [None]:
# See if two of the IMDb JSON chunks combine painlessly.
#
# The recorded output of this cell (13 IMDb columns, 500 rows) cannot come from
# concatenating `df` and `df_2` as defined in this notebook: `df` is the Letterboxd
# CSV with different columns. To make the check reproducible on a fresh kernel,
# the two chunks are loaded explicitly here instead of relying on earlier state.
json_dir = f"{root}/Data/international-movies-json/international-movies-json"
# os.listdir returns entries in arbitrary order; sort so the same two files are picked every run.
sample_files = sorted(os.listdir(json_dir))[:2]

df_combo = pd.concat(
    [pd.read_json(f"{json_dir}/{name}") for name in sample_files],
    ignore_index=True,
)
print(df_combo.columns)
print(df_combo.shape)

Index(['ImdbId', '_id', 'name', 'poster_url', 'year', 'certificate', 'runtime',
       'genre', 'ratingValue', 'summary_text', 'ratingCount', 'director',
       'cast'],
      dtype='object')
(500, 13)


In [None]:
## see how many entries the JSON folder contains
json_dir_entries = os.listdir(f'{root}/Data/international-movies-json/international-movies-json')
print(len(json_dir_entries))

2535


In [None]:
# Cool. Now we can get programmatic about it and combine them all...
#
# Once you have run this (and saved the result), comment it out: it reads every
# file in the folder and prints one line per file.
#
# Each file is read into its own DataFrame and everything is concatenated ONCE at
# the end. Calling pd.concat inside the loop re-copies all previously loaded rows
# on every iteration, which becomes very slow with thousands of files.

json_dir = f"{root}/Data/international-movies-json/international-movies-json"

i = 0  # number of files processed; stays 0 if the folder is empty
frames = []
# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined frame the same on every machine.
for i, filename in enumerate(sorted(os.listdir(json_dir)), start=1):
    print(f"processing {i} :  {filename}")
    frames.append(pd.read_json(f"{json_dir}/{filename}"))

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df_combo = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    


In [None]:
# Sanity check on the combined frame: (rows, columns) first, then the column names.
print(df_combo.shape, df_combo.columns, sep="\n")

(633719, 13)
Index(['ImdbId', '_id', 'name', 'poster_url', 'year', 'certificate', 'runtime',
       'genre', 'ratingValue', 'summary_text', 'ratingCount', 'director',
       'cast'],
      dtype='object')


In [None]:
## This dataframe at least initially looks pretty good, so we save it as a CSV for later.
# Note: the CSV is added to .gitignore because it is too large to push back and forth,
# so you will have to create it yourself by running all of this code.

imdb_csv_path = f"{root}/Data/IMBD_Dataframe.csv"
# index=False: do not write the row index as an extra column.
df_combo.to_csv(imdb_csv_path, index=False)

In [None]:
# Read the combined IMDb table back from the saved CSV and preview the first rows.
saved_imdb_csv_path = f"{root}/Data/IMBD_Dataframe.csv"
imbb_df = pd.read_csv(saved_imdb_csv_path)
imbb_df.head()

Unnamed: 0,ImdbId,_id,name,poster_url,year,certificate,runtime,genre,ratingValue,summary_text,ratingCount,director,cast
0,tt0144999,tt0144999,"Isaia, horeve",https://m.media-amazon.com/images/M/MV5BZTE2YT...,1966,,72 min,['Comedy'],4.9,A spirited wife tries to keep her husband's ma...,66,"{'name': 'Kostas Asimakopoulos', 'name_id': 'n...","[{'name': 'Vasilis Avlonitis', 'name_id': 'nm0..."
1,tt0142966,tt0142966,Voi juku - mikä lauantai,https://m.media-amazon.com/images/M/MV5BODcxMz...,1979,,81 min,['Comedy'],4.3,Add a Plot,66,"{'name': 'Visa Mäkinen', 'name_id': 'nm0617473'}","[{'name': 'Matti Pulliainen', 'name_id': 'nm07..."
2,tt0142887,tt0142887,Syöksykierre,https://m.media-amazon.com/images/M/MV5BMTA3YT...,1981,,90 min,['Drama'],5.2,Add a Plot,66,"{'name': 'Tapio Suominen', 'name_id': 'nm08394...","[{'name': 'Kimmo Liukkonen', 'name_id': 'nm051..."
3,tt0142383,tt0142383,Le huitième jour,https://m.media-amazon.com/images/M/MV5BZWUwY2...,1960,,78 min,['Drama'],6.4,"Françoise, a thirty-year-old single typist liv...",66,"{'name': 'Marcel Hanoun', 'name_id': 'nm0360528'}","[{'name': 'Emmanuelle Riva', 'name_id': 'nm072..."
4,tt0142346,tt0142346,Half Way to Hell,https://m.media-amazon.com/images/M/MV5BNmNlMD...,1960,,75 min,"['Action', ' Romance', ' Western']",2.8,"Prior to the revolution, Senorita Maria San Ca...",66,"{'name': 'Victor Adamson', 'name_id': 'nm00115...","[{'name': 'Al Adamson', 'name_id': 'nm0011467'..."


In [None]:
# Load the movie -> tropes table; this file is semicolon-separated.
# NOTE(review): the printed columns after the later merge show ' Tropes' with a
# leading space, so the header in this file presumably has a space after the ';'.
# Confirm, and refer to the column as ' Tropes' (or strip it) accordingly.
tropes_csv_path = f"{root}/Data/movies-vs-tropes.csv"
df_tropes = pd.read_csv(tropes_csv_path, sep=';')
df_tropes.head()

Unnamed: 0,Movie,Tropes
0,PuertaDeHierro,"ArgentineMedia,BlackMagic,ChekhovsGun,Chekhov..."
1,BetterOffDead,"AbhorrentAdmirer,AcceptableTargets,ActorAllus..."
2,AWar,"CallBack,CigaretteOfAnxiety,CouldSayItBut,Dan..."
3,TeamAmericaWorldPolice,"ADegreeInUseless,AHeroToHisHometown,AbstractS..."
4,TheHillsHaveEyes2006,"AdaptationDistillation,AdaptationExpansion,Ad..."


In [None]:
def split_camel_case_title(text):
    """
    Turn a CamelCase title key such as ``'BadCompany2002'`` into ``'Bad Company 2002'``.

    Two passes are applied:

    1. a space is inserted before every capital letter that directly follows a
       non-space character;
    2. a space is inserted before a four-digit year (1900-2024) that directly
       follows a non-space, non-digit character (years are used to tell apart
       films that share a title).

    Requiring a preceding non-space character means that a title starting with a
    year (e.g. ``'2012'``) does not gain a leading space, and that applying the
    function to an already-spaced title leaves it unchanged, so re-running the
    cell is safe.

    Parameters
    ----------
    text : object
        The raw title. Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    object
        The spaced title, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # Pass 1: space before any capital letter that follows a non-space character.
    spaced = re.sub(r'(?<=\S)(?=[A-Z])', ' ', text)
    # Pass 2: space before a year that follows a non-space, non-digit character.
    return re.sub(r'(?<=[^\s\d])(19[0-9]{2}|20[0-1][0-9]|202[0-4])(?!\d)', r' \1', spaced)


df_tropes['Movie'] = df_tropes['Movie'].map(split_camel_case_title)
df_tropes.head(30)

Unnamed: 0,Movie,Tropes
0,Puerta De Hierro,"ArgentineMedia,BlackMagic,ChekhovsGun,Chekhov..."
1,Better Off Dead,"AbhorrentAdmirer,AcceptableTargets,ActorAllus..."
2,A War,"CallBack,CigaretteOfAnxiety,CouldSayItBut,Dan..."
3,Team America World Police,"ADegreeInUseless,AHeroToHisHometown,AbstractS..."
4,The Hills Have Eyes 2006,"AdaptationDistillation,AdaptationExpansion,Ad..."
5,The Face Of Fu Manchu,"ActuallyADoombot,AffablyEvil,ArchEnemy,Awesom..."
6,The Trip To Bountiful,"BittersweetEnding,EarnYourHappyEnding,ElderAb..."
7,Frantz,"AllLoveIsUnrequited,AnArmAndALeg,DaydreamSurp..."
8,Bad Company 2002,"BackUpTwin,BackupTwin,CoupDeGrace,EmergencyIm..."
9,The Last Stand,"ActionGirl,ArtisticLicenseCars,ArtisticLicens..."


In [None]:
## We don't expect every title to match, but let's see how far we get with a plain merge...

# Give the IMDb title column the same name as the key column of the tropes table.
imbb_df = imbb_df.rename(columns={'name': 'Movie'})

# No `how` is passed, so this is pandas' default inner join on the title.
# NOTE(review): matching on title alone may pair different films that happen to
# share a title - consider also checking the year; confirm before relying on it.
both_df = pd.merge(imbb_df, df_tropes, on='Movie')
print(both_df.shape, both_df.columns, sep="\n")
both_df.head()

(7219, 14)
Index(['ImdbId', '_id', 'Movie', 'poster_url', 'year', 'certificate',
       'runtime', 'genre', 'ratingValue', 'summary_text', 'ratingCount',
       'director', 'cast', ' Tropes'],
      dtype='object')


Unnamed: 0,ImdbId,_id,Movie,poster_url,year,certificate,runtime,genre,ratingValue,summary_text,ratingCount,director,cast,Tropes
0,tt0103231,tt0103231,Waiting,https://m.media-amazon.com/images/M/MV5BOTNkNz...,1991,,95 min,['Drama'],7.0,A teenage boy wants to leave his home town.,66,"{'name': 'Jackie McKimmie', 'name_id': 'nm0571...","[{'name': 'Noni Hazlehurst', 'name_id': 'nm037...","AlmightyJanitor,AlwaysSomeoneBetter,AmIRight,..."
1,tt0031829,tt0031829,Pukar,https://m.media-amazon.com/images/M/MV5BMjRlMW...,1939,Not Rated,165 min,"['Drama', ' History']",7.0,A love affair and two feuding families who pla...,66,"{'name': 'Sohrab Modi', 'name_id': 'nm0595271'}","[{'name': 'Sohrab Modi', 'name_id': 'nm0595271...","AFatherToHisMen,AvengingTheVillain,BattleCoup..."
2,tt3512976,tt3512976,Hustlers,https://m.media-amazon.com/images/M/MV5BMTgzNz...,2014,,105 min,['Drama'],7.6,A street smart clubowner is frustrated with th...,21,"{'name': 'Elvis Chuks', 'name_id': 'nm2256467'}","[{'name': 'Clarion Chukwurah', 'name_id': 'nm6...","AMistakeIsBorn,ActorAllusion,AdultFear,Advert..."
3,tt2712502,tt2712502,Reckless,https://m.media-amazon.com/images/M/MV5BOWNhZW...,2013 TV Movie,,60 min,['Drama'],7.3,"Centers on a resourceful problem solver who, w...",21,"{'name': 'Martin Campbell', 'name_id': 'nm0132...","[{'name': 'Ernie Hudson', 'name_id': 'nm000136...","AmusementPark,AsYouKnow,DramaticIrony,DrivenT..."
4,tt2982800,tt2982800,Leningrad,https://m.media-amazon.com/images/M/MV5BZTE5Nj...,2014,,137 min,"['Drama', ' History', ' War']",6.0,Add a Plot,7,"{'name': 'Igor Vishnevetsky', 'name_id': 'nm57...","[{'name': 'Pavel Bykov', 'name_id': 'nm5731032...","BattleInTheRain,BittersweetEnding,BookBurning..."


NameError: name 'pd' is not defined