In [1]:
import pandas as pd
import re
import numpy as np
import json

In [2]:
# List the contents of the archive/ directory that holds the input CSV files.
%ls 'archive/'

tmdb_5000_credits.csv  tmdb_5000_movies.csv


In [3]:
# Read archive/tmdb_5000_movies.csv into `df`, the frame every later cell works on.
df = pd.read_csv('archive/tmdb_5000_movies.csv')

In [4]:
# Overview of the columns: dtype and non-null count for each.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [5]:
def find_genre(text):
    """Return the name of the first genre listed in a TMDB ``genres`` cell.

    The cell is parsed as JSON instead of being tokenized with a regular
    expression, so the result does not depend on how many digits the genre id
    has or on how many words the genre name contains.

    Parameters
    ----------
    text : str
        JSON-encoded list of genre objects, e.g.
        ``'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'``.

    Returns
    -------
    str or float
        The ``name`` of the first genre, or ``np.nan`` when the list is empty,
        the value cannot be parsed as JSON, or the first entry has no name.
    """
    try:
        genres = json.loads(text)
    except (TypeError, ValueError):
        # Missing (NaN/None) or malformed cell: treat it as "no genre".
        return np.nan
    if not isinstance(genres, list) or not genres:
        return np.nan
    first = genres[0]
    name = first.get('name') if isinstance(first, dict) else None
    return name if name else np.nan

In [6]:
df['Genre'] = df['genres'].apply(find_genre) # keep only the first genre for now; the plan is to parse with json.loads later

In [7]:
# Sanity check: the distinct first-genre values that were extracted.
df['Genre'].unique()

array(['Action', 'Adventure', 'Fantasy', 'Animation', 'Science Fiction',
       'Drama', 'Thriller', 'Family', 'Comedy', 'History', 'War',
       'Western', 'Romance', 'Crime', 'Mystery', 'Horror', 'Documentary',
       'Music', 'TV Movie', nan, 'Foreign'], dtype=object)

In [8]:
# Convert every column name to Title case (e.g. 'release_date' -> 'Release_Date').
columns = [name.title() for name in df.columns.to_list()]
df.columns = columns

In [9]:
def _first_year(value):
    """Return the first run of four digits in ``str(value)``, or NaN if there is none."""
    found = re.search(r'\d{4}', str(value))
    if found is None:
        return np.nan
    return found.group()


# Release year, extracted from the release date string.
df['Release Year'] = df['Release_Date'].apply(_first_year)

In [10]:
df['Original_Language'].unique() # the language codes will be normalised to ISO 639 below

array(['en', 'ja', 'fr', 'zh', 'es', 'de', 'hi', 'ru', 'ko', 'te', 'cn',
       'it', 'nl', 'ta', 'sv', 'th', 'da', 'xx', 'hu', 'cs', 'pt', 'is',
       'tr', 'nb', 'af', 'pl', 'he', 'ar', 'vi', 'ky', 'id', 'ro', 'fa',
       'no', 'sl', 'ps', 'el'], dtype=object)

In [11]:
# Replace the two codes that are not usable ISO 639-1 values:
# 'cn' is rewritten to 'zh'; 'xx' (it occurred for a single film) is set to 'fr'.
iso_fixes = {'cn': 'zh', 'xx': 'fr'}
df['Original_Language'] = df['Original_Language'].apply(lambda code: iso_fixes.get(code, code))

In [12]:
# Download the ISO 639 reference page and keep its first HTML table.
# The next cell uses its 'ISO 639-1' (code) and 'EN' (English name) columns.
# NOTE(review): this cell needs network access, so the notebook cannot be
# re-run offline — consider caching the table locally.
lang_df = pd.read_html('https://snipp.ru/handbk/iso-639-2', encoding='utf-8')[0]

In [13]:
# Translate ISO 639-1 codes into English language names.
# The code -> name lookup is built once instead of filtering `lang_df` for
# every movie row. Table rows without a two-letter code are ignored; when a
# code occurs more than once the last row wins (same choice as `.values[-1]`).
_lang_rows = lang_df.dropna(subset=['ISO 639-1'])
lang_names = dict(zip(_lang_rows['ISO 639-1'], _lang_rows['EN']))

# Fail loudly, naming the offending codes, instead of an opaque IndexError.
unknown_codes = sorted(set(df['Original_Language'].dropna()) - set(lang_names))
if unknown_codes:
    raise KeyError('language codes missing from the ISO 639 table: {}'.format(unknown_codes))

df['Origin/Ethnicity'] = df['Original_Language'].map(lang_names)

In [14]:
# Copy the overview text into a 'Plot' column; 'Plot' is one of the column
# names selected later when this frame is concatenated with `df2`.
df['Plot'] = df['Overview']

In [15]:
# Preview the first rows, including the newly derived columns.
df.head(3)

Unnamed: 0,Budget,Genres,Homepage,Id,Keywords,Original_Language,Original_Title,Overview,Popularity,Production_Companies,...,Spoken_Languages,Status,Tagline,Title,Vote_Average,Vote_Count,Genre,Release Year,Origin/Ethnicity,Plot
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action,2009,English,"In the 22nd century, a paraplegic Marine is di..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Adventure,2007,English,"Captain Barbossa, long believed to be dead, ha..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Action,2015,English,A cryptic message from Bond’s past sends him o...


In [16]:
# Show every column when displaying wide frames. The fully qualified key is
# used because a bare 'max_columns' pattern can match more than one pandas
# option (e.g. 'styler.render.max_columns'), which raises OptionError.
pd.set_option('display.max_columns', None)

In [17]:
# Rename the homepage column to 'Url' by label rather than by position, so the
# cell keeps working if the column order ever changes.
df = df.rename(columns={'Homepage': 'Url'})
columns = df.columns.to_list()

In [18]:
# List archive/ again to look up the name of the credits file.
%ls archive/

tmdb_5000_credits.csv  tmdb_5000_movies.csv


In [19]:
# Load the credits table and immediately decode the JSON-encoded `crew` and
# `cast` columns into Python objects.
df_credits = pd.read_csv('archive/tmdb_5000_credits.csv')
for json_column in ('crew', 'cast'):
    df_credits[json_column] = df_credits[json_column].apply(json.loads)

In [20]:
# Preview the credits table after decoding `cast` and `crew`.
df_credits.head(5)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de..."


In [21]:
def get_cast_names(x):
    """Join the names of all cast members into one comma-separated string.

    Parameters
    ----------
    x : iterable of dict
        Cast entries; the name of each one is read with ``.get('name')``.

    Returns
    -------
    str
        The names joined by ``', '`` in their original order; an empty string
        when there are no named entries.
    """
    names = (member.get('name') for member in x)
    # An entry without a name yields None, which would make str.join raise
    # TypeError, so such entries are skipped.
    return ', '.join(name for name in names if name is not None)

In [22]:
# Flatten each decoded cast list into a single comma-separated string of names.
df_credits['Cast'] = df_credits['cast'].apply(get_cast_names)

In [23]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; ``job`` and ``name`` are read with ``.get``.

    Returns
    -------
    str or None
        The ``name`` of the first matching entry, or ``None`` when no entry
        has the job 'Director'.
    """
    director_names = (
        member.get('name') for member in x if member.get('job') == 'Director'
    )
    return next(director_names, None)

In [24]:
# First crew member credited as 'Director' for each movie (None when there is none).
df_credits['Director'] = df_credits['crew'].apply(get_director)

In [25]:
# Index the credits by movie_id so the join in the next cell can match on it.
# NOTE(review): this mutates df_credits in place, so re-running the cell fails
# because 'movie_id' is no longer a column.
df_credits.set_index('movie_id', inplace=True) 

In [26]:
# Join on df['Id'] against the movie_id index of df_credits; columns of
# df_credits whose names clash with df get the '_cred' suffix.
df_with_credits = df.join(df_credits, on='Id', rsuffix= '_cred') # now merge everything together

In [27]:
# Preview the joined frame (movie columns followed by the credits columns).
df_with_credits.head(3)

Unnamed: 0,Budget,Genres,Url,Id,Keywords,Original_Language,Original_Title,Overview,Popularity,Production_Companies,Production_Countries,Release_Date,Revenue,Runtime,Spoken_Languages,Status,Tagline,Title,Vote_Average,Vote_Count,Genre,Release Year,Origin/Ethnicity,Plot,title,cast,crew,Cast,Director
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action,2009,English,"In the 22nd century, a paraplegic Marine is di...",Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","Sam Worthington, Zoe Saldana, Sigourney Weaver...",James Cameron
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Adventure,2007,English,"Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","Johnny Depp, Orlando Bloom, Keira Knightley, S...",Gore Verbinski
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Action,2015,English,A cryptic message from Bond’s past sends him o...,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...",Sam Mendes


In [28]:
# now let's extend the dataset

In [29]:
# Read wiki_movie_plots_deduped.csv; the TMDB rows are appended to it below.
df2 = pd.read_csv('wiki_movie_plots_deduped.csv')

In [30]:
# Rename the second-to-last column of df2 to 'Url' so it lines up with the
# 'Url' column of the TMDB frame. `df2_columns` is reused later to select the
# matching columns from `df_with_credits`.
df2_columns = list(df2.columns)
df2_columns[len(df2_columns) - 2] = 'Url'
df2.columns = df2_columns

In [31]:
# Preview df2 after the column rename.
df2.head(4)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Url,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...


In [32]:
# Frames to stack: df2 as is, plus the TMDB frame restricted to df2's columns
# (in df2's column order).
to_concat = [df2,df_with_credits[df2_columns]]

In [33]:
# Stack the two frames row-wise; each keeps its own index labels at this point.
df_itog = pd.concat(to_concat)

In [34]:
# Replace the index labels carried over from the two source frames with a
# fresh 0..n-1 index. drop=True discards the old index directly, so no helper
# 'index' column is created and a real data column named 'index' is never lost.
df_itog = df_itog.reset_index(drop=True)

In [37]:
# Save the combined dataset to extended_dataset.csv.
# NOTE(review): the index is written as well (pandas default), so the file
# gets an extra unnamed first column — confirm downstream readers expect it.
df_itog.to_csv('extended_dataset.csv')

In [36]:
# List the working directory to check which files are present.
%ls

Genres.csv
Itog_research.ipynb
[34marchive[m[m/
archive.zip
extended_dataset.csv
kinopoisk-top250.csv
wiki_movie_plots_deduped.csv
Кинопоиск.ipynb
Марат Гарафутдинов. Школа DS. Итоговый проект (1).docx
Расширение датасета.ipynb
