In [252]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from collections import Counter
import ast
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

In [253]:
# Directory that holds the per-year CSV files (merged_movies_data_<year>.csv).
path = "archive-2/Data/all_years"
# Holds the per-year DataFrames that are concatenated into `dataset`.
datasets = []

In [254]:
# Load one CSV per year (2000 through 2025 inclusive) and stack them into a
# single frame. The list is rebuilt from scratch here, so re-running this cell
# cannot append the same years a second time.
datasets = [
    pd.read_csv(f'{path}/merged_movies_data_{year}.csv')
    for year in range(2000, 2026)
]
dataset = pd.concat(datasets, ignore_index=True)


In [255]:

# Save the concatenated frame as-is (this runs before the de-duplication and
# cleaning cells below), without writing the index column.
dataset.to_csv('combined_dataset.csv', index=False)

In [256]:
# Remove rows that are exact duplicates across all columns (first occurrence kept).
dataset = dataset.drop_duplicates()

In [257]:
# Drop every row that is missing a value in any of these columns, then renumber.
required_columns = ['méta_score', 'production_company', 'stars', 'MPA']
dataset = dataset.dropna(subset=required_columns).reset_index(drop=True)

print(len(dataset))
print(dataset['méta_score'].dtype)

9997
float64


In [258]:
# Remove rows with a missing Rating and renumber the index.
dataset = dataset.dropna(subset=['Rating']).reset_index(drop=True)

# Drop these four columns from the frame, then report remaining NaNs per column.
dropped_columns = ['gross_US_Canada', 'opening_weekend_Gross', 'budget', 'filming_locations']
dataset = dataset.drop(columns=dropped_columns)
print(dataset.isna().sum())

Title                    0
Year                     0
Duration                 0
MPA                      0
Rating                   0
Votes                    0
méta_score               0
description              0
Movie Link               0
writers                  0
directors                0
stars                    0
grossWorldWWide        428
release_date             0
countries_origin         0
production_company       0
awards_content        1190
genres                   0
Languages                0
dtype: int64


In [259]:
def convert_duration(duration):
    """Convert a duration string such as '2h 30m' into total minutes.

    Parameters
    ----------
    duration : str, number, or missing value
        Text with an optional '<n>h' part and an optional '<n>m' part.
        A numeric value is treated as already converted and returned
        unchanged, so the cell that applies this function can be re-run.

    Returns
    -------
    int or float
        Total minutes for a string, the value itself for numeric input, or
        ``np.nan`` for missing input and for text that contains neither an
        hour part nor a minute part.
    """
    if pd.isna(duration):
        return np.nan
    if not isinstance(duration, str):
        # Already numeric (e.g. the column was converted on a previous run).
        return duration

    hours = re.search(r'(\d+)h', duration)
    minutes = re.search(r'(\d+)m', duration)
    if hours is None and minutes is None:
        # Nothing recognisable: report as missing rather than as a 0-minute film.
        return np.nan

    h = int(hours.group(1)) if hours else 0
    m = int(minutes.group(1)) if minutes else 0
    return h * 60 + m
    

In [260]:
# Replace the textual durations with minute counts, element by element.
dataset['Duration'] = dataset['Duration'].map(convert_duration)


In [261]:
def convert_votes(votes):
    """Convert a vote count such as '12K', '1.5M' or '532' into a float.

    Parameters
    ----------
    votes : str, number, or missing value
        Text with an optional 'K' (thousands) or 'M' (millions) marker.
        Thousands separators (',') and surrounding whitespace are ignored.
        A numeric value is treated as already converted, so the cell that
        applies this function can be re-run.

    Returns
    -------
    float
        The vote count, or ``np.nan`` when the input is missing.
    """
    if pd.isna(votes):
        return np.nan
    if not isinstance(votes, str):
        # Already numeric (e.g. the column was converted on a previous run).
        return float(votes)

    text = votes.strip().replace(',', '')
    if 'K' in text:
        multiplier = 1000
    elif 'M' in text:
        multiplier = 1000000
    else:
        return float(text)
    # Pull out the numeric part that precedes the K/M marker.
    return float(re.search(r'([\d.]+)', text).group(1)) * multiplier
# Turn the textual vote counts into floats, element by element.
dataset['Votes'] = dataset['Votes'].map(convert_votes)

In [262]:
# Fill missing durations with the median of the known durations.
median_duration = dataset['Duration'].median()
dataset['Duration'] = dataset['Duration'].fillna(median_duration)

In [263]:
def convert_gross(gross):
    """Convert a gross amount such as '$1,234,567' into an integer.

    Parameters
    ----------
    gross : str, number, or missing value
        Text with an optional '$' sign and ',' thousands separators.
        A numeric value is treated as already converted and returned
        unchanged, so the cell that applies this function can be re-run.

    Returns
    -------
    int or float
        The amount as an ``int`` for a string, the value itself for numeric
        input, or ``np.nan`` when the input is missing.
    """
    if pd.isna(gross):
        return np.nan
    if not isinstance(gross, str):
        # Already numeric (e.g. the column was converted on a previous run).
        return gross
    return int(gross.replace('$', '').replace(',', ''))

# Turn the textual worldwide gross into numbers, element by element.
dataset['grossWorldWWide'] = dataset['grossWorldWWide'].map(convert_gross)

In [264]:

# Fill missing worldwide gross values with the median of the known values.
median_gross = dataset['grossWorldWWide'].median()
dataset['grossWorldWWide'] = dataset['grossWorldWWide'].fillna(median_gross)


In [265]:
# Parse the stringified cast lists into real Python lists.
dataset['stars'] = dataset['stars'].apply(ast.literal_eval)

# Flatten every row's cast into one list and count appearances per actor.
all_actors = [actor for actors in dataset['stars'] for actor in actors]
actor_counts = Counter(all_actors)

# NOTE: despite its name, `top_20` holds the 100 most frequent actors; the name
# is kept in case other cells refer to it.
top_20 = actor_counts.most_common(100)

# `top_actors` was printed without being defined in this notebook; build it
# here from the counts so the cell works on a fresh kernel.
top_actors = pd.DataFrame(top_20, columns=['Actor', 'Count'])

with pd.option_context('display.max_rows', None):
    print(top_actors)

                  Actor  Count
0     Samuel L. Jackson     62
1       Woody Harrelson     57
2          Willem Dafoe     55
3           Liam Neeson     52
4          J.K. Simmons     49
5         Nicole Kidman     49
6         Mark Wahlberg     46
7        Morgan Freeman     46
8        Cate Blanchett     46
9        Julianne Moore     46
10   Scarlett Johansson     46
11         Nicolas Cage     45
12           Bill Nighy     45
13       John Leguizamo     44
14       Robert De Niro     43
15          Owen Wilson     43
16           Matt Damon     42
17         Alec Baldwin     42
18        Colin Farrell     42
19          Naomi Watts     42
20        Ryan Reynolds     42
21       Rosario Dawson     41
22         James Franco     41
23          Ethan Hawke     41
24      Kristen Stewart     41
25        Paul Giamatti     40
26      Richard Jenkins     40
27        Tilda Swinton     40
28        Steve Buscemi     40
29        Jason Statham     40
30       Susan Sarandon     40
31      

In [266]:
# One-hot encode the MPA rating into dense 0/1 columns.
# `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and the old
# name was removed in 1.4; fall back to the old keyword on older versions.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

encoded_mpa = encoder.fit_transform(dataset[['MPA']])
encoded_mpa_df = pd.DataFrame(encoded_mpa, columns=encoder.get_feature_names_out(['MPA']))

# Swap the original column for its encoded columns. The index is reset first
# so the column-wise concat lines up row by row with the fresh 0..n-1 index
# of `encoded_mpa_df`.
dataset = dataset.drop(columns=['MPA']).reset_index(drop=True)
dataset = pd.concat([dataset, encoded_mpa_df], axis=1)


In [268]:
 print("Encoded columns:", encoded_mpa_df.columns.tolist())
print(encoded_mpa_df.head())

# Проверим, что сумма по строкам равна 1 (one-hot encoding)
print(encoded_mpa_df.sum(axis=1).value_counts())

Encoded columns: ['MPA_13+', 'MPA_16+', 'MPA_18+', 'MPA_Approved', 'MPA_G', 'MPA_MA-17', 'MPA_NC-17', 'MPA_Not Rated', 'MPA_PG', 'MPA_PG-13', 'MPA_R', 'MPA_TV-14', 'MPA_TV-G', 'MPA_TV-MA', 'MPA_TV-PG', 'MPA_TV-Y7', 'MPA_Unrated']
   MPA_13+  MPA_16+  MPA_18+  MPA_Approved  MPA_G  MPA_MA-17  MPA_NC-17  \
0      0.0      0.0      0.0           0.0    0.0        0.0        0.0   
1      0.0      0.0      0.0           0.0    0.0        0.0        0.0   
2      0.0      0.0      0.0           0.0    0.0        0.0        0.0   
3      0.0      0.0      0.0           0.0    0.0        0.0        0.0   
4      0.0      0.0      0.0           0.0    0.0        0.0        0.0   

   MPA_Not Rated  MPA_PG  MPA_PG-13  MPA_R  MPA_TV-14  MPA_TV-G  MPA_TV-MA  \
0            0.0     1.0        0.0    0.0        0.0       0.0        0.0   
1            0.0     0.0        1.0    0.0        0.0       0.0        0.0   
2            0.0     0.0        1.0    0.0        0.0       0.0        0.0   
3      

In [271]:
# Parse the stringified writer lists into Python objects.
dataset['writers'] = dataset['writers'].map(ast.literal_eval)

In [273]:
# Parse each of these stringified columns into Python objects.
for column in ['countries_origin', 'production_company', 'genres', 'Languages']:
    dataset[column] = dataset[column].apply(ast.literal_eval)

In [275]:
# Parse the stringified director lists into Python objects.
dataset['directors'] = dataset['directors'].map(ast.literal_eval)

In [None]:
# NOTE(review): `all_genres` is not assigned anywhere in the visible notebook and
# this cell has no execution count; on a fresh kernel this line would raise
# NameError unless the name is defined elsewhere. TODO confirm or define it.
all_genres