In [None]:
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

In [None]:
# One seaborn theme for every figure in the notebook: white-grid style, font scale 1.1.
sns.set_theme(
    font_scale=1.1,
    style='whitegrid',
)

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='imdb_top_1000.csv')

In [None]:
# Column dtypes as read from the CSV, before any cleaning.
df.dtypes

In [None]:
# Per-column non-null counts and dtypes.
df.info()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Released_Year: anything that is not a number becomes <NA>; the nullable Int64
# dtype keeps the remaining years as integers instead of floats.
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce').astype('Int64')

# Runtime: drop the ' min' suffix and store plain integers.
# astype(str) makes the cell safe to re-run: on a second run the column is
# already int and the .str accessor would otherwise raise AttributeError.
# regex=False forces a literal match regardless of the pandas version's default.
df['Runtime'] = (
    df['Runtime']
    .astype(str)
    .str.replace(' min', '', regex=False)
    .astype(int)
)

def clean_gross(x):
    """Convert one raw Gross value to a float.

    Parameters
    ----------
    x : str, number, or missing value
        Either a string with comma thousands separators (e.g. ``'28,341,469'``),
        a value that is already numeric, or a missing marker such as NaN/None.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the value is missing or cannot
        be interpreted as a number.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        # Numeric input passes through unchanged, so applying the function to an
        # already-cleaned column (re-running the cell) no longer wipes it to NaN.
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures mean "no usable value"; unrelated errors
        # (including KeyboardInterrupt) are no longer swallowed.
        return np.nan
# Run clean_gross over every Gross entry and overwrite the column with the results.
df['Gross'] = df['Gross'].map(clean_gross)

def split_genres(x):
    """Split a comma-separated genre string into a list of whitespace-trimmed names."""
    return list(map(str.strip, x.split(',')))
# Store the parsed list of genres alongside the raw Genre string on every row.
df['Genre_list'] = df['Genre'].map(split_genres)
# Long-format Series: one entry per genre of each row, repeating the row's index label.
all_genres = df['Genre_list'].explode()

In [None]:
# Column dtypes again, to check the result of the cleaning cell.
df.dtypes

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Gross (y) against IMDB rating (x): raw points plus a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='IMDB_Rating', y='Gross', alpha=0.6, ax=ax)
# scatter=False draws only the fit on top of the scatterplot above;
# truncate=False extends the line to the axis limits.
sns.regplot(data=df, x='IMDB_Rating', y='Gross', scatter=False, truncate=False, ax=ax)
# The title names the dependent variable (gross, on y) first; the previous
# wording described the relationship the other way round.
ax.set_title('Зависимость кассовых сборов от рейтинга IMDB')
ax.set_xlabel('Рейтинг IMDB')
ax.set_ylabel('Кассовые сборы ($ США)')
fig.tight_layout()
plt.show()

In [None]:
# Join every non-missing Overview text into one string for the word cloud.
text = ' '.join(df['Overview'].dropna())
stopwords = set(STOPWORDS)
# random_state pins the randomised word placement and colouring, so the figure
# is the same on every Restart & Run All.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=stopwords,
    random_state=42,
).generate(text)

fig, ax = plt.subplots(figsize=(15, 7.5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Облако тэгов обзоров на фильмы')
plt.show()

In [None]:
# Five most frequent genres across all rows (a row counts once per genre it lists).
top5 = all_genres.value_counts().nlargest(5)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=top5.values, y=top5.index, ax=ax)
# Title spelled entirely in Cyrillic; the earlier literal mixed Latin "To" with
# Cyrillic "п", which can render inconsistently and defeats text search.
ax.set_title('Топ 5 жанров')
ax.set_xlabel('Количество')
ax.set_ylabel('Жанр')
fig.tight_layout()
plt.show()

In [None]:
# Share of each genre among all genre entries, drawn as a pie chart.
counts = all_genres.value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Распределение жанров')
# Equal aspect ratio keeps the pie circular.
ax.axis('equal')
plt.show()

In [None]:
# Flatten the four Star columns into one Series and count appearances per name.
actors = pd.Series(
    df[['Star1', 'Star2', 'Star3', 'Star4']].to_numpy().ravel()
).value_counts()
top5_actors = actors.nlargest(5)
ax = sns.barplot(x=top5_actors.values, y=top5_actors.index)
ax.set_title('Топ 5 актёров по количеству фильмов в топ 1000 IMDB')
ax.set_xlabel('Количество')
ax.set_ylabel('Актёр')
plt.show()


In [None]:
# Counter is imported in the notebook's top import cell.
genres_counter = Counter(all_genres)
top5_genres = [g for g, _ in genres_counter.most_common(5)]

# Rows that list at least one of the five most common genres.
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of df (which raises SettingWithCopyWarning).
in_top5 = df['Genre_list'].apply(lambda gl: any(g in top5_genres for g in gl))
subset = df[in_top5].copy()
# Label each row with the first of its listed genres that belongs to the top 5.
subset['primary_genre'] = subset['Genre_list'].apply(
    lambda gl: next((g for g in gl if g in top5_genres), None)
)

metrics = ['Runtime', 'No_of_Votes', 'Gross', 'IMDB_Rating']
palette = ["#0c09b6", "#08e0e7", '#FF0000', "#00A870", '#fed811']

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
fig.suptitle('Распределение данных среди топ 5 жанров', fontsize=18, weight=600, color='#333d29')

# One panel per metric, points grouped by primary genre.
for ax, metric in zip(axes.flat, metrics):
    # hue mirrors x so the palette is applied per genre; passing palette without
    # hue is deprecated in seaborn. legend=False suppresses the redundant legend.
    sns.stripplot(
        data=subset,
        x='primary_genre',
        y=metric,
        hue='primary_genre',
        palette=palette,
        legend=False,
        jitter=True,
        ax=ax,
    )
    ax.set_xlabel('Жанр')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} распределение по жанру')

# rect leaves room at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Pairwise correlations between the selected columns, annotated to two decimals.
corr = df[
    ['Released_Year', 'Runtime', 'IMDB_Rating', 'Meta_score', 'No_of_Votes', 'Gross']
].corr()
ax = sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
ax.set_title('Тепловая карта')
plt.show()