In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/firdavs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/firdavs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the raw movie metadata table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('movie_dataset.csv')

In [3]:
df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1124109 entries, 0 to 1124108
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1124109 non-null  int64  
 1   title                 1124096 non-null  object 
 2   vote_average          1124109 non-null  float64
 3   vote_count            1124109 non-null  int64  
 4   status                1124109 non-null  object 
 5   release_date          942689 non-null   object 
 6   revenue               1124109 non-null  int64  
 7   runtime               1124109 non-null  int64  
 8   adult                 1124109 non-null  bool   
 9   backdrop_path         299466 non-null   object 
 10  budget                1124109 non-null  int64  
 11  homepage              119303 non-null   object 
 12  imdb_id               599422 non-null   object 
 13  original_language     1124109 non-null  object 
 14  original_title        1124096 non-

In [5]:
# Number of rows whose average vote is strictly greater than 9.
int((df['vote_average'] > 9).sum())

27144

In [6]:
# Stage 1: keep rows with at-least-average runtime and vote count (both means
# are taken over the full table) and an average vote of 5 or more.
temp_df = df.loc[
    df['runtime'].ge(df['runtime'].mean())
    & df['vote_count'].ge(df['vote_count'].mean())
    & df['vote_average'].ge(5)
]

# Stage 2: of those, keep rows with at-least-average popularity (mean taken
# over the stage-1 rows) and renumber the index from 0.
filtered_df = (
    temp_df
    .loc[temp_df['popularity'].ge(temp_df['popularity'].mean())]
    .reset_index(drop=True)
)

In [7]:
# Independent copy of the descriptive columns, so the NaN filling applied to
# index_data does not modify filtered_df.
index_data = filtered_df[['id', 'title', 'vote_average', 'overview', 'genres', 'keywords']].copy()

def replace_nan_with_empty(df):
    """Fill every missing cell of ``df`` with the string "empty", in place.

    The number of missing cells found before filling is printed.

    Args:
        df: DataFrame to clean; it is mutated by this call.

    Returns:
        The same DataFrame object that was passed in.
    """
    missing_cells = df.isna().sum().sum()
    df.fillna(value="empty", inplace=True)
    print(f"Total NaN values corrected: {missing_cells}")
    return df

index_data = replace_nan_with_empty(index_data)

Total NaN values corrected: 758


In [8]:
# Genre name -> numeric genre id used to encode the `genres` column.
movie_genres = {
    'Action': 28, 'Adventure': 12, 'Animation': 16, 'Comedy': 35, 'Crime': 80, 'Documentary': 99,
    'Drama': 18, 'Family': 10751, 'Fantasy': 14, 'History': 36, 'Horror': 27, 'Music': 10402,
    'Mystery': 9648, 'Romance': 10749, 'Science Fiction': 878, 'TV Movie': 10770, 'Thriller': 53,
    'War': 10752, 'Western': 37
}

def convert_genres(genre_str):
    """Convert a comma-separated genre string into a list of genre ids.

    Args:
        genre_str: Genre names separated by commas (surrounding whitespace is
            ignored), e.g. "Action, Science Fiction". A list/tuple is treated
            as an already-converted value. Anything else (NaN, None, ...) is
            treated as "no genres".

    Returns:
        list: Ids from ``movie_genres`` in the order the names appear; names
        that are not keys of ``movie_genres`` are skipped.
    """
    # Already converted: return a copy so re-running the notebook cell on the
    # encoded column is a no-op instead of an error.
    if isinstance(genre_str, (list, tuple)):
        return list(genre_str)
    if not isinstance(genre_str, str):
        return []
    # Split on the bare comma and strip each piece, so both "A, B" and "A,B"
    # are parsed.
    names = (name.strip() for name in genre_str.split(','))
    return [movie_genres[name] for name in names if name in movie_genres]

# Replace each movie's genre string with its list of numeric genre ids
# (overwrites the column).
filtered_df['genres'] = filtered_df['genres'].apply(convert_genres)

In [9]:
# Split each comma-separated keyword string into a list. Values that are
# already lists are kept unchanged, so re-running this cell does not wipe the
# column; anything else (e.g. NaN) becomes an empty list.
filtered_df['keywords'] = filtered_df['keywords'].apply(
    lambda x: x.split(', ') if isinstance(x, str) else (x if isinstance(x, list) else [])
)

In [10]:
# Shared by create_and_preprocess_tags: the English stop-word set used to
# filter tokens, and the WordNet lemmatizer applied to the remaining tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def create_and_preprocess_tags(row):
    """Build the normalized "tags" string for one movie row.

    The row's genre ids, title, overview and keywords are concatenated behind
    the labels "Genre", "Title", "Overview" and "Keywords". The text is then
    lower-cased, stripped of every character that is neither a word character
    nor whitespace, filtered against ``stop_words`` and lemmatized with
    ``lemmatizer``.

    Args:
        row: Row-like mapping providing ``genres`` (iterable of ids),
            ``keywords`` (iterable of strings), ``title`` and ``overview``.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces.
    """
    def _text(value):
        # A missing cell would otherwise be formatted as the literal word
        # "nan" and end up as a token in the tags.
        return '' if pd.isna(value) else str(value)

    # Join the individual fields into one labelled string.
    genre_str = ' '.join(map(str, row['genres']))
    keywords_str = ' '.join(row['keywords'])
    title_str = _text(row['title'])
    overview_str = _text(row['overview'])
    combined_text = f"Genre {genre_str} Title {title_str} Overview {overview_str} Keywords {keywords_str}"

    combined_text = re.sub(r'[^\w\s]', '', combined_text.lower())

    # Drop stop words, then lemmatize what is left.
    words = [lemmatizer.lemmatize(word) for word in combined_text.split() if word not in stop_words]

    return ' '.join(words)

# Build the per-movie tag string (row-wise apply over the whole frame).
filtered_df['tags'] = filtered_df.apply(create_and_preprocess_tags, axis=1)

# Lookup table keyed by movie id, built from the NaN-filled index_data copy;
# each value is a dict of that row's remaining columns.
data_dict = index_data.set_index('id').to_dict(orient='index')
print(filtered_df['tags'].head(5))

0    genre 28 878 12 title inception overview cobb ...
1    genre 12 18 878 title interstellar overview ad...
2    genre 18 28 80 53 title dark knight overview b...
3    genre 28 12 14 878 title avatar overview 22nd ...
4    genre 878 28 12 title avenger overview unexpec...
Name: tags, dtype: object


In [11]:
# Series mapping movie id -> row label in filtered_df (labels run from 0 after
# the earlier reset_index(drop=True)).
indices = pd.Series(filtered_df.index, index=filtered_df['id'])

# Count Vectorizer

In [12]:
# NOTE: `cv` is the document-term count matrix returned by fit_transform (one
# row per entry of filtered_df['tags']), not the CountVectorizer itself; the
# fitted vectorizer object is not kept.
cv = CountVectorizer().fit_transform(filtered_df['tags'])

In [13]:
def get_recommendations(movie_title, top_n=9):
    """Return the titles of the movies most similar to ``movie_title``.

    The query is resolved against the module-level ``filtered_df['title']``:
    a case-insensitive exact match is preferred; otherwise the first title
    containing the query as a literal, case-insensitive substring is used.
    Similarity is the cosine similarity between rows of the module-level
    count matrix ``cv``, whose rows are assumed to be in the same order as
    ``filtered_df``.

    Args:
        movie_title: Title, or part of a title, to look up.
        top_n: Maximum number of recommendations to return. Defaults to 9,
            the number the previous hard-coded slice produced.

    Returns:
        list[str] | None: Up to ``top_n`` titles ordered from most to least
        similar, never including the query movie itself. ``None`` (after
        printing a message) when no title matches.
    """
    titles = filtered_df['title']

    positions = np.array([], dtype=int)
    if isinstance(movie_title, str):
        # Prefer an exact title so that a short query such as "Up" is not
        # resolved to an unrelated title that merely contains it.
        exact = titles.str.lower().eq(movie_title.lower())
        positions = np.flatnonzero(exact.to_numpy(dtype=bool, na_value=False))
        if len(positions) == 0:
            # regex=False: characters such as "(" or "?" in a title are
            # matched literally. na=False: missing titles never match.
            partial = titles.str.contains(movie_title, case=False, regex=False, na=False)
            positions = np.flatnonzero(partial.to_numpy(dtype=bool, na_value=False))

    if len(positions) == 0:
        print("Фильм не найден. Попробуйте другое название.")
        return

    idx = positions[0]  # position of the first matching row
    similarity_scores = cosine_similarity(cv[idx], cv).flatten()

    # Rank every movie from most to least similar, then drop the query movie
    # explicitly rather than assuming it is the single top-scoring entry.
    ranked = similarity_scores.argsort()[::-1]
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return filtered_df['title'].iloc[ranked].tolist()

In [14]:
get_recommendations("Toy Story")

['Toy Story 2',
 'Toy Story 4',
 'Toy Story 3',
 'Small Soldiers',
 'Toys',
 'Welcome to Marwen',
 'Buzz Lightyear of Star Command: The Adventure Begins',
 'The Indian in the Cupboard',
 'The Beanie Bubble']

# Calculate Metrics

In [15]:
def calculate_precision_at_k(actual, predicted, k):
    """
    Compute Precision@K.

    The number of distinct items that appear both in ``actual`` and in the
    first ``k`` entries of ``predicted`` is divided by ``k``. Returns 0 when
    ``k`` is not positive.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    if k > 0:
        hits = set(actual).intersection(set(top_k))
        return len(hits) / k
    return 0

def calculate_mrr(actual, predicted):
    """
    Compute the reciprocal rank of the first relevant prediction.

    Returns ``1 / rank`` (ranks start at 1) of the first entry of
    ``predicted`` that is contained in ``actual``, or 0.0 when none is.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, candidate in enumerate(predicted, start=1)
        if candidate in actual
    )
    return next(reciprocal_ranks, 0.0)

def calculate_map_at_k(actual, predicted, k):
    """
    Compute Average Precision at K (AP@K) for a single query.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked sequence of predicted items, best first.
        k: Cut-off rank; only the first ``k`` predictions are scored.

    Returns:
        float: Sum of precision values at each rank where a new relevant item
        appears, divided by ``min(len(actual), k)``. 0.0 when ``k`` is not
        positive or ``actual`` is empty.
    """
    # A non-positive cut-off would otherwise divide by zero (k == 0) or give a
    # negative score (k < 0); treat it like calculate_precision_at_k does.
    if k <= 0 or not actual:
        return 0.0

    top_k = predicted[:k]

    score = 0.0
    num_hits = 0

    for i, p in enumerate(top_k):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in top_k[:i]:
            num_hits += 1
            score += num_hits / (i + 1)

    return score / min(len(actual), k)

In [16]:
def evaluate_metrics(filtered_df, k=5, test_size=0.2, random_state=42):
    """
    Compute average ranking metrics for ``get_recommendations``.

    Up to 100 titles are sampled from a held-out split of ``filtered_df``.
    For each title, the list returned by ``get_recommendations`` is scored
    against a genre-based notion of relevance: another movie is relevant when
    it shares at least half of the query movie's genres.

    Args:
        filtered_df: DataFrame with a ``title`` column and a ``genres`` column
            holding an iterable of genre ids per row.
        k: Cut-off used for Precision@k and MAP@k.
        test_size: Passed to ``train_test_split``.
        random_state: Seed for both the split and the title sample.

    Returns:
        dict: ``{'Precision@<k>': ..., 'MRR': ..., 'MAP@<k>': ...}`` with the
        mean of each metric over the successfully evaluated titles (NaN when
        none could be evaluated).
    """
    # Only the held-out part is used; the split just selects which titles are
    # evaluated.
    _, test_df = train_test_split(filtered_df, test_size=test_size, random_state=random_state)

    # Built once instead of re-scanning the frame row by row for every title.
    all_titles = filtered_df['title'].tolist()
    all_genre_sets = [set(genres) for genres in filtered_df['genres']]

    def get_relevant_movies(movie_title):
        """Return titles of movies sharing at least half of the query's genres."""
        query_genres = next(
            (genres for name, genres in zip(all_titles, all_genre_sets) if name == movie_title),
            None,
        )
        if query_genres is None:
            raise LookupError(f"title not present in filtered_df: {movie_title!r}")
        if not query_genres:
            # With no genres the threshold would be 0 and every movie would
            # count as relevant, inflating all metrics.
            return []

        threshold = len(query_genres) * 0.5
        return [
            name
            for name, genres in zip(all_titles, all_genre_sets)
            if name != movie_title and len(genres & query_genres) >= threshold
        ]

    metrics = {
        'precision@k': [],
        'mrr': [],
        'map@k': []
    }

    for title in test_df['title'].sample(n=min(100, len(test_df)), random_state=random_state):
        try:
            recommendations = get_recommendations(title)
            if recommendations is None:
                # get_recommendations returns None when the title is not found.
                print(f"Error evaluating {title}: no recommendations returned")
                continue
            relevant_movies = get_relevant_movies(title)

            # Compute all three before appending so the lists stay aligned
            # even if one of the calculations fails.
            precision = calculate_precision_at_k(relevant_movies, recommendations, k)
            reciprocal_rank = calculate_mrr(relevant_movies, recommendations)
            average_precision = calculate_map_at_k(relevant_movies, recommendations, k)

            metrics['precision@k'].append(precision)
            metrics['mrr'].append(reciprocal_rank)
            metrics['map@k'].append(average_precision)

        except Exception as e:
            print(f"Error evaluating {title}: {str(e)}")

    def _mean(values):
        # np.mean of an empty list emits a RuntimeWarning; return NaN quietly.
        return np.mean(values) if values else float('nan')

    average_metrics = {
        f'Precision@{k}': _mean(metrics['precision@k']),
        'MRR': _mean(metrics['mrr']),
        f'MAP@{k}': _mean(metrics['map@k'])
    }

    return average_metrics

In [17]:
metrics = evaluate_metrics(filtered_df, k=5)

In [18]:
df_metrics = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])
df_metrics.head()

Unnamed: 0,Metric,Value
0,Precision@5,0.716
1,MRR,0.840333
2,MAP@5,0.650867


# Save variables

In [19]:
#with open('cv.pkl', 'wb') as f:
#    pickle.dump(cv, f)

In [20]:
#with open('indices.pkl', 'wb') as f:
#    pickle.dump(indices, f)

In [21]:
#with open('movies_list.pkl', 'wb') as f:
#    pickle.dump(filtered_df, f)