In [1]:
!pip install -qq convokit

In [2]:
!pip install -qq scikit-surprise



In [3]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import convokit

In [4]:
# Download the Cornell Movie Dialogs Corpus, then load it as a ConvoKit Corpus
corpus_path = convokit.download("movie-corpus")
corpus = convokit.Corpus(filename=corpus_path)

Downloading movie-corpus to C:\Users\akhil\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
No configuration file found at C:\Users\akhil/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
default_backend: mem


In [7]:

# Report the overall size of the corpus
print("No: of conversations:", len(corpus.conversations))
print("No: of users:", len(corpus.speakers))
print("No: of utterances:", len(corpus.utterances))

# Resolve every conversation once, in the order the corpus reports its ids
conversation_ids = list(corpus.get_conversation_ids())
conversations = [corpus.get_conversation(cid) for cid in conversation_ids]

# One list per metadata field, aligned index-for-index with conversation_ids
movie_indices = [c.meta['movie_idx'] for c in conversations]
movie_names = [c.meta['movie_name'] for c in conversations]
release_years = [c.meta['release_year'] for c in conversations]
ratings = [c.meta['rating'] for c in conversations]
votes = [c.meta['votes'] for c in conversations]
genres = [c.meta['genre'] for c in conversations]

# Number of utterances in each conversation
num_utterances = [len(c.get_utterance_ids()) for c in conversations]

# Column-oriented mapping used to build the ratings DataFrame.
# release_years, votes, genres and num_utterances are collected above but are
# not part of this mapping.
data = dict(zip(
    ('Conversation ID', 'Movie Index', 'Movie Name', 'Rating'),
    (conversation_ids, movie_indices, movie_names, ratings),
))


No: of conversations: 83097
No: of users: 9035
No: of utterances: 304713


In [8]:
# Build the ratings table straight from the per-conversation lists rather than
# from `data`: that name is rebound to the Surprise dataset at the end of this
# cell, so reading it here would make the cell fail when it is run a second time.
movie_ratings_df = pd.DataFrame({
    'Conversation ID': conversation_ids,
    'Movie Index': movie_indices,
    'Movie Name': movie_names,
    'Rating': ratings,
})

# Reader declaring that ratings lie on a 1-10 scale
reader = Reader(rating_scale=(1, 10))

# Surprise takes the three columns positionally as (user, item, rating), so the
# Conversation ID plays the role of the user and the movie name that of the item.
# NOTE(review): every Conversation ID contributes exactly one row here, so each
# "user" has a single rating -- confirm this is the intended user/item framing.
data_surprise = movie_ratings_df[['Conversation ID', 'Movie Name', 'Rating']]
data = Dataset.load_from_df(data_surprise, reader)


In [9]:
# Hold out 20% of the ratings for evaluation; the seed makes the split repeatable
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Create and train the SVD recommendation model. The model is seeded as well:
# without random_state the fitted model (and therefore the RMSE and the
# recommendations derived from it) changes from one run to the next.
model = SVD(random_state=42)
model.fit(trainset)

# Evaluate on the held-out ratings; accuracy.rmse prints the RMSE and returns it
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)




RMSE: 0.0639


In [10]:
# Generate recommendations for one user; a "user" here is a Conversation ID
user_id = 'L36547'

# Movies already associated with this user in the ratings table
seen_mask = movie_ratings_df['Conversation ID'] == user_id
user_movies = movie_ratings_df.loc[seen_mask, 'Movie Name'].unique()

# Every other movie in the table is a candidate for recommendation
all_movies = movie_ratings_df['Movie Name'].unique()
unrated_movies = np.setdiff1d(all_movies, user_movies)

# Predict a rating for each candidate and rank them, best first
user_recommendations = sorted(
    ((title, model.predict(user_id, title).est) for title in unrated_movies),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the top_n highest-ranked candidates
top_n = 3
print(f'Top {top_n} recommendations for user {user_id}:')
for movie_id, predicted_rating in user_recommendations[:top_n]:
    print(f'Movie ID: {movie_id}, Predicted Rating: {predicted_rating:.2f}')


Top 3 recommendations for user L36547:
Movie ID: neuromancer, Predicted Rating: 9.16
Movie ID: the godfather, Predicted Rating: 9.03
Movie ID: the godfather: part ii, Predicted Rating: 8.87
