In [20]:
# ! pip install wikipedia

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
from key import OPENAI_API_KEY
import wikipedia
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [22]:
movie100k = pd.read_csv('../data/processed_movie100k.csv')
movie100k.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,genres,avg_rating
0,1,1,5,874965758,Toy Story (1995),"Animation, Children",5.0
1,1,2,3,876893171,GoldenEye (1995),"Action, Adventure, Thriller",3.0
2,1,3,4,878542960,Four Rooms (1995),Thriller,4.0
3,1,4,3,876893119,Get Shorty (1995),Action,3.0
4,1,5,3,889751712,Copycat (1995),Thriller,3.0


In [23]:
movie_wiki = pd.read_csv('../data/movie_wiki.csv')
movie_wiki.head()

Unnamed: 0,movie_title,wiki_summary
0,'Til There Was You (1997),'Til There Was You is a 1997 American romantic...
1,1-900 (1994),1-900 or 06 is a 1994 Dutch erotic romantic dr...
2,101 Dalmatians (1996),101 Dalmatians is a 1996 American adventure co...
3,12 Angry Men (1957),12 Angry Men is a 1957 American legal drama fi...
4,187 (1997),One Eight Seven (also known as 187) is a 1997 ...


In [24]:
# Function to find most similar users
def get_similar_users(user_id, matrix, m):
    if user_id not in matrix.index:
        return []
    sim_users = matrix.loc[user_id].sort_values(ascending=False).iloc[1:m+1].index.tolist()
    return sim_users

def user_filtering_recommendations(dataframe, target_user_id, m, ns):
    """
    Generate movie recommendations for a target user based on user-filtering.

    :param dataframe: A pandas DataFrame containing columns 'user_id', 'movie_id', 'rating', 'movie title'.
    :param target_user_id: The user ID for whom recommendations are to be generated.
    :param m: The number of similar users to consider.
    :param ns: The number of candidate items to recommend.
    :return: A list of candidate movie titles.
    """
    # Create a pivot table
    user_movie_matrix = dataframe.pivot_table(index='user_id', columns='movie_id', values='avg_rating', fill_value=0)
    # Convert to sparse matrix
    sparse_matrix = csr_matrix(user_movie_matrix)
    # Compute Cosine Similarity
    cosine_sim = cosine_similarity(sparse_matrix)
    # Convert to DataFrame
    cosine_sim_df = pd.DataFrame(cosine_sim, index=user_movie_matrix.index, columns=user_movie_matrix.index)
    # Find similar users
    similar_users = get_similar_users(target_user_id, cosine_sim_df, m)
    # Get candidate movie IDs
    candidate_ids = dataframe[dataframe['user_id'].isin(similar_users)]['movie_id'].value_counts().head(ns).index
    # Map IDs to Titles
    candidate_titles = dataframe[dataframe['movie_id'].isin(candidate_ids)]['movie_title'].unique().tolist()
    return candidate_titles

In [25]:
df_input = []

for i in np.unique(movie100k['user_id']):
    user_watched_movies = movie100k[movie100k['user_id'] == i]['movie_title'].unique().tolist()
    np.random.seed(42)
    selected_movie = np.random.choice(user_watched_movies, min(len(user_watched_movies), 5), replace=False).tolist()
    candidate_movies = user_filtering_recommendations(movie100k, i, 10, 20)
    df_input.append({'user_id': i, 'total_watched_movies': user_watched_movies,\
                    'selected_movie': selected_movie, 'candidate_movies': candidate_movies})

df_input = pd.DataFrame(df_input)
df_input.head()

Unnamed: 0,user_id,total_watched_movies,selected_movie,candidate_movies
0,1,"[Toy Story (1995), GoldenEye (1995), Four Room...","[Crimson Tide (1995), Rock, The (1996), Hot Sh...","[Toy Story (1995), Fugitive, The (1993), Juras..."
1,2,"[Toy Story (1995), Richard III (1995), Mighty ...","[Toy Story (1995), Birdcage, The (1996), In & ...","[Twelve Monkeys (1995), Dead Man Walking (1995..."
2,3,"[Return of the Jedi (1983), Devil's Own, The (...","[Mother (1996), Alien: Resurrection (1997), Fa...","[Contact (1997), Full Monty, The (1997), Stars..."
3,4,"[Seven (Se7en) (1995), Star Wars (1977), India...","[Liar Liar (1997), Wedding Singer, The (1998),...","[Devil's Own, The (1997), Contact (1997), Even..."
4,5,"[Toy Story (1995), GoldenEye (1995), From Dusk...","[Amityville: A New Generation (1993), Harold a...","[Star Wars (1977), Stargate (1994), Blade Runn..."


In [26]:
# find the wiki summary for each movie from movie_wiki
def find_movie_summary(df_movie_wiki, movie_titles):
    summaries = []
    for i in movie_titles:
        if df_movie_wiki[df_movie_wiki['movie_title'] == i]['wiki_summary'].any():
            summary = df_movie_wiki[df_movie_wiki['movie_title'] == i]['wiki_summary'].values[0]
        else:
            summary = ""
        summaries.append(summary)
    return summaries

In [27]:
df_input['selected_movie_summary'] = df_input.apply(lambda x: find_movie_summary(movie_wiki, x['selected_movie']), axis=1)

In [28]:
df_input.head()

Unnamed: 0,user_id,total_watched_movies,selected_movie,candidate_movies,selected_movie_summary
0,1,"[Toy Story (1995), GoldenEye (1995), Four Room...","[Crimson Tide (1995), Rock, The (1996), Hot Sh...","[Toy Story (1995), Fugitive, The (1993), Juras...",[Crimson Tide is a 1995 American submarine act...
1,2,"[Toy Story (1995), Richard III (1995), Mighty ...","[Toy Story (1995), Birdcage, The (1996), In & ...","[Twelve Monkeys (1995), Dead Man Walking (1995...",[Toy Story is a 1995 American animated comedy ...
2,3,"[Return of the Jedi (1983), Devil's Own, The (...","[Mother (1996), Alien: Resurrection (1997), Fa...","[Contact (1997), Full Monty, The (1997), Stars...",[Mother is a 1996 American comedy-drama film d...
3,4,"[Seven (Se7en) (1995), Star Wars (1977), India...","[Liar Liar (1997), Wedding Singer, The (1998),...","[Devil's Own, The (1997), Contact (1997), Even...",[Liar Liar is a 1997 American satirical fantas...
4,5,"[Toy Story (1995), GoldenEye (1995), From Dusk...","[Amityville: A New Generation (1993), Harold a...","[Star Wars (1977), Stargate (1994), Blade Runn...",[Amityville: A New Generation is a 1993 direc...


In [43]:
temp = """
Candidate movies: {} \n
The movies I have watched: {} \n
Summary of the movies I have watched: {} \n
Can you recommend 10 movies from the andidate movies similar to but not in the selected movies I've watched?.
(Please apply brackets around the movie titles you recommend) \n
Answer: 
"""

In [44]:
user1 = df_input.iloc[0]
Input = temp.format(user1['candidate_movies'], user1['selected_movie'], user1['selected_movie_summary'])
print(Input)


Candidate movies: ['Toy Story (1995)', 'Fugitive, The (1993)', 'Jurassic Park (1993)', 'Nightmare Before Christmas, The (1993)', 'Terminator 2: Judgment Day (1991)', 'Silence of the Lambs, The (1991)', 'Fargo (1996)', 'Rock, The (1996)', 'Independence Day (ID4) (1996)', 'Raiders of the Lost Ark (1981)', 'Star Trek: First Contact (1996)', 'Die Hard 2 (1990)', 'Star Trek VI: The Undiscovered Country (1991)', 'Star Trek: The Wrath of Khan (1982)', 'Star Trek III: The Search for Spock (1984)', 'Star Trek IV: The Voyage Home (1986)', 'Jaws (1975)', 'Mars Attacks! (1996)', 'Men in Black (1997)', 'E.T. the Extra-Terrestrial (1982)'] 

The movies I have watched: ['Crimson Tide (1995)', 'Rock, The (1996)', 'Hot Shots! Part Deux (1993)', 'Supercop (1992)', 'Graduate, The (1967)'] 

Summary of the movies I have watched: ['Crimson Tide is a 1995 American submarine action thriller film directed by Tony Scott and produced by Don Simpson and Jerry Bruckheimer.', 'The Rock is a 1996 American action t

In [45]:
client = OpenAI(api_key=OPENAI_API_KEY)

In [46]:
response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : Input}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        )

In [53]:
prediction = response.choices[0].message.content.split(']')[0].split('[')[1]
prediction = [movie.strip() for movie in prediction.split(',')]
prediction

['Toy Story (1995)',
 'Fugitive',
 'The (1993)',
 'Nightmare Before Christmas',
 'The (1993)',
 'Terminator 2: Judgment Day (1991)',
 'Silence of the Lambs',
 'The (1991)',
 'Fargo (1996)',
 'Independence Day (ID4) (1996)',
 'Raiders of the Lost Ark (1981)',
 'Star Trek: First Contact (1996)',
 'Mars Attacks! (1996)']

In [55]:
hit = 0
for movie in prediction:
    if movie in user1['total_watched_movies']:
        hit += 1
print(f"Hit rate: {hit/len(prediction)}")

Hit rate: 0.5384615384615384
