## Lumaa Spring 2025 AI/ML
### Author: Ahyo Falick

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
import ast
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Converter from https://www.kaggle.com/code/aasthasinha9/movie-recommender-system
def convert(obj):
    """Parse a stringified list of dicts and return each entry's 'name' value.

    `obj` is evaluated with ast.literal_eval, so it must be a Python-literal
    string such as "[{'id': 28, 'name': 'Action'}]". The names are returned
    in the order they appear in the parsed list.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [4]:
# Reading in and cleaning the data
path = "https://raw.githubusercontent.com/Ahyo2/lumaa-spring-2025/refs/heads/main/tmdb_5000_movies.csv"
# NOTE: dropna() runs before the column selection below, so a row is removed
# when ANY column of the CSV is missing, not only the four columns kept here.
movies = pd.read_csv(path).dropna()

# Keep only relevant columns and first 500 rows
# .copy() makes the result an independent frame, so the column assignments
# below write to it directly instead of to a slice of the original data.
movies = movies[['genres','keywords','overview','title']]
movies = movies[0:500].copy()

# Convert 'genres' and 'keywords' columns into the appropriate format:
# parse each stringified list of dicts and join the names into one
# comma-separated string. Assigning the whole column at once avoids chained
# indexing (movies[col][label] = ...), which pandas warns about and which may
# silently leave the frame unchanged.
for col in ('genres', 'keywords'):
    movies[col] = movies[col].apply(lambda raw: ", ".join(convert(raw)))

In [5]:
# Display the cleaned frame (genres, keywords, overview, title)
movies

Unnamed: 0,genres,keywords,overview,title
0,"Action, Adventure, Fantasy, Science Fiction","culture clash, future, space war, space colony...","In the 22nd century, a paraplegic Marine is di...",Avatar
1,"Adventure, Fantasy, Action","ocean, drug abuse, exotic island, east india t...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End
2,"Action, Adventure, Crime","spy, based on novel, secret agent, sequel, mi6...",A cryptic message from Bond’s past sends him o...,Spectre
3,"Action, Crime, Drama, Thriller","dc comics, crime fighter, terrorist, secret id...",Following the death of District Attorney Harve...,The Dark Knight Rises
4,"Action, Adventure, Science Fiction","based on novel, mars, medallion, space travel,...","John Carter is a war-weary, former military ca...",John Carter
...,...,...,...,...
976,"Action, Adventure, Science Fiction, Thriller","prison, usa president, earthquake, dystopia, a...","This time, a cataclysmic temblor hits Los Ange...",Escape from L.A.
977,"Adventure, Animation, Family, Fantasy, Science...","cold war, friendship, giant robot, sitting on ...","In the small town of Rockwell, Maine in Octobe...",The Iron Giant
979,"War, Action, Drama, History, Thriller","slavery, american civil war","In 1863, Mississippi farmer Newt Knight serves...",Free State of Jones
983,"Thriller, Crime, Mystery","london england, gay, male nudity, female nudit...",A Russian teenager living in London who dies d...,Eastern Promises


In [8]:
# Function to get n largest row averages from a 2d array
def getmaxn(data,n):
    """Rank the rows of `data` by their mean and return the top `n`.

    Each result is a (row_position, row_mean) tuple, ordered from the largest
    mean to the smallest. Rows with equal means keep their original relative
    order because the sort is stable.
    """
    row_means = [(pos, np.mean(row)) for pos, row in enumerate(data)]
    ranked = sorted(row_means, key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Get the cosine similarity for the last vector
# in a matrix compared to all others
def cosine_sim(matrix, l):
    """Return the cosine similarity of row `l` against rows 0..l-1 of `matrix`.

    Parameters
    ----------
    matrix : 2-D array or sparse matrix with at least l + 1 rows.
    l : position of the query row; every row before it is compared to it.

    Returns
    -------
    1-D array of length `l`; entry k is the similarity between row k and
    row `l`.
    """
    # Slice with l:l+1 so the query stays 2-D for both dense and sparse input,
    # and compare it only against the preceding rows instead of building the
    # full pairwise matrix. The upper bound is l (not l-1) so the row directly
    # before the query is included in the result.
    return cosine_similarity(matrix[l:l+1], matrix[:l])[0]

# Get the tfidf matrix of a specific column
def get_matrix(i,col,l):
    """Fit a TF-IDF matrix over one column of `movies` plus the user input.

    Parameters
    ----------
    i : user input text; it becomes the final row of the returned matrix.
    col : name of the column of the module-level `movies` frame to vectorize.
    l : unused; kept so existing callers keep working. The input is always
        appended after the last movie, i.e. at row position len(movies).

    Returns
    -------
    Sparse TF-IDF matrix with len(movies) + 1 rows.
    """
    # Build a fresh list rather than assigning into movies[col] by label:
    # that would modify the Series taken from the DataFrame, and would replace
    # an existing movie's text whenever label `l` is already in the index.
    corpus = movies[col].tolist()
    corpus.append(i)
    return Tfidf().fit_transform(corpus)

# Build a TF-IDF matrix for each relevant column (movies plus the user input),
# compute the cosine similarity between the user input and every movie,
# then return the movies with the largest average similarity across columns
def recommender(movies, user_input, n=5):
    """Rank movies by their average similarity to `user_input`.

    For each of the 'genres', 'keywords' and 'overview' columns, a TF-IDF
    matrix is built with get_matrix and the user input is compared to the
    movie rows with cosine_sim. The three scores of each movie are then
    averaged by getmaxn.

    Parameters
    ----------
    movies : only its length is used here, as the row position of the user
        input; get_matrix reads the column text from the module-level
        `movies` frame.
    user_input : free-text description of what the user wants to watch.
    n : number of results to return (defaults to 5).

    Returns
    -------
    List of up to `n` (row_position, mean_similarity) tuples, best first.
    """
    length = len(movies)
    columns = ('genres', 'keywords', 'overview')

    matrices = [get_matrix(user_input, col, length) for col in columns]
    scores = [cosine_sim(matrix, length) for matrix in matrices]

    # zip(*scores) groups the per-column scores of each movie into one row.
    return getmaxn(list(zip(*scores)), n)

# Function to print out the top 5 movies and their similarity scores
def printoutput(maxn,movies,user_input):
    """Print a header for `user_input` followed by one line per result.

    `maxn` holds (row_position, score) entries. Each position is translated
    to an index label of `movies`, which is then used to look up the 'title'
    column. Lines are numbered from 0 in the order given.
    """
    print("-----------------------------------")
    print("Top 5 movie for: '%s'" %user_input)
    for rank, hit in enumerate(maxn):
        label = movies.index[hit[0]]
        title = movies['title'][label]
        print("%i: %s, %.2f" %(rank, title, hit[1]))

In [9]:
# Sample free-text queries that are passed to the recommender one at a time
user_inputs = ["I love thrilling action movies set in space, with a comedic twist.",
              "Funny romance movie with a female protagonist.",
              "Tragic comedy that ends in death.",
              "I want a lighthearted children's movie to watch with my kids.",
              "Give me some insane action and violence.",
              "I love cheesy romantic comedies.",
              "I am in the mood for a holiday movie with a loveable protagonist.",
              "Something scary with a lot of gore.",
              "A thriller with a crazy twist at the end.",
              "A funny movie about aliens"]

In [10]:
# Run each sample query through the recommender and print its ranked results
for i in user_inputs:
    maxn = recommender(movies, i)
    printoutput(maxn, movies, i)

-----------------------------------
Top 5 movie for: 'I love thrilling action movies set in space, with a comedic twist.'
0: Gravity, 0.10
1: Agora, 0.06
2: Righteous Kill, 0.06
3: Avatar, 0.06
4: Space Cowboys, 0.06
-----------------------------------
Top 5 movie for: 'Funny romance movie with a female protagonist.'
0: The Tourist, 0.12
1: The Holiday, 0.09
2: Eat Pray Love, 0.08
3: It's Complicated, 0.08
4: The Break-Up, 0.08
-----------------------------------
Top 5 movie for: 'Tragic comedy that ends in death.'
0: You've Got Mail, 0.08
1: WALL·E, 0.08
2: The Other Guys, 0.08
3: Cars 2, 0.08
4: 22 Jump Street, 0.07
-----------------------------------
Top 5 movie for: 'I want a lighthearted children's movie to watch with my kids.'
0: Where the Wild Things Are, 0.06
1: The Pacifier, 0.06
2: ParaNorman, 0.05
3: Paddington, 0.05
4: Monsters, Inc., 0.05
-----------------------------------
Top 5 movie for: 'Give me some insane action and violence.'
0: Death Race, 0.06
1: The SpongeBob Mov

In [11]:
# Single ad-hoc query, run separately from the sample list
user_input = "I want a disaster movie set in the future"

In [13]:
# Recommend for the single query and print the ranked titles with their scores
m = recommender(movies, user_input)
printoutput(m,movies,user_input)

-----------------------------------
Top 5 movie for: 'I want a disaster movie set in the future'
0: Cloud Atlas, 0.08
1: The Time Machine, 0.08
2: Terminator Genisys, 0.08
3: Mad Max: Fury Road, 0.08
4: 2012, 0.07


#### Salary Expectations
$20-30/hour