In [1]:
import pandas as pd

# Reading in CSV 

In [2]:
df = pd.read_csv('./movie_rec/TMDb_updated.CSV')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         10000 non-null  int64  
 1   title              10000 non-null  object 
 2   overview           9970 non-null   object 
 3   original_language  10000 non-null  object 
 4   vote_count         10000 non-null  int64  
 5   vote_average       10000 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 468.9+ KB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,0,Ad Astra,"The near future, a time when both hope and har...",en,2853,5.9
1,1,Bloodshot,"After he and his wife are murdered, marine Ray...",en,1349,7.2
2,2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2530,7.1
3,3,Ant-Man,Armed with the astonishing ability to shrink i...,en,13611,7.1
4,4,Percy Jackson: Sea of Monsters,"In their quest to confront the ultimate evil, ...",en,3542,5.9


In [5]:
df_untransformed = pd.read_csv('./movie_rec/TMDb_updated.CSV')

# Cleaning the data and converting the data

## Remove unneeded data, convert columns. 
## Strip text
### 1. Regex
### 2. Removing stop words
### 3. Lower case the text. 

In [6]:
df.drop('Unnamed: 0',inplace=True, axis=1)
df.drop('original_language', inplace=True, axis=1)
df.dropna(inplace=True)
df['vote_count']=df['vote_count'].astype(int)
df['vote_average']=df['vote_average'].astype(int)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9970 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         9970 non-null   object
 1   overview      9970 non-null   object
 2   vote_count    9970 non-null   int64 
 3   vote_average  9970 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 389.5+ KB


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df['text'] = df['title'] + ' ' + df['overview']


vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(df['text'])


new_overview = "A group of friends go on a road trip across the country."
new_tfidf = vectorizer.transform([new_overview])
similarity_scores = cosine_similarity(tfidf, new_tfidf)


top10_indices = similarity_scores.flatten().argsort()[:-11:-1]


top10_titles = df['title'].iloc[top10_indices].tolist()

print(top10_titles)


['Bad Trip', 'Road Trip', 'The Road Within', 'Sole a catinelle', 'The Lucky Ones', 'EuroTrip', 'Becoming', 'The Ritual', 'College Road Trip', 'The Guilt Trip']


In [9]:
def rec_10():
    description = str(input())
    new_overview = description
    new_tfidf = vectorizer.transform([new_overview])
    similarity_scores = cosine_similarity(tfidf, new_tfidf)


    top10_indices = similarity_scores.flatten().argsort()[:-11:-1]

 
    top10_titles = df['title'].iloc[top10_indices].tolist()
    top10_titles_and_descriptions = []
    for i in top10_titles:
        try:
            info = df_untransformed[df['title']== i]['overview'].values[0]
            top10_titles_and_descriptions.append((i,info))
        except: 
            info = df_untransformed[df_untransformed['title']==i]['overview']
            top10_titles_and_descriptions.append((i,info))
            
    cleaned = []
    
    for i in top10_titles_and_descriptions:
        desc=i[1].values[0]
        title=i[0]
        cleaned.append((title,desc))
        
    return cleaned

In [10]:
a = rec_10()

Two brothers who have a strong bond 


  info = df_untransformed[df['title']== i]['overview'].values[0]


In [11]:
a

[('Never Say Never Again',
  "James Bond returns as the secret agent 007 one more time to battle the evil organization SPECTRE. Bond must defeat Largo, who has stolen two atomic warheads for nuclear blackmail. But Bond has an ally in Largo's girlfriend, the willowy Domino, who falls for Bond and seeks revenge. This is the last time for Sean Connery as Her Majesty's Secret Agent 007. Made outside of the traditional Broccoli production environment due to separate rights having been obtained for this specific Ian Fleming story."),
 ('The Darjeeling Limited',
  'Three American brothers who have not spoken to each other in a year set off on a train voyage across India with a plan to find themselves and bond with each other -- to become brothers again like they used to be. Their "spiritual quest", however, veers rapidly off-course (due to events involving over-the-counter pain killers, Indian cough syrup, and pepper spray).'),
 ('The Prince of Egypt',
  'This is the extraordinary tale of two