# scrape data

- title
- plot
- genre

In [1]:
import pandas as pd
import functions

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('data/s.csv')
df.head()

Unnamed: 0,Title,Year,Genres,Plots,Processed_feature
0,02:22,2017,"Drama,Mystery,Science Fiction,Thriller",A man s life is derailed when an ominous patte...,man life derail omin pattern event repeat exac...
1,85,2020,"Drama,Fantasy",What do you dream of when you re 16 years old ...,z
2,9,2009,"Adventure,Animation,Science Fiction",When 9 first comes to life he finds himself i...,when first come life find post apocalypt world...
3,21,2008,"Crime,Drama,Romance,Thriller",An embattled NYPD detective is thrust into a ...,embattl nypd detect thrust citywid manhunt pai...
4,31,2016,"Horror,Thriller",Five carnival workers are kidnapped and held h...,five carniv worker kidnap held hostag abandon ...


In [3]:
df.shape

(9338, 5)

## movies not in the dataframe

In [4]:
import tmdbsimple as tmdb
import os

In [5]:
api_key = os.getenv("MOVIE_API_KEY")
tmdb.API_KEY = api_key

In [6]:
search = tmdb.Search()

In [39]:
response = search.movie(query='eternals')

In [40]:
response

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/3G6wET9eLvYn3aoIj8NfQFhpYEB.jpg',
   'genre_ids': [28, 12, 878],
   'id': 524434,
   'original_language': 'en',
   'original_title': 'Eternals',
   'overview': 'The Eternals are a team of ancient aliens who have been living on Earth in secret for thousands of years. When an unexpected tragedy forces them out of the shadows, they are forced to reunite against mankind’s most ancient enemy, the Deviants.',
   'popularity': 2026.37,
   'poster_path': '/6AdXwFTRTAzggD2QUTt5B7JFGKL.jpg',
   'release_date': '2021-11-03',
   'title': 'Eternals',
   'video': False,
   'vote_average': 7.2,
   'vote_count': 238},
  {'adult': False,
   'backdrop_path': None,
   'genre_ids': [16],
   'id': 375211,
   'original_language': 'en',
   'original_title': 'Marvel Knights: Eternals',
   'overview': "You are thousands of years old. You have amazing powers. You have watched civilizations rise and fall. So why does no one remember any of this? Bes

In [41]:
response = response['results'][0]

In [42]:
response

{'adult': False,
 'backdrop_path': '/3G6wET9eLvYn3aoIj8NfQFhpYEB.jpg',
 'genre_ids': [28, 12, 878],
 'id': 524434,
 'original_language': 'en',
 'original_title': 'Eternals',
 'overview': 'The Eternals are a team of ancient aliens who have been living on Earth in secret for thousands of years. When an unexpected tragedy forces them out of the shadows, they are forced to reunite against mankind’s most ancient enemy, the Deviants.',
 'popularity': 2026.37,
 'poster_path': '/6AdXwFTRTAzggD2QUTt5B7JFGKL.jpg',
 'release_date': '2021-11-03',
 'title': 'Eternals',
 'video': False,
 'vote_average': 7.2,
 'vote_count': 238}

In [15]:
overview = []

In [20]:
overview.append(response['overview'])
# date.append(response['release_date'].split('-')[0])
# genre_ids = response['genre_ids']
# for k in genre_ids:
#     if k in genre_key:
#         g += genre_key[k] +', '
# genres_.append(g)

In [21]:
overview

['The Eternals are a team of ancient aliens who have been living on Earth in secret for thousands of years. When an unexpected tragedy forces them out of the shadows, they are forced to reunite against mankind’s most ancient enemy, the Deviants.']

In [7]:
movies = df
all_movies = list(movies['Title'])

genre_key = {28:'Action', 12:'Adventure', 16:'Animation', 35:'Comedy', 80:'Crime',
             99:'Documentary', 18:'Drama', 10751:'Family', 14:'Fantasy', 36:'History', 
             27:'Horror', 10402:'Music', 9648:'Mystery', 10749:'Romance', 878 :'Science Fiction', 
             10770:'TV Movie', 53:'Thriller', 10752:'War',  37:'Western'}

img_base_url = 'https://image.tmdb.org/t/p/w500'

In [8]:
def show_recommendations(movie_title: str):

    ### SEARCH QUERY IS NOT IN DATAFRAME
    if movie_title not in all_movies:
        g = ''
        search = tmdb.Search()

        response = search.movie(query=movie_title)

        ## SEARCH QUERY NOT FOUND IN TMDB
        if len(response['results']) < 1:
            print('cannot help you')

        else:
            recommendation_data = functions.add_unknown_movie(movie_title, movies)

    ##### SEARCH QUERY IS PRESENT IN DATAFRAME        
    else:
        recommendation_data = movies.copy()

    cosine_similarity_df = functions.cosine_similarities(recommendation_data, 'Genres')

    names = functions.get_recommendations(cosine_similarity_df, movie_title)

    fetched_imgs = []
    fetched_overviews = []
    fetched_ratings = []
    fetched_dates = []
    fetched_genres = []
    
    print(fetched_imgs)

    #API CALL TO GET INFORMATION ON RECOMMENDED MOVIES
    search = tmdb.Search()
    for n in names:
        g = ''
        response = search.movie(query=n)
        response = response['results'][0]

        fetched_overviews.append(response['overview'])

        poster_path = response['poster_path']
        fetched_imgs.append(img_base_url + poster_path)

        fetched_ratings.append(response['vote_average'])

        fetched_dates.append(response['release_date'].split('-')[0])

        genre_ids = response['genre_ids']
        for k in genre_ids:
            if k in genre_key:
                g += genre_key[k] +', '
        fetched_genres.append(g)
    
    return names, fetched_imgs, fetched_dates, fetched_ratings, fetched_overviews, fetched_genres

In [9]:
names, fetched_img_files, fetched_date, fetched_rating, fetched_overview, fetched_genres = show_recommendations('Avatar')

Title  02:22   85         9        21        31   42        99       122  \
Title                                                                      
02:22    0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
85       0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
9        0.0  0.0  1.000000  0.000000  0.000000  0.0  0.000000  0.000000   
21       0.0  0.0  0.000000  1.000000  0.000000  0.0  0.745622  0.000000   
31       0.0  0.0  0.000000  0.000000  1.000000  0.0  0.000000  1.000000   
42       0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
99       0.0  0.0  0.000000  0.745622  0.000000  0.0  1.000000  0.000000   
122      0.0  0.0  0.000000  0.000000  1.000000  0.0  0.000000  1.000000   
300      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
360      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
678      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
730      0.0

In [10]:
names

['Last Knights',
 'Kingsman The Golden Circle',
 'Kong Skull Island',
 'All Is Lost',
 'Kung Fu Panda']

In [11]:
names, fetched_img_files, fetched_date, fetched_rating, fetched_overview, fetched_genres = show_recommendations('Eternals')

Title  02:22   85         9        21        31   42        99       122  \
Title                                                                      
02:22    0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
85       0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
9        0.0  0.0  1.000000  0.000000  0.000000  0.0  0.000000  0.000000   
21       0.0  0.0  0.000000  1.000000  0.000000  0.0  0.745622  0.000000   
31       0.0  0.0  0.000000  0.000000  1.000000  0.0  0.000000  1.000000   
42       0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
99       0.0  0.0  0.000000  0.745622  0.000000  0.0  1.000000  0.000000   
122      0.0  0.0  0.000000  0.000000  1.000000  0.0  0.000000  1.000000   
300      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
360      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
678      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
730      0.0

In [12]:
names

['Last Knights',
 'Kingsman The Golden Circle',
 'Kong Skull Island',
 'All Is Lost',
 'Kung Fu Panda']

In [14]:
names, fetched_img_files, fetched_date, fetched_rating, fetched_overview, fetched_genres = show_recommendations('All The Bright Places')

Title  02:22   85         9        21        31   42        99       122  \
Title                                                                      
02:22    0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
85       0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
9        0.0  0.0  1.000000  0.000000  0.000000  0.0  0.000000  0.000000   
21       0.0  0.0  0.000000  1.000000  0.000000  0.0  0.745622  0.000000   
31       0.0  0.0  0.000000  0.000000  1.000000  0.0  0.000000  1.000000   
42       0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
99       0.0  0.0  0.000000  0.745622  0.000000  0.0  1.000000  0.000000   
122      0.0  0.0  0.000000  0.000000  1.000000  0.0  0.000000  1.000000   
300      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
360      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
678      0.0  0.0  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
730      0.0

In [15]:
names

['Ghost Town',
 'Swimming With Men',
 'Sweet November',
 'Sweet Home Carolina',
 'Sweet Home Alabama']

In [10]:
fetched_img_files

['https://image.tmdb.org/t/p/w500/lVFYPVDG5hrKYzM4zcDMCMuYyN5.jpg',
 'https://image.tmdb.org/t/p/w500/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg',
 'https://image.tmdb.org/t/p/w500/wd6mhwDjlpeOufuMUF1FAeMliWZ.jpg',
 'https://image.tmdb.org/t/p/w500/qfhiVU59BSPCdOZwPNs29Z8S23K.jpg',
 'https://image.tmdb.org/t/p/w500/1wY4psJ5NVEhCuOYROwLH2XExM2.jpg']

In [11]:
fetched_date

['2016', '1994', '2007', '2013', '1998']

In [12]:
fetched_rating

[6.6, 8.7, 6.2, 7.4, 8.2]

In [13]:
fetched_overview

["'Wazir' is a tale of two unlikely friends, a wheelchair-bound chess grandmaster and a brave ATS officer. Brought together by grief and a strange twist of fate, the two men decide to help each other win the biggest games of their lives. But there's a mysterious, dangerous opponent lurking in the shadows, who is all set to checkmate them.",
 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
 "Jigsaw and his apprentice Amanda are dead. Now, upon the news of Detective Kerry's murder, two seasoned FBI profilers, Agent Strahm and Agent Perez, arrive in the terrified community to assist the veteran Detective Hoffman in sifting through Jigsaw's latest gris

In [None]:
fetched_genres

- remove stop words
- remove non english words

In [6]:
# import re
# new = []
# for t in df['Plots']:
#     new.append(re.sub('[^a-zA-Z0-9]', ' ', t))

- remove symbols
- remove names
- remove stopwords and one-letter words
- stem words

In [5]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
snow = SnowballStemmer(ignore_stopwords=True, language='english')
stopwords= stopwords.words('english')

In [9]:
# def process_text(texts): 
#     final_text_list = []
#     for sent in texts:
#         # Check if the sentence is a missing value
#         if isinstance(sent, str) == False:
#             sent = ""
            
#         filtered_sentence = []
        
#         for w in word_tokenize(sent):
#             # Check if it is not numeric and its length >2 and not in stop words
#             if(not w.isnumeric()) and (len(w) > 2) and (w not in stopwords):  
#                 # Stem and add to filtered list
#                 filtered_sentence.append(snow.stem(w))
#         final_string = " ".join(filtered_sentence) #final string of cleaned words
 
#         final_text_list.append(final_string)
        
#     return final_text_list

In [10]:
#df['Feature'] = df['Plots'] + df['Genres']

In [12]:
#df['Processed_feature'] = process_text(df['Feature'])

In [7]:
df.columns

Index(['Title', 'Year', 'Genres', 'Plots', 'Processed_feature'], dtype='object')

In [23]:
start = time.perf_counter()
# Instantiate the vectorizer object and transform the plot column
vectorizer = TfidfVectorizer(max_df=0.7, min_df=2)
vectorized_data = vectorizer.fit_transform(df['Processed_feature']) 

# Create Dataframe from TF-IDFarray
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names())

# Assign the movie titles to the index and inspect
tfidf_df.index = df['Title']

finish = time.perf_counter()
print(f'Finished in {round(finish-start,2)/60} seconds')

Finished in 0.018666666666666668 seconds


In [15]:
tfidf_df.head()

Unnamed: 0_level_0,100th,10th,11th,12th,13th,14th,150th,1590s,15th,1600s,...,zombi,zone,zoo,zooey,zookeep,zoologist,zoom,zord,zoya,zquez
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02:22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
import time

In [13]:
start = time.perf_counter()
# Create the array of cosine similarity values
cosine_similarity_array = cosine_similarity(tfidf_df)

# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)

finish = time.perf_counter()
print(f'Finished in {round(finish-start,2)} seconds')

Finished in 28.53 seconds


In [16]:
# Print the top 5 rows of the DataFrame
cosine_similarity_df.head()

Title,02:22,85,9,10,21,31,42,54,99,122,...,Zoombies,Zoot Suit,Zootopia,Zozo,Zulu,Zulu Man In Japan,The Shawshank Redemption,The Chronicles of Narnia: Prince Caspian,"The Chronicles of Narnia: The Lion, the Witch and the Wardrobe",The Chronicles of Narnia: The Voyage of the Dawn Treader
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02:22,1.0,0.033892,0.041957,0.0,0.014745,0.010531,0.003392,0.01106,0.004251,0.047789,...,0.048353,0.023003,0.030351,0.050327,0.004362,0.0,0.013648,0.007942,0.0,0.0
85,0.033892,1.0,0.014145,0.012548,0.00291,0.0,0.018701,0.041957,0.002588,0.017491,...,0.008997,0.017032,0.007981,0.0461,0.010583,0.0,0.019578,0.028278,0.013101,0.019855
9,0.041957,0.014145,1.0,0.0,0.0,0.022776,0.011543,0.018445,0.0,0.0,...,0.063413,0.017818,0.079933,0.0,0.0,0.0,0.018854,0.01811,0.023135,0.030482
10,0.0,0.012548,0.0,1.0,0.017473,0.0,0.023065,0.0,0.007941,0.0,...,0.009753,0.0,0.008651,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.014745,0.00291,0.0,0.017473,1.0,0.008302,0.002674,0.002448,0.012788,0.054519,...,0.008582,0.00226,0.01028,0.005994,0.003439,0.0,0.011523,0.0,0.0,0.0


In [8]:
cosine_similarity_df.shape

(15250, 15250)

In [17]:
#cosine_similarity_df.to_csv('data/cosine_similarity.csv', index=False)

In [18]:
# Find the values for the movie rio
cosine_similarity_series = cosine_similarity_df.loc['Thor']

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Print the results
ordered_similarities

Title
Thor                                           1.000000
Thor The Dark World                            0.318715
While You Were Fighting A Thor Mockumentary    0.280026
Hulk Vs                                        0.260048
Warcraft                                       0.188721
                                                 ...   
Bitter Lake                                    0.000000
Read It And Weep                               0.000000
Reach For Me                                   0.000000
Ray Harryhausen Special Effects Titan          0.000000
Norma Rae                                      0.000000
Name: Thor, Length: 15250, dtype: float64

In [22]:
def cosine_similarities(df, text_col):
    #ransform the feature column
    vectorized_data = vectorizer.fit_transform(df[text_col])
    
    tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names())

    # Assign the movie titles to the index and inspect
    tfidf_df.index = df['Title']
    
    # Create the array of cosine similarity values
    cosine_similarity_array = cosine_similarity(tfidf_df)

    # Wrap the array in a pandas DataFrame
    cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)
    
    return cosine_similarity_df

In [21]:
def get_recommendation(cosine_similarity_df, title):
    # Find the values for the movie rio
    cosine_similarity_series = cosine_similarity_df.loc[title]
    
    # Sort these values highest to lowest and select the first 
    ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
    recommendations = list(ordered_similarities[1:6].index)
    
    return recommendations

In [19]:
# Find the values for the movie rio
cosine_similarity_series = cosine_similarity_df.loc['Rio']

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
recommendations = list(ordered_similarities[1:6].index)

# Print the results
ordered_similarities

Title
Rio                1.000000
Rio 2              0.495945
Shining Through    0.185474
Delovely           0.180990
Lost In America    0.168870
                     ...   
The Quiet Earth    0.000000
Dealt              0.000000
Psycho Goreman     0.000000
Psycho             0.000000
Doubting Thomas    0.000000
Name: Rio, Length: 15250, dtype: float64

In [20]:
# Find the values for the movie rio
cosine_similarity_series = cosine_similarity_df.loc['The Chronicles of Narnia: Prince Caspian']

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Print the results
ordered_similarities

Title
The Chronicles of Narnia: Prince Caspian                          1.000000
The Chronicles of Narnia: The Lion, the Witch and the Wardrobe    0.449735
The Chronicles of Narnia: The Voyage of the Dawn Treader          0.266639
Epic Movie                                                        0.163673
The Triumph Of Love                                               0.143035
                                                                    ...   
Harith Iskander I Told You So                                     0.000000
Harlem Nights                                                     0.000000
Harms Way                                                         0.000000
Sundown The Vampire In Retreat                                    0.000000
A New York Christmas Wedding                                      0.000000
Name: The Chronicles of Narnia: Prince Caspian, Length: 15250, dtype: float64

In [21]:
cosine_similarity_df.columns

Index(['02:22', '85', '9', '10', '21', '31', '42', '54', '99', '122',
       ...
       'Zoombies', 'Zoot Suit', 'Zootopia', 'Zozo', 'Zulu',
       'Zulu Man In Japan', 'The Shawshank Redemption',
       'The Chronicles of Narnia: Prince Caspian',
       'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe',
       'The Chronicles of Narnia: The Voyage of the Dawn Treader'],
      dtype='object', name='Title', length=15250)

- cosine similarity is higher with the processed plots, the answers are ot necessarily different, the similarity numbers are just higher

# Without threading 
- 31.49 seconds - jupyter notebook
- 37.01 seconds - on browser

In [43]:
start = time.perf_counter()
x = get_recommendation(cosine_similarity_df, 'Thor')
finish = time.perf_counter()
print(f'Finished in {round(finish-start,2)} seconds')

Finished in 0.03 seconds


In [23]:
start = time.perf_counter()
x = show_recommendations('Thor')
finish = time.perf_counter()
print(f'Finished in {round(finish-start,2)} seconds')

Finished in 31.49 seconds


In [24]:
x

['Thor: The Dark World', 'Team Thor', 'Hulk vs. Thor', 'Warcraft', 'Blackmark']

# Threading

In [26]:
df.shape

(15250, 5)

In [27]:
15250/5

3050.0

In [9]:
3050 * 5

15250

In [41]:
len(df[12200:15250])

3050

In [4]:
def cosine_similarities_thread(df, text_col):
    #transform the feature column
    vectorizer = TfidfVectorizer(max_df=2, min_df=1)
    vectorized_data = vectorizer.fit_transform(df[text_col])
    
    tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names())
    tfidf_df.index = df['Title']
    
    # Create the array of cosine similarity values
    cosine_similarity_array = cosine_similarity(tfidf_df)

    # Wrap the array in a pandas DataFrame
    cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)
    
    return cosine_similarity_df

In [16]:
def cosine_similarities_thread1(data, text_col, start, end):
    df = data[start:end]
    #transform the feature column
    vectorizer = TfidfVectorizer(max_df=2, min_df=1)
    vectorized_data = vectorizer.fit_transform(df[text_col])

    tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names())
    
    tfidf_df.index = df['Title']
    
    # Create the array of cosine similarity values
    cosine_similarity_array = cosine_similarity(tfidf_df)

    # Wrap the array in a pandas DataFrame
    cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)
    
    return cosine_similarity_df

In [6]:
start = time.time()
x = cosine_similarities_thread(df, 'Processed_feature')
print("--- %s seconds ---" % (time.time() - start))

--- 32.29134798049927 seconds ---


In [7]:
import threading
import time

In [11]:
import concurrent.futures

In [42]:
df.shape

(15250, 5)

In [53]:
len(df)

15250

In [20]:
15250/4

3812.5

In [23]:
3812 * 4

15248

In [52]:
len(d1) + len(d2)+ len(d3) + len(d4) + len(d5) #+ len(d6) + len(d7)

15250

### spliting the data during threading

In [24]:
start = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    t1 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 0, 3812)
    t2 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 3812, 7624)
    t3 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 7624, 11436)
    t4 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 11436, 15250)
    
    results = t1.result() +  t2.result() +  t3.result() + t4.result() 
    
print("--- %s seconds ---" % (time.time() - start))

--- 17.773313999176025 seconds ---


In [18]:
start = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    t1 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 0, 3050)
    t2 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 3050, 6100)
    t3 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 6100, 9150)
    t4 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 9150, 12200)
    t5 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 12200, 15250)
    
    results = t1.result() +  t2.result() +  t3.result() + t4.result() +  t5.result() 
    
print("--- %s seconds ---" % (time.time() - start))

--- 17.48763084411621 seconds ---


In [19]:
start = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    t1 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 0, 2178)
    t2 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 2178, 4356)
    t3 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 4356, 6534)
    t4 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 6534, 8712)
    t5 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 8712, 10890)
    t6 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 10890, 13068)
    t7 = executor.submit(cosine_similarities_thread1, df, 'Processed_feature', 13068, 15250)
    
    results = t1.result() +  t2.result() +  t3.result() + t4.result() +  t5.result() +  t6.result() +  t7.result() 
    
print("--- %s seconds ---" % (time.time() - start))

--- 22.083627939224243 seconds ---


### spliting the data before threading

In [None]:
d1 = df[0:3050]
d2 = df[3050:6100]
d3 = df[6100:9150]
d4 = df[9150:12200]
d5 = df[12200:16250]

In [13]:
start = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    t1 = executor.submit(cosine_similarities_thread, d1, 'Processed_feature')
    t2 = executor.submit(cosine_similarities_thread, d2, 'Processed_feature')
    t3 = executor.submit(cosine_similarities_thread, d3, 'Processed_feature')
    t4 = executor.submit(cosine_similarities_thread, d4, 'Processed_feature')
    t5 = executor.submit(cosine_similarities_thread, d5, 'Processed_feature')
    
    results = t1.result() +  t2.result() +  t3.result() + t4.result() +  t5.result() 
    
print("--- %s seconds ---" % (time.time() - start))

--- 17.922550201416016 seconds ---


In [14]:
d1 = df[0:2178]
d2 = df[2178:4356]
d3 = df[4356:6534]
d4 = df[6534:8712]
d5 = df[8712:10890]
d6 = df[10890:13068]
d7 = df[13068:15250]

In [15]:
start = time.time()

with concurrent.futures.ThreadPoolExecutor() as executor:
    t1 = executor.submit(cosine_similarities_thread, d1, 'Processed_feature')
    t2 = executor.submit(cosine_similarities_thread, d2, 'Processed_feature')
    t3 = executor.submit(cosine_similarities_thread, d3, 'Processed_feature')
    t4 = executor.submit(cosine_similarities_thread, d4, 'Processed_feature')
    t5 = executor.submit(cosine_similarities_thread, d5, 'Processed_feature')
    t6 = executor.submit(cosine_similarities_thread, d6, 'Processed_feature')
    t7 = executor.submit(cosine_similarities_thread, d7, 'Processed_feature')
    
    results = t1.result() +  t2.result() +  t3.result() + t4.result() +  t5.result() +  t6.result() +  t7.result() 
    
print("--- %s seconds ---" % (time.time() - start))

--- 22.21785306930542 seconds ---


In [40]:
results.head()

Title,#Friendbutmarried,#Friendbutmarried 2,#Imomsohard Live,#Lucky Number,#Realityhigh,#Rucker50,#cats_the_mewvie,02:22,1 Night In San Diego,10,...,Zookeeper,Zoolander,Zoolander 2,Zoom,Zoombies,Zoot Suit,Zootopia,Zozo,Zulu,Zulu Man In Japan
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#Friendbutmarried,,,,,,,,,,,...,,,,,,,,,,
#Friendbutmarried 2,,,,,,,,,,,...,,,,,,,,,,
#Imomsohard Live,,,,,,,,,,,...,,,,,,,,,,
#Lucky Number,,,,,,,,,,,...,,,,,,,,,,
#Realityhigh,,,,,,,,,,,...,,,,,,,,,,


In [41]:
results.shape

(15250, 15250)

In [19]:
def show_recommendations(movie_title: str):
    img_base_url = 'https://image.tmdb.org/t/p/w500'

    cosine_similarity_df = functions.cosine_similarities(df, 'Processed_feature')
    names = functions.get_recommendations(cosine_similarity_df, movie_title)

    fetched_img_files = []
    fetched_titles = []
    fetched_overview = []
    fetched_rating = []
    fetched_date = []

    #API CALL TO GET INFORMATION ON RECOMMENDED MOVIES
    search = tmdb.Search()
    for n in names:
        response = search.movie(query=n)
        response = response['results'][0]
        fetched_titles.append(response['title'])

    return fetched_titles                     

In [20]:
import tmdbsimple as tmdb
import os

api_key = os.getenv("MOVIE_API_KEY")
tmdb.API_KEY = api_key

In [8]:
x = show_recommendations('Avatar')
x

['Stand by Me Doraemon',
 'The Old Guard',
 'Apollo 18',
 "Last Year's Snow Was Falling",
 'Meteor Moon']

In [10]:
search = tmdb.Search()
response = search.movie(query='Stand by Me Doraemon')
response = response['results'][0]
response

{'adult': False,
 'backdrop_path': '/1aABIiqBY7yoQESE8qWvR0w9bJZ.jpg',
 'genre_ids': [16, 10751, 878, 14],
 'id': 265712,
 'original_language': 'ja',
 'original_title': 'STAND BY ME ドラえもん',
 'overview': "In the suburbs of Tokyo some time ago, there lived a clumsy boy about 10 years old. There appeared in front of him named Sewashi, Nobita's descendant of four generations later from the 22nd century, and Doraemon, a 22nd century cat-type caretaker robot who helps people with its secret gadgets. Sewashi claims that his family is suffering from the debts Nobita made even to his generation, so in order to change this disastrous future, he brought along Doraemon as Nobita's caretaker to bring happiness to his future, although Doraemon is not happy about this. And so Sewashi installed an accomplishment program into Doraemon forcing him to take care of Nobita. Unless he makes Nobita happy, Doraemon can no longer go back to the 22nd century. This is how the life of Doraemon and Nobita begins. 

In [11]:
genre_id = [16, 10751, 878, 14]

In [12]:
genre_key = {28:'Action', 12:'Adventure', 16:'Animation', 35:'Comedy', 80:'Crime',
             99:'Documentary', 18:'Drama', 10751:'Family', 14:'Fantasy', 36:'History', 
             27:'Horror', 10402:'Music', 9648:'Mystery', 10749:'Romance', 878 :'Science Fiction', 
             10770:'TV Movie', 53:'Thriller', 10752:'War',  37:'Western'}

In [17]:
genres = []
g = '' 
for k in genre_id:
    if k in genre_key:
        g += genre_key[k] +', '

In [18]:
genre

'Animation, Family, Science Fiction, Fantasy, '

In [24]:
popular_title = []
popular_rating = []
popular_poster = []
popular_date = []
genres = []


search = tmdb.Search()
for n in x:
    g = ''
    response = search.movie(query=n)
    response = response['results'][0]
    popular_rating.append(response['vote_average'])
    popular_date.append(response['release_date'].split('-')[0])
    genre_ids = response['genre_ids']
    for k in genre_ids:
        if k in genre_key:
            g += genre_key[k] +', '
    genres.append(g)

In [25]:
genres

['Animation, Family, Science Fiction, Fantasy, ',
 'Action, Fantasy, ',
 'Horror, Thriller, Science Fiction, ',
 'Animation, Comedy, Family, Fantasy, ',
 'Adventure, Science Fiction, ']