In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [9]:
pip install spacy



In [10]:
df = pd.read_csv("Top_rated_movies1.csv")

In [11]:
df

Unnamed: 0,id,title,overview,popularity,release_date,vote_average,vote_count
0,168705,BloodRayne,"In 18th-century Romania, after spending much o...",17.499,2005-10-22,4.105,501
1,19766,Inspector Gadget 2,"After capturing Claw, all the criminals have g...",20.772,2003-03-11,4.100,342
2,248705,The Visitors: Bastille Day,"Stuck in the corridors of time, Godefroy de Mo...",18.828,2016-03-23,4.090,636
3,17711,The Adventures of Rocky & Bullwinkle,Rocky and Bullwinkle have been living off the ...,16.436,2000-06-30,4.075,335
4,580,Jaws: The Revenge,"After another deadly shark attack, Ellen Brody...",30.996,1987-07-17,4.064,931
...,...,...,...,...,...,...,...
8826,12142,Alone in the Dark,Edward Carnby is a private investigator specia...,12.499,2005-01-28,3.228,550
8827,5491,Battlefield Earth,"In the year 3000, man is no match for the Psyc...",19.133,2000-05-12,3.221,761
8828,11059,House of the Dead,"Set on an island off the coast, a techno rave ...",10.795,2003-04-11,3.100,359
8829,14164,Dragonball Evolution,"On his 18th birthday, Goku receives a mystical...",38.646,2009-03-12,2.891,1925


In [12]:
df.shape

(8831, 7)

In [13]:
df.columns

Index(['id', 'title', 'overview', 'popularity', 'release_date', 'vote_average',
       'vote_count'],
      dtype='object')

In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define a function to tokenize and preprocess a single review
def preprocess_review(review):
    if pd.isnull(review):  # Check for NaN values
        return []  # Return empty list if review is NaN
    # Tokenize the review into words
    tokens = word_tokenize(review.lower())  # Convert to lowercase

    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

# Apply the preprocessing function to all reviews in the dataset
df['review_tokens'] = df['overview'].apply(preprocess_review)

# Display the preprocessed reviews
print(df['review_tokens'])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


0       [18th-century, romania, spending, much, life, ...
1       [capturing, claw, criminal, gone, hiding, claw...
2       [stuck, corridor, time, godefroy, de, montmira...
3       [rocky, bullwinkle, living, finance, made, rer...
4       [another, deadly, shark, attack, ellen, brody,...
                              ...                        
8826    [edward, carnby, private, investigator, specia...
8827    [year, 3000, man, match, psychlos, greedy, man...
8828    [set, island, coast, techno, rave, party, attr...
8829    [18th, birthday, goku, receives, mystical, dra...
8830    [platoon, eagle, vulture, attack, resident, sm...
Name: review_tokens, Length: 8831, dtype: object


In [15]:
df

Unnamed: 0,id,title,overview,popularity,release_date,vote_average,vote_count,review_tokens
0,168705,BloodRayne,"In 18th-century Romania, after spending much o...",17.499,2005-10-22,4.105,501,"[18th-century, romania, spending, much, life, ..."
1,19766,Inspector Gadget 2,"After capturing Claw, all the criminals have g...",20.772,2003-03-11,4.100,342,"[capturing, claw, criminal, gone, hiding, claw..."
2,248705,The Visitors: Bastille Day,"Stuck in the corridors of time, Godefroy de Mo...",18.828,2016-03-23,4.090,636,"[stuck, corridor, time, godefroy, de, montmira..."
3,17711,The Adventures of Rocky & Bullwinkle,Rocky and Bullwinkle have been living off the ...,16.436,2000-06-30,4.075,335,"[rocky, bullwinkle, living, finance, made, rer..."
4,580,Jaws: The Revenge,"After another deadly shark attack, Ellen Brody...",30.996,1987-07-17,4.064,931,"[another, deadly, shark, attack, ellen, brody,..."
...,...,...,...,...,...,...,...,...
8826,12142,Alone in the Dark,Edward Carnby is a private investigator specia...,12.499,2005-01-28,3.228,550,"[edward, carnby, private, investigator, specia..."
8827,5491,Battlefield Earth,"In the year 3000, man is no match for the Psyc...",19.133,2000-05-12,3.221,761,"[year, 3000, man, match, psychlos, greedy, man..."
8828,11059,House of the Dead,"Set on an island off the coast, a techno rave ...",10.795,2003-04-11,3.100,359,"[set, island, coast, techno, rave, party, attr..."
8829,14164,Dragonball Evolution,"On his 18th birthday, Goku receives a mystical...",38.646,2009-03-12,2.891,1925,"[18th, birthday, goku, receives, mystical, dra..."


In [16]:
print(df.isnull().sum())
df.dropna(inplace=True)

id               0
title            0
overview         1
popularity       0
release_date     0
vote_average     0
vote_count       0
review_tokens    0
dtype: int64


In [17]:
print(df.isnull().sum())

id               0
title            0
overview         0
popularity       0
release_date     0
vote_average     0
vote_count       0
review_tokens    0
dtype: int64


In [18]:
df

Unnamed: 0,id,title,overview,popularity,release_date,vote_average,vote_count,review_tokens
0,168705,BloodRayne,"In 18th-century Romania, after spending much o...",17.499,2005-10-22,4.105,501,"[18th-century, romania, spending, much, life, ..."
1,19766,Inspector Gadget 2,"After capturing Claw, all the criminals have g...",20.772,2003-03-11,4.100,342,"[capturing, claw, criminal, gone, hiding, claw..."
2,248705,The Visitors: Bastille Day,"Stuck in the corridors of time, Godefroy de Mo...",18.828,2016-03-23,4.090,636,"[stuck, corridor, time, godefroy, de, montmira..."
3,17711,The Adventures of Rocky & Bullwinkle,Rocky and Bullwinkle have been living off the ...,16.436,2000-06-30,4.075,335,"[rocky, bullwinkle, living, finance, made, rer..."
4,580,Jaws: The Revenge,"After another deadly shark attack, Ellen Brody...",30.996,1987-07-17,4.064,931,"[another, deadly, shark, attack, ellen, brody,..."
...,...,...,...,...,...,...,...,...
8826,12142,Alone in the Dark,Edward Carnby is a private investigator specia...,12.499,2005-01-28,3.228,550,"[edward, carnby, private, investigator, specia..."
8827,5491,Battlefield Earth,"In the year 3000, man is no match for the Psyc...",19.133,2000-05-12,3.221,761,"[year, 3000, man, match, psychlos, greedy, man..."
8828,11059,House of the Dead,"Set on an island off the coast, a techno rave ...",10.795,2003-04-11,3.100,359,"[set, island, coast, techno, rave, party, attr..."
8829,14164,Dragonball Evolution,"On his 18th birthday, Goku receives a mystical...",38.646,2009-03-12,2.891,1925,"[18th, birthday, goku, receives, mystical, dra..."


In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert overviews to strings (if they're not already)
df['overview'] = df['overview'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Convert overviews to TF-IDF vectors
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [20]:
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, Flatten, Input, Reshape
from keras.layers import ReLU
from keras.optimizers import Adam

# Define GAN architecture
def build_generator(latent_dim):
    model = Sequential()
    model.add(Dense(256, input_dim=latent_dim, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(tfidf_matrix.shape[1], activation='sigmoid'))
    return model

def build_discriminator(input_shape):
    model = Sequential()
    model.add(Dense(512, input_dim=input_shape, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

# Build and compile the discriminator
discriminator = build_discriminator(tfidf_matrix.shape[1])
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Build and compile the generator
latent_dim = 100
generator = build_generator(latent_dim)
generator.compile(loss='binary_crossentropy', optimizer=Adam())

# Combined model (stacked generator and discriminator)
discriminator.trainable = False
gan_input = Input(shape=(latent_dim,))
generated_text = generator(gan_input)
gan_output = discriminator(generated_text)
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam())

In [21]:
import numpy as np

# Define a function to generate random noise for the generator
def generate_noise(n_samples, latent_dim):
    return np.random.normal(0, 1, (n_samples, latent_dim))

# Define a function to train the GAN model
def train_gan(generator, discriminator, gan, X_train, epochs, batch_size, latent_dim):
    # Define labels for real and fake data
    real_labels = np.ones((batch_size, 1))
    fake_labels = np.zeros((batch_size, 1))

    # Start training
    for epoch in range(epochs):
        # Train discriminator
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_data = X_train[idx]
        noise = generate_noise(batch_size, latent_dim)
        fake_data = generator.predict(noise)

        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(fake_data, fake_labels)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train generator
        noise = generate_noise(batch_size, latent_dim)
        g_loss = gan.train_on_batch(noise, real_labels)

        # Print progress
        print(f"Epoch {epoch+1}/{epochs} [D loss: {d_loss[0]}, acc.: {100 * d_loss[1]}%] [G loss: {g_loss}]")

# Train the GAN model
epochs = 1000
batch_size = 32
latent_dim = 100

# Convert TF-IDF matrix to numpy array
X_train = tfidf_matrix.toarray()

# Reshape input for generator
X_train = X_train.reshape(X_train.shape[0], -1)

# Normalize input between -1 and 1
X_train = (X_train - 0.5) / 0.5

# Train the GAN
train_gan(generator, discriminator, gan, X_train, epochs, batch_size, latent_dim)

Epoch 1/1000 [D loss: 0.30712634325029126, acc.: 100.0%] [G loss: 37.389862060546875]
Epoch 2/1000 [D loss: 7.041861939480892e-14, acc.: 100.0%] [G loss: 20.721797943115234]
Epoch 3/1000 [D loss: 0.005518619902431965, acc.: 100.0%] [G loss: 52.021507263183594]
Epoch 4/1000 [D loss: 7.225906514129294e-38, acc.: 100.0%] [G loss: 121.97049713134766]
Epoch 5/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 177.3511505126953]
Epoch 6/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 214.01243591308594]
Epoch 7/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 230.67726135253906]
Epoch 8/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 229.18927001953125]
Epoch 9/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 209.65374755859375]
Epoch 10/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 181.0997772216797]
Epoch 11/1000 [D loss: 0.0, acc.: 100.0%] [G loss: 144.24588012695312]
Epoch 12/1000 [D loss: 3.8639095569630354e-38, acc.: 100.0%] [G loss: 109.49986267089844]
Epoch 13/1000 [D loss: 1.0189923205764085e-28, acc.: 100.0%] [G loss: 7

In [22]:
def generate_movie_descriptions(generator, num_samples):
    random_latent_vectors = np.random.normal(size=(num_samples, latent_dim))
    generated_descriptions = generator.predict(random_latent_vectors)
    return generated_descriptions

# 4. Recommend movies
def recommend_movies(generated_descriptions, df, top_n=3):
    # Convert generated descriptions back to text
    # Recommend movies based on the generated descriptions
    # This could involve finding the closest matches in the original dataset
    # using similarity measures like cosine similarity

    # For simplicity, let's just return top_n movies from the dataset based on popularity
    sorted_movies = df.sort_values(by='popularity', ascending=False)
    recommended_movies = sorted_movies.head(top_n)
    return recommended_movies

# Example usage
num_samples = 10
generated_descriptions = generate_movie_descriptions(generator, num_samples)
recommended_movies = recommend_movies(generated_descriptions, df)
print("Recommended movies:")
print(recommended_movies)

Recommended movies:
          id        title                                           overview  \
266   872585  Oppenheimer  The story of J. Robert Oppenheimer's role in t...   
2539  670292  The Creator  Amid a future war between the human race and t...   
2522  670292  The Creator  Amid a future war between the human race and t...   

      popularity release_date  vote_average  vote_count  \
266     1766.305   2023-07-19         8.163        4905   
2539    1622.502   2023-09-27         7.149        1114   
2522    1622.502   2023-09-27         7.156        1128   

                                          review_tokens  
266   [story, j., robert, oppenheimer, 's, role, dev...  
2539  [amid, future, war, human, race, force, artifi...  
2522  [amid, future, war, human, race, force, artifi...  


In [23]:
import pandas as pd

def recommend_movies_by_genre(genre, df, top_n=3):
    # Filter movies by the requested genre
    genre_movies = df[df['overview'].str.lower().str.contains(genre.lower())]

    # Sort genre movies by popularity
    genre_movies = genre_movies.sort_values(by='popularity', ascending=False)

    # Get the top N recommendations
    top_recommendations = genre_movies.head(top_n)

    return top_recommendations[['title', 'overview', 'popularity']]

# Example usage
genre = 'horror'
top_recommendations = recommend_movies_by_genre(genre, df)
print("Top 3 recommended horror movies based on popularity:")
print(top_recommendations)

Top 3 recommended horror movies based on popularity:
                               title  \
8259               Texas Chainsaw 3D   
789   All Quiet on the Western Front   
5517            Annabelle Comes Home   

                                               overview  popularity  
8259  A young woman learns that she has inherited a ...     236.280  
789   Paul Baumer and his friends Albert and Muller,...      89.870  
5517  Determined to keep Annabelle from wreaking mor...      75.538  


In [24]:
def search_movie(generator, partial_overview, num_samples=5):
    # Generate movie descriptions using the GAN model
    generated_descriptions = generate_movie_descriptions(generator, num_samples)

    # Filter the generated descriptions based on the partial overview
    matched_descriptions = []
    for desc in generated_descriptions:
        # Convert each element to a string before joining them
        desc_str = ' '.join([str(item) for item in desc])
        if partial_overview.lower() in desc_str.lower():
            matched_descriptions.append(desc_str)

    # Print the matched descriptions
    print("Generated descriptions matching '{}' in the overview:".format(partial_overview))
    for idx, desc in enumerate(matched_descriptions, start=1):
        print("{}. {}".format(idx, desc))

    if not matched_descriptions:
        print("No matching descriptions found.")

# Example usage
partial_overview = "Stuck in the corridors of time"
num_samples = 10
search_movie(generator, partial_overview, num_samples)


Generated descriptions matching 'Stuck in the corridors of time' in the overview:
No matching descriptions found.


In [25]:
# Function to suggest movie titles based on the first letter
def suggest_movie_titles(df, first_letter):
    suggested_titles = []
    for title in df['title']:
        # Convert the title to lowercase and check the first letter
        if title.lower().startswith(first_letter.lower()):
            suggested_titles.append(title)
    return suggested_titles

# Example usage
first_letter = "san"  # Change this to the desired starting letter
suggested_titles = suggest_movie_titles(df, first_letter)
if suggested_titles:
    print("Suggested movie titles starting with '{}':".format(first_letter))
    for title in suggested_titles:
        print(title)
else:
    print("No movie titles found starting with '{}'. Try another letter.".format(first_letter))

Suggested movie titles starting with 'san':
Sansho the Bailiff
Sanjuro
Santa Claus Is a Stinker
Santa Sangre
Sand Castle
San Andreas
Sanctum
Sandy Wexler
