In [316]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from spacy.lang.en import English
import numpy as np
from sklearn.metrics import precision_score, recall_score
 

In [317]:
movies=pd.read_csv('data.csv')


In [318]:
new_movies=movies[['id', 'title', 'genre', 'overview', ]]
df = pd.DataFrame(new_movies)
df

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...
...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...


In [319]:
df['tag'] = df['genre'].str.cat(df['overview'], sep=' ')
df

Unnamed: 0,id,title,genre,overview,tag
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...,"Drama,Crime In the continuing saga of the Corl..."
...,...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy","The story follows the adventures of Aang, a yo...","Action,Adventure,Fantasy The story follows the..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",The sharks take bite out of the East Coast whe...,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,War","During World War II, a brave, patriotic Americ...","Action,Science Fiction,War During World War II..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",A man named Farmer sets out to rescue his kidn...,"Adventure,Fantasy,Action,Drama A man named Far..."


In [320]:
df = df.drop(columns=['genre', 'overview'])
df = df.dropna(subset=['tag'])
df.reset_index(drop=True)
df['index'] = range(len(df))
df['tag']=df['tag'].str.lower()
df['title']=df['title'].str.lower()

In [321]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('english')

# Initialize SpaCy
nlp = spacy.load('en_core_web_sm')


# Check the DataFrame columns
print("DataFrame columns:", df.columns)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dudda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dudda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dudda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


DataFrame columns: Index(['id', 'title', 'tag', 'index'], dtype='object')


In [322]:
# NLTK tokenization, stemming, and lemmatization using lambda functions
lemmatizer = nltk.WordNetLemmatizer()
stemmer = PorterStemmer()

nltk_tokenize_stem_and_lemmatize = lambda text: [
    (token, stemmer.stem(token), lemmatizer.lemmatize(token)) for token in word_tokenize(text)
]

In [323]:
# SpaCy tokenization and lemmatization using lambda functions
spacy_tokenize_and_lemmatize = lambda text: [
    (token.text, token.lemma_) for token in nlp(text)
]


In [324]:
# Apply tokenization, stemming, and lemmatization to DataFrame
df['nltk_processed'] = df['tag'].apply(nltk_tokenize_stem_and_lemmatize)  


In [325]:
# # Print results
# print("Tokenization, Stemming, and Lemmatization using NLTK:")
# for _, row in df.iterrows():
#     print(f"Original: {row['tag']}")  
#     print(f"NLTK Processed: {row['nltk_processed']}\n")

# print("Tokenization and Lemmatization using SpaCy:")
# for _, row in df.iterrows():
#     print(f"Original: {row['tag']}")  
#     print(f"SpaCy Processed: {row['spacy_processed']}\n")

In [326]:
# Split the data into train and test sets
train_df, test_df, train_index, test_index = train_test_split(df, df.index, test_size=0.2, random_state=42)

In [327]:
vectorizer = TfidfVectorizer()
tfidf_matrix_train = vectorizer.fit_transform(train_df['tag'])
tfidf_matrix_test = vectorizer.transform(test_df['tag'])
# tfidf_matrix_test

In [328]:
# Initialize the k-NN model
k = 5  # Number of neighbors to use
knn_model = NearestNeighbors(n_neighbors=k, metric='cosine')

# Fit the model on the training data
knn_model.fit(tfidf_matrix_train)

# Function to get k-nearest neighbors
def get_k_nearest_neighbors(model, query_vector):
    distances, indices = model.kneighbors(query_vector)
    return indices[0]


In [329]:
# Function to recommend movies
def recommend_movies(movie_title, k):
    if movie_title not in df['title'].values:
        raise ValueError(f"Movie '{movie_title}' not found in the dataset.")
    
    # Find the index of the movie in the dataset
    movie_index = df[df['title'] == movie_title].index[0]
    
    # Get the TF-IDF vector for the movie
    query_vector = vectorizer.transform([df.iloc[movie_index]['tag']])
    
    
    # Get k-nearest neighbors
    neighbor_indices = get_k_nearest_neighbors(knn_model, query_vector)
    
    # Get recommended movie titles
    recommended_movies = df.iloc[neighbor_indices]['title']
    return recommended_movies

In [338]:
try:
    movie_title = 'dilwale dulhania le jayenge'
    recommended_movies = recommend_movies(movie_title, k)
    print("Recommended movies:\n", recommended_movies)
except ValueError as e:
    print(e)


Recommended movies:
 6512           city of the living dead
2233                   victor/victoria
4170                 we're the millers
2265    planes, trains and automobiles
731                 a walk to remember
Name: title, dtype: object
