# K-nearest-neighbors

---

Imported Libraries

In [None]:
# Data processing
# ==================================================================================
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import json

# Modify SQL database
# ==================================================================================
import sqlite3

# Preprocessing and modeling
# ==================================================================================
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pickle import dump


# Warnings Configuration
# ==================================================================================
import warnings

def ignore_warn(*args, **kwargs):
    """No-op that accepts and discards any arguments.

    Kept only so existing references to this name keep working; it is no
    longer installed over ``warnings.warn``.
    """
    pass

# Silence noisy library warnings (e.g. from sklearn and seaborn) through the
# official filter API. Overwriting ``warnings.warn`` replaced a stdlib function
# for the whole kernel, broke ``warnings.catch_warnings(record=True)`` and still
# missed modules that had already done ``from warnings import warn``.
warnings.filterwarnings("ignore")

# Render floats with three decimal places in DataFrame/Series output.
pd.set_option('display.float_format', '{:.3f}'.format)
# The bare string below is the last expression of the cell, so it is echoed as the cell output.
'''NOTE: This affects only the display and not the underlying data, which remains unchanged.'''

'NOTE: This affects only the display and not the underlying data, which remains unchanged.'

---

## Step 1: Loading the dataset

In [2]:
# Load the processed movie table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/final_table.csv")
# Preview the first three rows to check that the columns were read as expected.
df.head(3)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


---

## Step 2: Study of variables and their content

In [3]:
# Report how many rows and columns were loaded.
rows = df.shape[0]
columns = df.shape[1]
print(f"The dimensions of this dataset are: {rows} Rows and {columns} Columns")

The dimensions of this dataset are: 4547 Rows and 3 Columns


In [4]:
# Print each column's dtype and non-null count; a non-null count below the
# number of rows reveals columns with missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4547 entries, 0 to 4546
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4547 non-null   int64 
 1   title     4547 non-null   object
 2   tags      4544 non-null   object
dtypes: int64(1), object(2)
memory usage: 106.7+ KB


In [5]:
# Count missing values per column and keep only the columns that have any.
null_var = df.isnull().sum()
null_var = null_var[null_var > 0]

# Number of columns with at least one null.
num_of_null_var = len(null_var)

print(f"{null_var}\n\nThe number of null variables are {num_of_null_var}")

tags    3
dtype: int64

The number of null variables are 1


In [6]:
# Replace NaN with an empty string so that every entry in 'tags' is a string
# before the column is handed to the text vectorizer.
df['tags'] = df['tags'].fillna('')

In [7]:
# Re-run the null check to confirm that no column has missing values left.
null_var = df.isnull().sum()
null_var = null_var[null_var > 0]

# Number of columns with at least one null.
num_of_null_var = len(null_var)

print(f"{null_var}\n\nThe number of null variables are {num_of_null_var}")

Series([], dtype: int64)

The number of null variables are 0


In [8]:
# Count rows that are exact duplicates of an earlier row.
n_duplicates = df.duplicated().sum()
print(f"The number of duplicate values in the dataset is {n_duplicates}")

The number of duplicate values in the dataset is 0


In [9]:
# Eliminate irrelevant columns.
# Rebind `df` instead of mutating it in place, and tolerate the column already
# being gone, so that re-running this cell does not raise KeyError.
df = df.drop(columns = ['movie_id'], errors = 'ignore')

df.head(3)

Unnamed: 0,title,tags
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...


---

## Step 3: Build KNN

In [None]:
# Turn each movie's tag text into a TF-IDF feature vector (default settings).
vectorizer = TfidfVectorizer()
# X has one row per row of df, in the same order; later cells rely on that alignment.
X = vectorizer.fit_transform(df['tags'])

In [None]:
# Nearest-neighbour index over the TF-IDF vectors using cosine distance.
# NOTE(review): n_neighbors = 6 presumably means "the query itself + 5 recommendations" - confirm.
# NOTE(review): recommend() below reads the `similarity` matrix, not this model.
model = NearestNeighbors(n_neighbors = 6, metric = 'cosine')
model.fit(X)

In [None]:
# Pairwise cosine similarity between every pair of rows of X; entry [i, j]
# compares movie i with movie j (row order follows df).
similarity = cosine_similarity(X)

def recommend(movie, n_recommendations = 5):
    """Print the titles of the movies whose tags are most similar to ``movie``.

    Relies on the notebook-level ``df`` (with a ``title`` column) and on the
    ``similarity`` matrix, whose rows are expected to follow the row order of
    ``df``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df["title"]``. If several rows share the
        title, the first one is used.
    n_recommendations : int, default 5
        Maximum number of titles to print.

    Raises
    ------
    IndexError
        If ``movie`` is not present in ``df["title"]``.
    """
    # Use row positions rather than index labels: `similarity` is positional,
    # so labels would be wrong as soon as df does not carry a default RangeIndex.
    matches = np.flatnonzero((df["title"] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie {movie!r} not found in the dataset")
    movie_index = int(matches[0])

    scores = similarity[movie_index]

    # Stable descending sort: equally similar movies keep their dataset order.
    ranked = sorted(range(len(scores)), key = lambda pos: scores[pos], reverse = True)

    # Drop the query explicitly. It is not guaranteed to rank first (another
    # movie can tie with it, or its row can be all zeros when its tags are
    # empty), so slicing off the first element could recommend the movie itself.
    movie_list = [pos for pos in ranked if pos != movie_index][:max(n_recommendations, 0)]

    for pos in movie_list:
        print(df["title"].iloc[pos])

In [19]:
# Example query: print the recommendations for one title from the dataset.
recommend('The Shawshank Redemption')

Civil Brand
Prison
Penitentiary
Escape Plan
Fortress
