# K-nearest-neighbors

---

Imported Libraries

In [None]:
# Data processing
# ==================================================================================
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import json

# Modify SQL database
# ==================================================================================
import sqlite3

# Preprocessing and modeling
# ==================================================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

from sklearn.metrics.pairwise import cosine_similarity
from pickle import dump


# Warnings Configuration
# ==================================================================================
import warnings

def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn # ignore annoying warning (from sklearn and seaborn)

pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
'''NOTE: This affects only the display and not the underlying data, which remains unchanged.'''

'NOTE: This affects only the display and not the underlying data, which remains unchanged.'

---

## Step 1: Loading the dataset

In [22]:
# Loading the dataset
df = pd.read_csv("../data/processed/final_table.csv")
df.head(3)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...


---

## Step 2: Study of variables and their content

In [23]:
# Obtain dimensions

rows, columns = df.shape

print(f"The dimensions of this dataset are: {rows} Rows and {columns} Columns")

The dimensions of this dataset are: 4547 Rows and 3 Columns


In [24]:
# Obtain information about data types and non-null values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4547 entries, 0 to 4546
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4547 non-null   int64 
 1   title     4547 non-null   object
 2   tags      4544 non-null   object
dtypes: int64(1), object(2)
memory usage: 106.7+ KB


In [25]:
# Check null values

null_var = df.isnull().sum().loc[lambda x: x > 0] # Number of nulls in each variable.

num_of_null_var = len(null_var) # Number of variables with almost 1 null.

print(f"{null_var}\n\nThe number of null variables are {num_of_null_var}")

tags    3
dtype: int64

The number of null variables are 1


In [None]:
# Replace NaN with an empty string
df['tags'] = df['tags'].fillna('')

In [27]:
# Check null values

null_var = df.isnull().sum().loc[lambda x: x > 0] # Number of nulls in each variable.

num_of_null_var = len(null_var) # Number of variables with almost 1 null.

print(f"{null_var}\n\nThe number of null variables are {num_of_null_var}")

Series([], dtype: int64)

The number of null variables are 0


In [30]:
# Obtain the number of duplicate values in our dataset

print(f"The number of duplicate values in the dataset is {df.duplicated().sum()}")

The number of duplicate values in the dataset is 0


In [31]:
# Eliminate irrelevant columns

df.drop(['movie_id'],
            axis = "columns",
                inplace = True)

df.head(3)

Unnamed: 0,title,tags
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...


---

- ### 2.1 Removing spaces and converting the text to lowercase

In [32]:
df["tags"] = df["tags"].str.strip().str.lower()

- ### 2.2 Divide the dataset into train and test

In [33]:
# Train - Test - Split
# ===============================================================================
def split(variable,
           target,
             test_size=0.2,
               random_state=42):
  
  X = df[variable] # Variable
  y = df[target] # Target

  X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                         test_size = test_size,
                                                           random_state = random_state)

  return X_train, X_test, y_train, y_test

In [34]:
X_train, X_test, y_train, y_test = split('title', 'tags')

In [35]:
# Print .shape
# =====================================================================================

print("|X_train|")
print("=================================================================")
print(f"X_train shape: {X_train.shape}\n")

print("|X_test|")
print("=================================================================")
print(f"X_test shape: {X_test.shape}\n")

print("|Y_train|")
print("=================================================================")
print(f"y_train shape: {y_train.shape}\n ")

print("|Y_test|")
print("=================================================================")
print(f"y_test shape: {y_test.shape}\n")

|X_train|
X_train shape: (3637,)

|X_test|
X_test shape: (910,)

|Y_train|
y_train shape: (3637,)
 
|Y_test|
y_test shape: (910,)



In [36]:
# Convert a collection of text documents to a matrix of token counts.

vec_model = CountVectorizer(stop_words = "english") # If ‘english’, a built-in stop word list for English is used

vec_model

In [37]:
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

In [38]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [44]:
y_train = vec_model.fit_transform(y_train).toarray()
y_test = vec_model.transform(y_test).toarray()

In [45]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

---

## Step 3: Build KNN

In [46]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(X_train, y_train)

In [47]:
y_pred = model.predict(X_test)
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [48]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

ValueError: multiclass-multioutput is not supported