# Movie Recommendation System

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

## Defined Functions

In [2]:
def strToDict(string):
    """Parse a stringified dict, or one fragment of a stringified list of dicts.

    The input looks like ``"{'id': 16, 'name': 'Animation'}"``; a leading
    ``[``/``{`` or trailing ``}``/``]`` left over from splitting a list on
    ``'}, {'`` is tolerated.

    Parameters
    ----------
    string : str
        Text holding ``'key': value`` pairs separated by ``", '"``.

    Returns
    -------
    dict
        Keys with their quotes removed. Values made only of digits become
        ``int``; single-quoted values lose their quotes, and a quoted value
        equal to ``none`` (any case) becomes ``None``; anything else is kept
        as the raw text.
    """
    ans = {}
    # Break the text into "key': value" fragments: on braces and on the
    # ", '" sequence that precedes every key after the first one.
    for fragment in re.split(r"{|}|, '", string):
        if fragment in ('', '[', ']'):
            continue

        # Split on the FIRST colon only, so colons (and pipes / repeated
        # spaces) inside the value survive, e.g. 'Star Wars: Episode I'.
        rawKey, colon, rawValue = fragment.partition(':')
        key = rawKey.strip().strip("'")
        if not colon or not key:
            # Not a key/value pair (e.g. a stray list separator); skip it
            # instead of failing with an IndexError.
            continue

        value = rawValue.strip()
        if value.isdigit():
            value = int(value)
        elif value.startswith("'"):
            value = value[1:-1]
            if value.lower() == 'none':
                value = None
        # NOTE(review): an unquoted None token is kept as the string 'None',
        # as before; confirm whether it should map to None as well.
        ans[key] = value

    return ans

In [3]:
def listToText(li):
    """Join the ``'name'`` entry of every dict in *li* with commas.

    Parameters
    ----------
    li : list of dict, or any other value
        Parsed list of dicts, each holding a ``'name'`` key. Any non-list
        value (for example an unparsed ``''`` or ``'[]'`` cell) is treated
        as "no names".

    Returns
    -------
    str
        ``'name1,name2,...'``, or ``''`` for a non-list or an empty list.

    Raises
    ------
    KeyError
        If an element of the list has no ``'name'`` key.
    """
    if not isinstance(li, list):
        return ''
    # join() also covers the empty list, which previously raised IndexError.
    return ','.join(entry['name'] for entry in li)

In [4]:
# Generating the text feature ("Tags") used by the similarity model
def generateFeatures(moviesMetadata, keywords, metadataColumns, keywordsColumns):
    """Build one text feature per movie from overview, genres and keywords.

    Parameters
    ----------
    moviesMetadata : pandas.DataFrame
        Cleaned metadata; the columns named by ``metadataColumns[5]`` (ID),
        ``[20]`` (Title), ``[9]`` (Overview) and ``[3]`` (Genres) are used.
    keywords : pandas.DataFrame
        Cleaned keywords; merged on ``keywordsColumns[0]`` and read from
        ``keywordsColumns[1]``.
    metadataColumns, keywordsColumns : list of str
        Column names of the two frames.

    Returns
    -------
    keywordsFeature : pandas.DataFrame
        Columns ``index``, ID, Title, ``Tags`` (in that order) for every
        merged row whose ``Tags`` text is not empty.
    rowsToDrop : pandas.Series
        IDs of the merged rows that had empty ``Tags``; its index holds the
        row labels of the merged frame.
    """
    idColumn = metadataColumns[5]
    titleColumn = metadataColumns[20]
    overviewColumn = metadataColumns[9]
    genresColumn = metadataColumns[3]
    keywordsColumn = keywordsColumns[1]

    moviesMetadataFeature = moviesMetadata[[idColumn, titleColumn, overviewColumn, genresColumn]]
    keywordsFeature = pd.merge(moviesMetadataFeature, keywords, on = keywordsColumns[0])
    # Snapshot taken before 'Tags' is added, so the slice below never contains it.
    mergedColumns = list(keywordsFeature.columns)

    # Columns are addressed by name: `keywords` may carry helper columns
    # (e.g. 'index' from reset_index), which shift positional lookups and
    # previously kept the keywords out of the Tags text.
    genresText = keywordsFeature[genresColumn].apply(listToText)
    keywordsText = keywordsFeature[keywordsColumn].apply(listToText)

    # Separate the three fields with spaces so the last word of one field
    # does not fuse with the first word of the next; strip() keeps rows
    # with no text at all equal to ''.
    keywordsFeature['Tags'] = (keywordsFeature[overviewColumn] + ' ' + genresText + ' ' + keywordsText).str.strip()
    rowsToDrop = keywordsFeature.loc[keywordsFeature['Tags'] == '', mergedColumns[0]]

    # Keep only ID, Title and Tags.
    keywordsFeature.drop(columns = mergedColumns[2:], inplace = True)
    keywordsFeature.drop(index = rowsToDrop.index, inplace = True)
    # Keeps the old labels in an 'index' column, so Tags stays at position 3.
    keywordsFeature.reset_index(inplace = True)
    return keywordsFeature, rowsToDrop

## Loading and Cleaning Dataset

In [5]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.set_option('mode.chained_assignment', None)

def _parseDictList(cell):
    """Turn a stringified list of dicts into a list of dicts; '' and '[]' pass through unchanged."""
    if cell == '' or cell == '[]':
        return cell
    return [strToDict(part) for part in re.split('}, {', cell) if part != '']


def loadData(csvPath = r'D:\Machine Learning and Data Science\Projects\Movie Recommendation System\Dataset', nrows = 25000):
    """Load and clean the movies metadata and keywords CSV files.

    Parameters
    ----------
    csvPath : str, optional
        Folder containing ``movies_metadata.csv`` and ``keywords.csv``.
        Defaults to the original hard-coded dataset folder.
    nrows : int, optional
        Number of metadata rows to read (default 25000).

    Returns
    -------
    moviesMetadata : pandas.DataFrame
        Cleaned metadata. The helper columns added by the ``reset_index``
        calls ('index', and 'level_0' when both resets run) are kept on
        purpose: later cells drop them by name.
    keywords : pandas.DataFrame
        Keywords restricted to IDs present in the metadata, with an 'index'
        helper column and the keywords parsed into lists of dicts.
    metadataColumns, keywordsColumns : list of str
        Column names assigned to the two frames.
    """
    import os  # local import: only needed to build the file paths

    # Defining Columns and Data Types
    metadataColumns = ["Adult", "Collection", "Budget", "Genres", "Homepage", "ID", "IMDB ID", "Original Language", "Original Title", "Overview", "Popularity", "Poster Path", "Production Companies", "Production Countries", "Release Date", "Revenue", "Runtime", "Spoken Languages", "Status", "Tagline", "Title", "Video", "Vote Average", "Vote Count"]
    keywordsColumns = ['ID', 'Keywords']

    # Opening CSV files as Pandas Data Frame
    moviesMetadata = pd.read_csv(os.path.join(csvPath, 'movies_metadata.csv'), header = 0, names = metadataColumns, low_memory = False, nrows = nrows)
    keywords = pd.read_csv(os.path.join(csvPath, 'keywords.csv'), header = 0, names = keywordsColumns)

    # Cleaning Movies Metadata Data Frame
    moviesMetadata.fillna('', inplace = True)

    # Adult: drop rows whose flag is not a textual boolean, then convert to bool.
    adultColumn = metadataColumns[0]
    if isinstance(moviesMetadata[adultColumn].iloc[0], str):
        adultUpper = moviesMetadata[adultColumn].str.upper()
        unwantedRowsIndex = moviesMetadata[~adultUpper.isin(['FALSE', 'TRUE'])].index
        moviesMetadata.drop(index = unwantedRowsIndex, inplace = True)
        moviesMetadata.reset_index(inplace = True)
        moviesMetadata[adultColumn] = moviesMetadata[adultColumn].str.upper() == 'TRUE'

    # Whole-column assignment replaces the element-wise chained assignment
    # (frame[col][i] = ...), which does not modify the frame under pandas
    # Copy-on-Write.
    collectionColumn = metadataColumns[1]
    moviesMetadata[collectionColumn] = moviesMetadata[collectionColumn].apply(
        lambda cell: strToDict(cell) if cell != '' else cell)

    moviesMetadata[metadataColumns[2]] = moviesMetadata[metadataColumns[2]].apply(int)

    moviesMetadata[metadataColumns[3]] = moviesMetadata[metadataColumns[3]].apply(_parseDictList)

    moviesMetadata[metadataColumns[5]] = moviesMetadata[metadataColumns[5]].apply(int)

    # Popularity: rows without a value cannot be converted to float.
    unwantedRowsIndex = moviesMetadata[moviesMetadata[metadataColumns[10]] == ''].index
    moviesMetadata.drop(index = unwantedRowsIndex, inplace = True)
    moviesMetadata.reset_index(inplace = True)
    moviesMetadata[metadataColumns[10]] = moviesMetadata[metadataColumns[10]].apply(float)

    # Production Companies, Production Countries
    for listColumn in (metadataColumns[12], metadataColumns[13]):
        moviesMetadata[listColumn] = moviesMetadata[listColumn].apply(_parseDictList)

    moviesMetadata[metadataColumns[14]] = pd.to_datetime(moviesMetadata[metadataColumns[14]], format='%Y-%m-%d')
    moviesMetadata[metadataColumns[14]] = moviesMetadata[metadataColumns[14]].apply(lambda x: x.date())

    moviesMetadata[metadataColumns[15]] = moviesMetadata[metadataColumns[15]].apply(float)

    # Missing values were already replaced by '' above, so this only matters
    # if that earlier fill is ever removed.
    moviesMetadata[metadataColumns[16]] = moviesMetadata[metadataColumns[16]].fillna('None')

    # Spoken Languages
    moviesMetadata[metadataColumns[17]] = moviesMetadata[metadataColumns[17]].apply(_parseDictList)

    moviesMetadata[metadataColumns[22]] = moviesMetadata[metadataColumns[22]].apply(float)
    moviesMetadata[metadataColumns[23]] = moviesMetadata[metadataColumns[23]].apply(int)

    # Cleaning Keywords Data Frame: keep only movies that survived the
    # metadata cleaning, then parse the keyword lists.
    keywords = keywords[keywords[keywordsColumns[0]].isin(moviesMetadata[metadataColumns[5]])].reset_index()
    keywords[keywordsColumns[1]] = keywords[keywordsColumns[1]].apply(_parseDictList)

    return moviesMetadata.copy(), keywords.copy(), metadataColumns, keywordsColumns

## Making Model

In [6]:
# Read and clean both CSV files; also returns the column-name lists used to
# address the frames by position in the cells below.
moviesMetadata, keywords, metadataColumns, keywordsColumns= loadData()

In [7]:
# keyFeature: one row per movie with its 'Tags' text; rowsToDrop: IDs of the
# merged rows whose 'Tags' text was empty (removed from keyFeature).
keyFeature, rowsToDrop = generateFeatures(moviesMetadata, keywords, metadataColumns, keywordsColumns)

In [8]:
# Column labels of keyFeature; position 3 is used below to pick the text
# column fed to the vectorizer.
keyFeatureColumns = keyFeature.columns

In [9]:
# rowsToDrop holds the IDs of movies whose Tags text was empty. Its index
# refers to rows of the merged feature frame, not to rows of moviesMetadata
# or keywords, so the rows are matched by ID instead of by index label.
moviesMetadata = moviesMetadata[~moviesMetadata[metadataColumns[5]].isin(rowsToDrop)]
# Remove the helper columns left behind by earlier reset_index calls (if
# present) and renumber the rows.
moviesMetadata = moviesMetadata.drop(columns = ['level_0', 'index'], errors = 'ignore').reset_index(drop = True)

keywords = keywords[~keywords[keywordsColumns[0]].isin(rowsToDrop)]
keywords = keywords.drop(columns = ['level_0', 'index'], errors = 'ignore').reset_index(drop = True)

### Initializing and Fitting Model

In [10]:
# Bag-of-words vectorizer: English stop words are removed and the vocabulary
# size is capped at the number of rows in keyFeature.
model = CountVectorizer(max_features = keyFeature.shape[0], stop_words = 'english')

In [11]:
# Fit the vectorizer on the text column (position 3 of keyFeature, cast to
# unicode strings) and convert the result to a dense array.
# NOTE(review): the dense array has rows x vocabulary entries and may be
# large; confirm it fits in memory for the chosen row count.
vector = model.fit_transform(keyFeature[keyFeatureColumns[3]].values.astype('U')).toarray()

### Creating Cosine Similarity Matrix

In [12]:
# Pairwise cosine similarity between the rows of `vector`.
similarVector = cosine_similarity(vector)

## Dumping Processed Data

In [13]:
# Raw strings: the backslashes are Windows path separators, not escape
# sequences (non-raw '\M', '\P', '\k', ... are invalid escapes).
dataDumpFolder = r'D:\Machine Learning and Data Science\Projects\Movie Recommendation System\Processed Data'

# File names keep their leading separator because they are appended to
# dataDumpFolder by plain string concatenation.
dataDumpFiles = [r'\keyFeature.pkl', r'\similarVector.pkl', r'\moviesMetadataProcessed.pkl', r'\keywordsProcessed.pkl']

In [14]:
# Write each processed object to its own pickle file. The context manager
# closes (and flushes) every file, even if pickling fails part-way.
with open(dataDumpFolder + dataDumpFiles[0], 'wb') as dumpFile:
    pickle.dump(keyFeature, dumpFile)
with open(dataDumpFolder + dataDumpFiles[1], 'wb') as dumpFile:
    pickle.dump(similarVector, dumpFile)
with open(dataDumpFolder + dataDumpFiles[2], 'wb') as dumpFile:
    pickle.dump(moviesMetadata, dumpFile)
with open(dataDumpFolder + dataDumpFiles[3], 'wb') as dumpFile:
    pickle.dump(keywords, dumpFile)