### Doc

This module of the system compares two movies in terms of their characters. Given two movies, the characters and their prominent traits are fetched from the word-vector file. Using the word vector of each character, the cosine similarity is computed between every pair of characters across the two movies. Then, for each character of the first movie, the most similar character from the second movie is reported. <br>

##### Input

Enter two movie names

##### Output

An n x 2 array pairing each character of movie 1 with its equivalent (most similar) character from movie 2, where n is the number of movie 1 characters with traits.

In [1]:
import pandas as pd
import numpy as np
import math
from typing import List


In [2]:
Vector = List[float]


def vector_len(v: Vector) -> float:
    """Return the Euclidean (L2) norm of ``v``."""
    return math.sqrt(sum(x * x for x in v))


def dot_product(v1: Vector, v2: Vector) -> float:
    """Return the dot product of two vectors of equal length.

    Raises:
        AssertionError: if ``v1`` and ``v2`` differ in length.
    """
    # Without this check ``zip`` would silently truncate to the shorter vector.
    assert len(v1) == len(v2)
    return sum(x * y for x, y in zip(v1, v2))


def cosine_similarity(v1: Vector, v2: Vector) -> float:
    """
    Returns the cosine of the angle between the two vectors.
    Results range from -1 (very different) to 1 (very similar).

    The angle is undefined when either vector has zero length; in that
    case 0.0 is returned instead of dividing by zero.

    Raises:
        AssertionError: if ``v1`` and ``v2`` differ in length.
    """
    # Compute the dot product first so that a length mismatch is always
    # reported, even when one of the vectors is all zeros.
    dot = dot_product(v1, v2)
    norm_product = vector_len(v1) * vector_len(v2)
    if norm_product == 0:
        return 0.0
    return dot / norm_product

def replace1(row, dim=200):
    """Convert the ``WordVec`` cell of ``row`` into a list of floats.

    The cell is expected to be a string of whitespace-separated numbers.
    The literal ``'0'`` is replaced by a zero vector of length ``dim``.
    An empty or whitespace-only cell gets the same zero padding, so the
    function always returns a list.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'WordVec' key.
        dim: Length of the zero vector used as padding.

    Returns:
        A list of floats parsed from the cell, or ``dim`` zeros.

    Raises:
        ValueError: if a token in the cell cannot be parsed as a float.
    """
    tokens = row['WordVec'].split()
    # No usable numbers (blank cell) or the '0' placeholder: pad with zeros.
    if not tokens or tokens == ['0']:
        return [0.0] * dim
    return [float(token) for token in tokens]



In [3]:
class character_similarity():
    """Match each character of one movie to its closest character in another.

    The constructor loads the word-vector CSV, keeps the rows whose ``Char``
    value has the form ``<character>_<movie>``, and stores the rows of the
    two requested movies in ``datamovie1`` and ``datamovie2``.
    """

    # Location of the word-vector CSV used when no explicit path is given.
    DEFAULT_DATA_PATH = "../Bollywood-Data-master/wikipedia-data/avg_wv_relation.csv"

    def __init__(self, moviename1, moviename2, data_path=None):
        """Load the word vectors and select the rows of both movies.

        Args:
            moviename1: Movie whose characters are to be matched.
            moviename2: Movie providing the candidate matches.
            data_path: Optional path of the CSV file; defaults to
                ``DEFAULT_DATA_PATH``.
        """
        self.moviename1 = moviename1
        self.moviename2 = moviename2
        if data_path is None:
            data_path = self.DEFAULT_DATA_PATH
        data = pd.read_csv(data_path)

        # Remove NaN values
        data1 = data.dropna()

        # Remove entries that do not have character names, i.e. every row
        # whose 'Char' value has no '_' separator. A boolean mask handles
        # zero, one or many such rows alike.
        has_separator = data1['Char'].str.contains('_', regex=False, na=False)
        data1 = data1[has_separator].copy()

        # Create two more columns
        data1['Movie'] = data1['Char'].apply(lambda s: s.split('_')[1])
        data1['Character'] = data1['Char'].apply(lambda s: s.split('_')[0])
        # Parse the textual word vector of every row into a list of floats.
        data1['WordVec'] = data1.apply(replace1, axis=1)
        data1 = data1.reset_index()
        self.datamovie1 = data1[data1.Movie == moviename1]
        self.datamovie2 = data1[data1.Movie == moviename2]

    def compute_similarity(self):
        """Return the most similar movie-2 character for each movie-1 character.

        Returns:
            A pandas Series indexed by the movie-1 character names (index
            name 'names') whose values are movie-2 character names. The
            Series is empty when either movie has no rows.
        """
        datamovie1 = self.datamovie1
        datamovie2 = self.datamovie2
        names = pd.Index(datamovie1['Character'].tolist(), name='names')

        # Nothing to compare: return an empty result instead of failing
        # later on an empty similarity table.
        if len(datamovie1) == 0 or len(datamovie2) == 0:
            return pd.Series([], index=names[:0], dtype=object)

        movie2names = datamovie2['Character'].tolist()
        # The candidate vectors do not change per character; build them once.
        candidates = [np.array(vec) for vec in datamovie2['WordVec'].tolist()]

        # One row per movie-1 character, one column per movie-2 character.
        scores = [
            [cosine_similarity(np.array(vec), other) for other in candidates]
            for vec in datamovie1['WordVec'].tolist()
        ]
        dataf = pd.DataFrame(scores, index=names, columns=movie2names, dtype=float)
        # For every row pick the column label holding the highest similarity.
        return dataf.idxmax(axis=1)



In [4]:
# Load the word vectors and select the characters of 'Khilona' and 'Geet'.
x = character_similarity('Khilona', 'Geet')
# Map each 'Khilona' character to its most similar 'Geet' character.
y = x.compute_similarity().to_dict()
# Bare expression: shows the mapping as this notebook cell's output.
y

{'Radha': 'Dindayal',
 'Bihari': 'Sarju',
 'Chand': 'Janki',
 'Singh': 'Sarju',
 'Vijaykamal': 'Sarju'}