<a href="https://colab.research.google.com/github/DIVIJGOYAL7080/Google-collab/blob/main/content_based_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install the Kaggle CLI. %pip (unlike !pip) always installs into the
# environment of the running kernel; -q keeps the install log out of the
# saved notebook output.
%pip install -q kaggle



In [None]:
# Upload kaggle.json (the Kaggle API credentials file) from the local machine.
from google.colab import files

# Bind the result to a name instead of leaving the call as the cell's last
# expression: files.upload() returns the uploaded file contents, so a bare call
# would echo the API key into the notebook's saved output.
uploaded = files.upload()

In [None]:
# Create the hidden ~/.kaggle directory (-p: no error if it already exists),
# copy the uploaded kaggle.json into it, and restrict the file to
# owner-only read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the tmdb/tmdb-movie-metadata dataset with the Kaggle CLI
!kaggle datasets download -d tmdb/tmdb-movie-metadata

In [None]:
# List the files in the current working directory
!ls

In [None]:
# Extract the downloaded archive. -o overwrites existing files without
# asking, so re-running this cell does not stop at an interactive
# "replace file?" prompt.
!unzip -o tmdb-movie-metadata.zip

In [None]:
# The same files are also available from a GitHub repository.
# Uncomment the command below if you are unable to download them from Kaggle.
# !git clone https://github.com/procodingclass/kaggle-movie-data.git

In [None]:
# Load the two TMDB CSV files into DataFrames:
# df1 holds the credits file, df2 the movies file.
import pandas as pd
df1 = pd.read_csv('tmdb_5000_credits.csv')
df2 = pd.read_csv('tmdb_5000_movies.csv')

In [None]:
# Preview the first 5 rows of df1 (credits)
df1.head()

In [None]:
# Preview the first 5 rows of df2 (movies)
df2.head()

In [None]:
# Rename df1's four columns so that its key column is called 'id', then join
# df1 onto df2 on that key.
# NOTE(review): 'tittle' looks deliberately misspelled so that the merged frame
# keeps a single, unsuffixed 'title' column (later cells read df2['title']) —
# confirm before "fixing" the spelling.
df1.columns = ['id','tittle','cast','crew']
df2= df2.merge(df1,on='id')


# Show the first 5 rows of the merged frame
df2.head(5)

In [None]:
# C: mean of the 'vote_average' column over all movies; it is the default
# value of the C term in weighted_rating.
C = df2['vote_average'].mean()
print(C)

In [None]:
# m: 90th percentile of 'vote_count'; used as the minimum number of votes a
# movie needs to be kept in q_movies, and as the default m in weighted_rating.
m = df2['vote_count'].quantile(0.9)
print(m)

In [None]:
# Keep only the movies with at least m votes. Filter first and copy the
# filtered result, rather than deep-copying the whole frame only to discard
# most of its rows. The explicit copy makes q_movies independent of df2, so
# columns added to it later never touch df2.
q_movies = df2.loc[df2['vote_count'] >= m].copy()
print(q_movies.shape)

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating of ``x``.

    The rating ``R`` of ``x`` is blended with the prior rating ``C``; the more
    votes ``v`` there are relative to ``m``, the more ``R`` dominates:

        score = v / (v + m) * R + m / (m + v) * C

    Args:
        x: row (or any mapping) providing 'vote_count' and 'vote_average'.
        m: vote-count weight of the prior; defaults to the module-level ``m``
            as it was when this function was defined.
        C: prior rating; defaults to the module-level ``C`` as it was when
            this function was defined.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return own_weight * rating + prior_weight * C


# Score every qualifying movie; weighted_rating is called once per row (axis=1)
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)


In [None]:
# Rank the qualifying movies from highest to lowest score and show the top 10
q_movies = q_movies.sort_values('score', ascending=False)
q_movies[['original_title', 'vote_count', 'vote_average', 'score']].head(10)

In [None]:
import plotly.express as px


# Horizontal bar chart of the 10 best-scoring movies (q_movies is already
# sorted by score, descending). The rows are re-sorted ascending because
# plotly lays out the categories of a horizontal bar chart bottom-up, which
# puts the best movie at the top.
top_movies = q_movies.head(10).sort_values('score', ascending=True)

# A title and readable axis labels let the figure stand on its own.
fig = px.bar(
    top_movies,
    x="score",
    y="original_title",
    orientation='h',
    title="Top 10 movies by weighted rating",
    labels={"score": "Weighted rating", "original_title": "Movie"},
)
fig.show()


In [None]:
# Preview the columns that the content-based features are built from
df2[['title', 'cast', 'crew', 'keywords', 'genres']].head(3)

In [None]:
from ast import literal_eval


def _parse_literal(value):
    """Parse a stringified Python literal; return any non-string unchanged."""
    # Only strings need parsing. Passing other values through keeps this cell
    # safe to re-run once the columns have been converted (literal_eval raises
    # on input that is already a list).
    return literal_eval(value) if isinstance(value, str) else value


# These columns hold Python literals stored as strings (lists of dicts, judging
# by how later cells index them); turn them into real Python objects.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(_parse_literal)


df2.dtypes


In [None]:
import numpy as np

def get_director(x):
    """Return the name of the first entry in ``x`` whose job is 'Director'.

    Args:
        x: iterable of dicts, each with at least 'job' and 'name' keys.

    Returns:
        The matching entry's 'name', or ``np.nan`` if no entry matches.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    return next(directors, np.nan)


# Derive a 'director' column from each movie's crew (NaN when no director is found)
df2['director'] = df2['crew'].apply(get_director)

In [None]:
def get_list(x):
    """Return the 'name' of every entry in ``x``, in order.

    Args:
        x: expected to be a list of dicts that each have a 'name' key.

    Returns:
        The list of names, or an empty list when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x]


# Replace each list of dicts in these columns with the plain list of its names
features = ['cast', 'keywords', 'genres']
for feature in features:
   df2[feature] = df2[feature].apply(get_list)


In [None]:
# Check the result: the derived director column and the name lists
df2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

In [None]:
def clean_data(x):
    """Lower-case ``x`` and remove every space from it.

    A list is cleaned element by element and a string directly, so that e.g.
    'Tom Hanks' becomes the single token 'tomhanks'. Any other value
    (such as NaN) is replaced by the empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''


# Normalise every feature column: lower-case, no spaces (non-string, non-list values become '')
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
   df2[feature] = df2[feature].apply(clean_data)

In [None]:
def create_soup(x):
    """Build the space-separated "soup" string describing one movie.

    The parts appear in the order keywords, cast, director, genres.
    ``x['director']`` must be a string; the other three must be iterables of
    strings.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build the soup string for every movie (create_soup is called once per row)
df2['soup'] = df2.apply(create_soup, axis=1)


In [None]:
# Turn every soup string into a vector of token counts, ignoring English stop words
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [None]:
# Pairwise cosine similarity between all soup vectors: entry [i][j] compares
# the movies in rows i and j of count_matrix (same row order as df2['soup'])
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Make the row labels 0..n-1 so they can be used as positions in the
# similarity matrix. drop=True discards the old index instead of inserting it
# as an extra column, which also keeps this cell safe to re-run.
df2 = df2.reset_index(drop=True)

# Reverse lookup: movie title -> row position. When a title occurs more than
# once, only its first occurrence is kept so that indices[title] is always a
# single integer rather than a Series.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Relies on two module-level objects: ``indices`` (title -> row position)
    and ``df2`` (the movie table, whose 'title' column is returned).

    Args:
        title: movie title to look up in ``indices``.
        cosine_sim: square similarity matrix; row/column ``i`` belongs to the
            movie at position ``i`` of ``df2``.
        top_n: number of recommendations to return (default 10).

    Returns:
        A Series with the titles of the ``top_n`` most similar movies, most
        similar first. The queried movie itself is never included.

    Raises:
        KeyError: if ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Movie title not found in the dataset: {title!r}")

    idx = indices[title]
    # Several movies can share one title, in which case the lookup yields a
    # Series of positions instead of a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the queried movie by position rather than assuming it sorts
    # first: another movie with an identical feature string ties with it, and
    # slicing off "the first result" could then drop the wrong movie.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Stable sort: equally similar movies keep their dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return df2['title'].iloc[movie_indices]

In [None]:
# Example: the movies most similar to 'Fight Club'
get_recommendations('Fight Club', cosine_sim2)

In [None]:
# Example: the movies most similar to 'The Shawshank Redemption'
get_recommendations('The Shawshank Redemption', cosine_sim2)

In [None]:
# Example: the movies most similar to 'The Godfather'
get_recommendations('The Godfather', cosine_sim2)