In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#getting the data
movies_dataset = pd.read_csv("https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/movies.csv")
movies_dataset.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# getting the ratings dataset
ratings_dataset = pd.read_csv("https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/ratings.csv")
ratings_dataset.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# getting the length of the dataset
no_of_movies=len(movies_dataset)
no_of_ratings=len(ratings_dataset)
print("Total No of Movies: ",no_of_movies)
print("Total No of Ratings: ",no_of_ratings)

Total No of Movies:  9742
Total No of Ratings:  100836


In [5]:
#deleting the unnecessary columns
ratings_dataset=ratings_dataset.drop('timestamp',axis=1)
ratings_dataset.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
#merging both dataframes
new_dataset=pd.merge(ratings_dataset,movies_dataset,on='movieId')
new_dataset.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [7]:
#converting genres into diff columns
genre = new_dataset.genres
X=[]
for i in genre:
  genre_string = i
  X.append(genre_string.split('|'))

genre_df=pd.DataFrame(X)
new_genre=genre_df[0].unique()
for i in new_genre:
  new_dataset[i]=0

#printing the new dataset
new_dataset.head()

Unnamed: 0,userId,movieId,rating,title,genres,Adventure,Comedy,Action,Mystery,Crime,...,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#assign 1 to all columns which are present in the Genres
for i in new_genre:
  new_dataset.loc[new_dataset['genres'].str.contains(i), i] = 1

# also we drop the genres and title column as it is of no use
new_dataset=new_dataset.drop(['genres','title'],axis=1)
new_dataset.head()

  new_dataset.loc[new_dataset['genres'].str.contains(i), i] = 1


Unnamed: 0,userId,movieId,rating,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,...,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,1,4.0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,5,1,4.0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,7,1,4.5,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,15,1,2.5,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,17,1,4.5,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
#there will be some null values in the new_dataset we simply drop them
new_dataset.dropna(inplace=True)
new_dataset.isnull().sum()

userId                0
movieId               0
rating                0
Adventure             0
Comedy                0
Action                0
Mystery               0
Crime                 0
Thriller              0
Drama                 0
Animation             0
Children              0
Horror                0
Documentary           0
Sci-Fi                0
Fantasy               0
Film-Noir             0
Western               0
Musical               0
Romance               0
(no genres listed)    0
War                   0
dtype: int64

In [10]:
from sklearn.cluster import KMeans
from sklearn import metrics
kmeanModel = KMeans(n_clusters=9)
kmeanModel.fit(new_dataset)



In [11]:
import pickle
with open('model_pickle','wb') as file:
    pickle.dump(kmeanModel,file)

In [12]:
with open('model_pickle','rb') as file:
    mp = pickle.load(file)

In [13]:
# Creating an extra column in data for storing the cluster values
new_dataset['Cluster'] = mp.labels_
new_dataset['Cluster'].sample(n=10)

32924    0
7904     0
32676    0
14598    0
85159    0
97413    7
59935    0
96258    2
34758    0
15369    0
Name: Cluster, dtype: int32

In [14]:
new_dataset['Cluster'].value_counts()

Cluster
0    54878
7    23061
2     6396
4     4070
6     3810
1     3278
8     2980
3     1297
5     1066
Name: count, dtype: int64

In [15]:
new_dataset.head()

Unnamed: 0,userId,movieId,rating,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,...,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War,Cluster
0,1,1,4.0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,5,1,4.0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,7,1,4.5,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,15,1,2.5,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,17,1,4.5,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [16]:
# When we merged the dataframe  then for a single movie multiple rows were created
# so a single movie is allotted to  so many clusters
# so here we will give a single cluster to a movie
# the Cluster which occurs maximum number of times is alloted to the movie
new_dataframe = []
def cluster_reallocation(group_of_data):
  x = pd.DataFrame(group_of_data)
  y = pd.DataFrame(x['Cluster'].value_counts())
  z = x.index
  w = [x['movieId'][z[0]],int(y.idxmax())]
  new_dataframe.append(w)

new_dataset.groupby("movieId").apply(lambda x: cluster_reallocation(x))

  w = [x['movieId'][z[0]],int(y.idxmax())]


In [17]:
new_dataset

Unnamed: 0,userId,movieId,rating,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,...,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War,Cluster
0,1,1,4.0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,5,1,4.0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,7,1,4.5,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,15,1,2.5,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,17,1,4.5,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
100832,610,160527,4.5,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,5
100833,610,160836,3.0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,5
100834,610,163937,3.5,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5


In [18]:
# # now we will create a new dataframe involving only userid movieid and cluster
data_frame = pd.DataFrame()
new_dataframe=pd.DataFrame(new_dataframe)
new_dataframe.head()
new_dataframe.rename(columns = {0:'movieId',1:'Cluster'},inplace=True)
new_dataframe.drop_duplicates(inplace=True)
new_dataframe

Unnamed: 0,movieId,Cluster
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
9719,193581,5
9720,193583,5
9721,193585,5
9722,193587,5


In [19]:
import random
data = pd.read_csv("https://raw.githubusercontent.com/BheemisettySaiHarsha/PRML_project/main/PRML%20DATASET/movies.csv")
new_data=pd.merge(new_dataframe,data,on='movieId')
new_data.fillna(random.randint(0,9),inplace=True)
new_data

Unnamed: 0,movieId,Cluster,title,genres
0,1,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,0,Jumanji (1995),Adventure|Children|Fantasy
2,3,0,Grumpier Old Men (1995),Comedy|Romance
3,4,0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,0,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
9719,193581,5,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9720,193583,5,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9721,193585,5,Flint (2017),Drama
9722,193587,5,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [20]:
import os

# Get the directory of the current Jupyter Notebook file
current_directory = os.getcwd()

# Directory path where you want to save the pickled file
directory = os.path.join(current_directory)

# Create the directory if it doesn't exist
os.makedirs(directory, exist_ok=True)

# Save the new_dataframe DataFrame as a pickled file
new_data.to_pickle(os.path.join(directory, 'new_dataframe_pickle.pkl'))
