In [None]:
pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163760 sha256=c747eda17b4ee441e012a8c6b6288e76614c28c51f18b28b288b4e24a1252633
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.cluster import DBSCAN

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the data
# files referenced by the later cells are reachable from the kernel.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the three "::"-delimited .dat tables from Drive. Column names are
# supplied explicitly, and the multi-character separator is why the python
# parsing engine is requested.
column_names = ['User_ID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv(
    "/content/drive/MyDrive/ml-1m/ratings.dat",
    names=column_names,
    sep="::",
    engine='python',
)

column_names = ['User_ID', 'Gender', 'Age', 'Occupation', 'Zip-code']
users = pd.read_csv(
    "/content/drive/MyDrive/ml-1m/users.dat",
    names=column_names,
    sep="::",
    engine='python',
)

# The movies table is the only one decoded as latin-1.
column_names = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv(
    "/content/drive/MyDrive/ml-1m/movies.dat",
    names=column_names,
    sep="::",
    encoding='latin-1',
    engine='python',
)

In [None]:
# Preview the first rows of the freshly loaded movies table.
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Report the per-column count of missing values for each loaded table,
# in the order ratings, users, movies.
for table in (ratings, users, movies):
    print(table.isnull().sum())

User_ID      0
MovieID      0
Rating       0
Timestamp    0
dtype: int64
User_ID       0
Gender        0
Age           0
Occupation    0
Zip-code      0
dtype: int64
MovieID    0
Title      0
Genres     0
dtype: int64


In [None]:
# Break the pipe-delimited genre string into one column per position.
# The six target names must match the number of columns the expanded split
# produces, otherwise the assignment fails.
genre_slot_names = ['Genre 1', 'Genre 2', 'Genre 3', 'Genre 4', 'Genre 5', 'Genre 6']
genre_parts = movies['Genres'].str.split("|", expand=True)
movies[genre_slot_names] = genre_parts
movies.head()

Unnamed: 0,MovieID,Title,Genres,Genre 1,Genre 2,Genre 3,Genre 4,Genre 5,Genre 6
0,1,Toy Story (1995),Animation|Children's|Comedy,Animation,Children's,Comedy,,,
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Adventure,Children's,Fantasy,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama,Comedy,Drama,,,,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,


In [None]:
# Only the first four genre slots are kept: the fifth and sixth are filled in
# for very few movies.
movies = movies.drop(columns=['Genre 5', 'Genre 6'])
movies.head()

Unnamed: 0,MovieID,Title,Genres,Genre 1,Genre 2,Genre 3,Genre 4
0,1,Toy Story (1995),Animation|Children's|Comedy,Animation,Children's,Comedy,
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Adventure,Children's,Fantasy,
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,
3,4,Waiting to Exhale (1995),Comedy|Drama,Comedy,Drama,,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,


In [None]:
# Pull the Title column out as a standalone array of movie titles.
movies_titles = movies['Title'].values

In [None]:
# Persist the titles array to Drive (np.save appends the ".npy" extension).
# NOTE(review): the titles are presumably stored as an object-dtype array, in
# which case loading needs np.load(..., allow_pickle=True) — confirm.
np.save('/content/drive/MyDrive/ml-1m/movies_titles',movies_titles)

In [None]:
# The combined genre string is redundant once it has been split into the
# per-slot columns, so remove it.
movies = movies.drop(columns=['Genres'])
movies.head()

Unnamed: 0,MovieID,Title,Genre 1,Genre 2,Genre 3,Genre 4
0,1,Toy Story (1995),Animation,Children's,Comedy,
1,2,Jumanji (1995),Adventure,Children's,Fantasy,
2,3,Grumpier Old Men (1995),Comedy,Romance,,
3,4,Waiting to Exhale (1995),Comedy,Drama,,
4,5,Father of the Bride Part II (1995),Comedy,,,


In [None]:
# List the distinct values that occur in the first genre slot.
movies['Genre 1'].unique()

array(['Animation', 'Adventure', 'Comedy', 'Action', 'Drama', 'Thriller',
       'Crime', 'Romance', "Children's", 'Documentary', 'Sci-Fi',
       'Horror', 'Western', 'Mystery', 'Film-Noir', 'War', 'Fantasy',
       'Musical'], dtype=object)

In [None]:
# Replace every missing value (the unused genre slots) with the placeholder
# string 'None' so that the label encoder sees a value in every cell.
movies = movies.fillna('None')

In [None]:
# Vocabulary used to fit the label encoder: the 18 genre names plus the
# 'None' placeholder that stands for an empty genre slot.
genres = [
    "Children's",
    'Romance',
    'Drama',
    'None',
    'Crime',
    'Adventure',
    'Horror',
    'Thriller',
    'Comedy',
    'Sci-Fi',
    'War',
    'Mystery',
    'Musical',
    'Film-Noir',
    'Fantasy',
    'Western',
    'Animation',
    'Documentary',
    'Action',
]

In [None]:
# Fit a label encoder on the genre vocabulary; every genre name (and the
# 'None' placeholder) is mapped to an integer class index.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(genres)

In [None]:
# Empty frame that the following cells fill with one encoded column per
# genre slot.
df = pd.DataFrame()

In [None]:
# Integer-encode the first genre slot, then show which codes occur in it.
genre1_codes = label_encoder.transform(movies['Genre 1'])
df['Genre 1'] = genre1_codes
df['Genre 1'].unique()

array([ 2,  1,  4,  0,  7, 16,  5, 14,  3,  6, 15, 10, 18, 12,  9, 17,  8,
       11])

In [None]:
# Integer-encode the second genre slot, then show which codes occur in it.
genre2_codes = label_encoder.transform(movies['Genre 2'])
df['Genre 2'] = genre2_codes
df['Genre 2'].unique()

array([ 3, 14,  7, 13,  5,  1, 10, 16,  4, 15, 17, 12, 11,  9,  8, 18,  2,
        6])

In [None]:
# Integer-encode the third genre slot, then show which codes occur in it.
genre3_codes = label_encoder.transform(movies['Genre 3'])
df['Genre 3'] = genre3_codes
df['Genre 3'].unique()

array([ 4,  8, 13, 16, 14,  7, 11,  5, 17, 12, 15, 18, 10,  3,  2,  9])

In [None]:
# Integer-encode the fourth genre slot, then show which codes occur in it.
genre4_codes = label_encoder.transform(movies['Genre 4'])
df['Genre 4'] = genre4_codes
df['Genre 4'].unique()

array([13, 14, 10, 11,  5, 15, 16,  8, 18,  3,  7,  4, 17, 12])

In [None]:
# Shift every encoded genre up by one so the codes become 1-based.
# Re-running this cell shifts the codes again, so run it exactly once.
shifted_cols = ['Genre 1', 'Genre 2', 'Genre 3', 'Genre 4']
df[shifted_cols] = df[shifted_cols] + 1

In [None]:
# Preview the 1-based genre codes.
df.head()

Unnamed: 0,Genre 1,Genre 2,Genre 3,Genre 4
0,3,4,5,14
1,2,4,9,14
2,5,15,14,14
3,5,8,14,14
4,5,14,14,14


In [None]:
# After the +1 shift, 14 is the code of the 'None' placeholder, i.e. the genre
# slot is empty for that movie. Remap it to 0 in every genre column.
#
# The assignment goes through a single .loc call per column. The previous
# chained form (df[col][mask] = 0) writes into an intermediate object: it
# raises SettingWithCopyWarning and silently changes nothing when pandas
# Copy-on-Write is active.
NONE_CODE = 14
for genre_col in ['Genre 1', 'Genre 2', 'Genre 3', 'Genre 4']:
    df.loc[df[genre_col] == NONE_CODE, genre_col] = 0

In [None]:
# Preview the codes after the 'None' placeholder has been remapped to 0.
df.head()

Unnamed: 0,Genre 1,Genre 2,Genre 3,Genre 4
0,3,4,5,0
1,2,4,9,0
2,5,15,0,0
3,5,8,0,0
4,5,0,0,0


In [None]:
# Attach the four encoded genre columns to `movies` under 'Label G*' names.
# NOTE(review): assigning from a DataFrame aligns on the row index; this
# presumably relies on `movies` and `df` sharing the default 0..n-1 index —
# confirm if either frame is ever filtered or re-indexed before this point.
movies[['Label G1','Label G2','Label G3','Label G4']]=df

In [None]:
# Preview the movies table with the numeric label columns attached.
movies.head()

Unnamed: 0,MovieID,Title,Genre 1,Genre 2,Genre 3,Genre 4,Label G1,Label G2,Label G3,Label G4
0,1,Toy Story (1995),Animation,Children's,Comedy,,3,4,5,0
1,2,Jumanji (1995),Adventure,Children's,Fantasy,,2,4,9,0
2,3,Grumpier Old Men (1995),Comedy,Romance,,,5,15,0,0
3,4,Waiting to Exhale (1995),Comedy,Drama,,,5,8,0,0
4,5,Father of the Bride Part II (1995),Comedy,,,,5,0,0,0


In [None]:
# Build the per-movie feature vectors from the four numeric genre labels and
# compute the pairwise cosine similarity of that matrix with itself.
label_columns = ['Label G1', 'Label G2', 'Label G3', 'Label G4']
feature_matrix = movies[label_columns].values
cosine_sim = cosine_similarity(feature_matrix)

In [None]:
# Show the (abbreviated) movie-by-movie similarity matrix.
print(cosine_sim)

[[1.         0.94282071 0.67082039 ... 0.42426407 0.42426407 0.69249314]
 [0.94282071 1.         0.44052174 ... 0.19900744 0.19900744 0.44486808]
 [0.67082039 0.44052174 1.         ... 0.31622777 0.31622777 0.99303524]
 ...
 [0.42426407 0.19900744 0.31622777 ... 1.         1.         0.42579704]
 [0.42426407 0.19900744 0.31622777 ... 1.         1.         0.42579704]
 [0.69249314 0.44486808 0.99303524 ... 0.42579704 0.42579704 1.        ]]


In [None]:
# Cluster the movies with DBSCAN and keep one integer label per movie.
epsilon = 0.09  # The maximum distance between two samples for one to be considered as in the neighborhood of the other
min_samples = 2  # The number of samples (or total weight) in a neighborhood for a point to be considered as a core point

# NOTE(review): with metric='cosine' DBSCAN treats each *row* of its input as a
# feature vector, but the input here is the already-computed similarity matrix.
# Distances are therefore taken between rows of similarities, not between the
# genre-label vectors themselves. If the latter was intended, fit on
# feature_matrix instead (or pass 1 - cosine_sim with metric='precomputed').
# Confirm before changing: eps was presumably tuned for the current behaviour.
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric='cosine')
dbscan_labels = dbscan.fit_predict(cosine_sim)

In [None]:
# Wrap the cluster labels in a single-column frame (the column is named 0).
df2 = pd.DataFrame(dbscan_labels)

In [None]:
# Store the cluster label, shifted up by one, as the movie's Category.
# DBSCAN marks noise points with -1, so those end up in Category 0.
shifted_labels = df2[0] + 1
movies['Category'] = shifted_labels

In [None]:
# Dropping the numeric label columns is currently disabled (line below), so
# they still appear in this preview alongside the new Category column.
# movies = movies.drop(['Label G1','Label G2','Label G3','Label G4'],axis=1)
movies.head()

Unnamed: 0,MovieID,Title,Genre 1,Genre 2,Genre 3,Genre 4,Label G1,Label G2,Label G3,Label G4,Category
0,1,Toy Story (1995),Animation,Children's,Comedy,,3,4,5,0,1
1,2,Jumanji (1995),Adventure,Children's,Fantasy,,2,4,9,0,1
2,3,Grumpier Old Men (1995),Comedy,Romance,,,5,15,0,0,1
3,4,Waiting to Exhale (1995),Comedy,Drama,,,5,8,0,0,1
4,5,Father of the Bride Part II (1995),Comedy,,,,5,0,0,0,2


In [None]:
# Three independent empty frames, one per exported category.
df_cat0, df_cat1, df_cat2 = (pd.DataFrame() for _ in range(3))

In [None]:
# Slice the catalogue by Category value, keeping only the title and the four
# textual genre columns for each slice.
export_columns = ['Title', 'Genre 1', 'Genre 2', 'Genre 3', 'Genre 4']
df_cat0 = movies.loc[movies['Category'] == 0, export_columns]
df_cat1 = movies.loc[movies['Category'] == 1, export_columns]
df_cat2 = movies.loc[movies['Category'] == 2, export_columns]

In [None]:
# Make sure no 'Category' column is exported. The per-category frames are
# selected with an explicit column list that does not contain 'Category', so
# an unconditional drop raises KeyError on a fresh run. errors='ignore' makes
# this cell a safe no-op in that case (and safe to re-run), while still
# removing the column should the selection ever include it.
df_cat0 = df_cat0.drop(columns=['Category'], errors='ignore')
df_cat1 = df_cat1.drop(columns=['Category'], errors='ignore')
df_cat2 = df_cat2.drop(columns=['Category'], errors='ignore')

In [None]:
# Write each per-category frame to its own CSV file on Drive.
category_exports = (
    (df_cat0, "/content/drive/MyDrive/ml-1m/df_cat0.csv"),
    (df_cat1, "/content/drive/MyDrive/ml-1m/df_cat1.csv"),
    (df_cat2, "/content/drive/MyDrive/ml-1m/df_cat2.csv"),
)
for category_frame, csv_path in category_exports:
    category_frame.to_csv(csv_path)