In [3]:
import pandas as pd

In [4]:
data_path = '/Users/Dakota/Downloads/ml-100k/u.data'
column_name = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_df = pd.read_csv(data_path, sep='\t', names=column_name)

print("first 5 rows of the rating data:")
print(rating_df)

first 5 rows of the rating data:
       user_id  movie_id  rating  timestamp
0          196       242       3  881250949
1          186       302       3  891717742
2           22       377       1  878887116
3          244        51       2  880606923
4          166       346       1  886397596
...        ...       ...     ...        ...
99995      880       476       3  880175444
99996      716       204       5  879795543
99997      276      1090       1  874795795
99998       13       225       2  882399156
99999       12       203       3  879959583

[100000 rows x 4 columns]


In [10]:
print("Data overview:")
print(rating_df.info())
print("\nSummery statistics:")
print(rating_df.describe())
print("Rating Distribution:")
print(rating_df['rating'].value_counts())
print("\nMissing value in dataset")
print(rating_df.isnull().sum())


Data overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   movie_id   100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None

Summery statistics:
            user_id       movie_id         rating     timestamp
count  100000.00000  100000.000000  100000.000000  1.000000e+05
mean      462.48475     425.530130       3.529860  8.835289e+08
std       266.61442     330.798356       1.125674  5.343856e+06
min         1.00000       1.000000       1.000000  8.747247e+08
25%       254.00000     175.000000       3.000000  8.794487e+08
50%       447.00000     322.000000       4.000000  8.828269e+08
75%       682.00000     631.000000       4.000000  8.882600e+08
max       943.00000    1682.000000       5.000000  8.932866e+08
Ra

In [14]:
movie_path = '/Users/Dakota/Downloads/ml-100k/u.item'
movie_columns = ['movie_id', 'title']
movies_df = pd.read_csv(movie_path, sep='|', header=None , encoding='latin-1', usecols=[0,1], names=movie_columns)

print("\nFirst 5 Rows of the Movies Data:")
print(movies_df.head())

rating_df = pd.merge(rating_df, movies_df, on='movie_id')

print("\nMerged DataFrame with movie title:")
print(rating_df.head())


First 5 Rows of the Movies Data:
   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)

Merged DataFrame with movie title:
   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)


In [15]:
user_item_matrix = rating_df.pivot_table(index='user_id', columns='title', values='rating')

print("\nUser-item interaction matrix (first 5 rows):")
print(user_item_matrix.head())

print("\nMatrix Shape (Number of Users, Number of Movies):")
print(user_item_matrix.shape)

user_item_matrix_filled = user_item_matrix.fillna(0)

print("\nUser-Item Interaction Matrix with NaN Values Filled (First 5 Rows):")
print(user_item_matrix_filled.head())


User-item interaction matrix (first 5 rows):
title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              NaN           NaN                    2.0   
2                              NaN           NaN                    NaN   
3                              NaN           NaN                    NaN   
4                              NaN           NaN                    NaN   
5                              NaN           NaN                    2.0   

title    12 Angry Men (1957)  187 (1997)  2 Days in the Valley (1996)  \
user_id                                                                 
1                        5.0         NaN                          NaN   
2                        NaN         NaN                          NaN   
3                        NaN         2.0                          NaN   
4                        NaN         NaN                       

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_item_matrix_filled)

user_similarity_df = pd.DataFrame(user_similarity,index=user_item_matrix.index, columns=user_item_matrix.index)

print("\nUser similarity matrix:")
print(user_similarity_df.iloc[:5, :5])


User similarity matrix:
user_id         1         2         3         4         5
user_id                                                  
1        1.000000  0.168937  0.048388  0.064561  0.379670
2        0.168937  1.000000  0.113393  0.179694  0.073623
3        0.048388  0.113393  1.000000  0.349781  0.021592
4        0.064561  0.179694  0.349781  1.000000  0.031804
5        0.379670  0.073623  0.021592  0.031804  1.000000
