In [9]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   - -------------------------------------- 0.5/10.7 MB 5.6 MB/s eta 0:00:02
   --- ------------------------------------ 1.0/10.7 MB 2.6 MB/s eta 0:00:04
   ---- ----------------------------------- 1.3/10.7 MB 3.0 MB/s eta 0:00:04
   ----- ---------------------------------- 1.6/10.7 MB 2.0 MB/s eta 0:00:05
   ------ --------------------------------- 1.8/10.7 MB 1.8 MB/s eta 0:00:05
   ---------- ---------------------

In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# ---- Load the raw tables ----
# Column labels supplied by hand for the tab-separated ratings file.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# Column labels for the pipe-separated movie file: five descriptive fields
# followed by nineteen category columns.
movies_cols = (
    ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
    + ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
       'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)

ratings_df = pd.read_csv(
    "u.data",
    sep='\t',
    names=ratings_cols,
    encoding='latin-1',
)
movies_df = pd.read_csv(
    "u.item",
    sep='|',
    names=movies_cols,
    encoding='latin-1',
)

# ---- Attach the title to every rating row (default inner join on movie_id) ----
merged_df = ratings_df.merge(movies_df[['movie_id', 'title']], on='movie_id')

# ---- Per-title summary: mean rating and number of ratings ----
ratings_summary = (
    merged_df
    .groupby('title')['rating']
    .agg(average_rating='mean', num_ratings='count')
    .reset_index()
)

# ---- User x title rating matrix ----
# pivot_table averages repeated (user, title) pairs; pairs with no rating become 0.
user_movie_matrix = (
    merged_df
    .pivot_table(index='user_id', columns='title', values='rating')
    .fillna(0)
)

# ---- Title-to-title cosine similarity ----
# Transposing puts one title per row, so the pairwise similarity is between titles.
sparse_matrix = csr_matrix(user_movie_matrix.to_numpy())
movie_similarity = cosine_similarity(sparse_matrix.transpose())
similarity_df = pd.DataFrame(
    data=movie_similarity,
    index=user_movie_matrix.columns,
    columns=user_movie_matrix.columns,
)

# Recommend movies
def recommend_movies(movie_name, num_recommendations=5, similarity=None):
    """Return the titles most similar to ``movie_name``, best match first.

    Parameters
    ----------
    movie_name : str
        Title to look up. It must match a column label of the similarity
        table exactly.
    num_recommendations : int, default 5
        Maximum number of titles to return. Values below zero are treated as 0.
    similarity : pandas.DataFrame, optional
        Title-by-title similarity table to query. When omitted, the
        notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series or str
        Similarity scores indexed by title, sorted in descending order, with
        ``movie_name`` itself excluded. If ``movie_name`` is not a column of
        the table, the string ``"Movie '<movie_name>' not found."`` is
        returned instead.
    """
    table = similarity_df if similarity is None else similarity
    if movie_name not in table:
        return f"Movie '{movie_name}' not found."

    # Exclude the queried title by label rather than by slicing off the first
    # sorted row: if another title ties with it at the top score, the query is
    # not guaranteed to sort first and could be returned as its own match.
    scores = table[movie_name].drop(labels=movie_name, errors="ignore")
    return scores.sort_values(ascending=False).head(max(num_recommendations, 0))
         

In [14]:
# Closest matches for one title, using the default number of recommendations.
recommend_movies(movie_name="Star Wars (1977)")

title
Return of the Jedi (1983)          0.884476
Raiders of the Lost Ark (1981)     0.764885
Empire Strikes Back, The (1980)    0.749819
Toy Story (1995)                   0.734572
Godfather, The (1972)              0.697332
Name: Star Wars (1977), dtype: float64

In [16]:
# Export for Power BI: one CSV with the per-rating rows (three columns only)
# and one with the per-title summary; the index is left out of both files.
merged_df.to_csv(
    "merged_movie_ratings.csv",
    columns=['user_id', 'title', 'rating'],
    index=False,
)
ratings_summary.to_csv(path_or_buf="movie_summary.csv", index=False)


In [18]:
import os

# Print the names of all entries in the current working directory.
print(os.listdir(os.curdir))


['.anaconda', '.android', '.conda', '.condarc', '.continuum', '.idlerc', '.ipynb_checkpoints', '.ipython', '.jupyter', '.opera', '.vscode', '1.py', '12.jpynb', 'anaconda3', 'ANJU.ipynb', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'Documents', 'Downloads', 'Favorites', 'Links', 'Local Settings', 'merged_movie_ratings.csv', 'Microsoft', 'movie_summary.csv', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{31fec68c-032d-11f0-96aa-98b52715dc39}.TM.blf', 'NTUSER.DAT{31fec68c-032d-11f0-96aa-98b52715dc39}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{31fec68c-032d-11f0-96aa-98b52715dc39}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'PrintHood', 'Project2.ipynb', 'Recent', 'Saved Games', 'Searches', 'SendTo', 'Start Menu', 'student.db', 'Templates', 'u.data', 'u.item', 'Untitled.ipynb', 'Untitled1.ipynb', 'Videos', 'WPS Cloud Files']


In [20]:
 # Echo the two export file names; this is a literal list, nothing is read from disk.
 print(["merged_movie_ratings.csv", "movie_summary.csv"])

['merged_movie_ratings.csv', 'movie_summary.csv']
