# Collarative Filtering - On Yuen Shern

## Import libraries

In [79]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import numpy as np

## Read CSV

In [110]:
# Why using ISO-8859-1?
# Having invalid start byte (not utf-8 caharacters) 
reviews = pd.read_csv("data/reviews.csv", encoding="ISO-8859-1")
print(reviews)

animes = pd.read_csv("data/animes.csv", encoding="utf-8")
print(animes)

           uid          profile anime_uid rating  \
0       255938   DesolatePsyche     34096      8   
1       259117        baekbeans     34599     10   
2       253664             skrn     28891      7   
3         8254     edgewalker00      2904      9   
4       291149  aManOfCulture99      4181     10   
...        ...              ...       ...    ...   
101456  218144           iankki     25013      9   
101457   29009              Elh      6634     10   
101458   30106     brass2themax      6634      9   
101459  304505           FrozYn      4548      2   
101460  262452        SmokyChip     34561      8   

                                                   scores  \
0       {'Overall': '8', 'Story': '8', 'Animation': '8...   
1       {'Overall': '10', 'Story': '10', 'Animation': ...   
2       {'Overall': '7', 'Story': '7', 'Animation': '9...   
3       {'Overall': '9', 'Story': '9', 'Animation': '9...   
4       {'Overall': '10', 'Story': '10', 'Animation': ...   
...      

In [90]:
reviews = reviews[['profile', 'anime_uid', 'rating']].dropna()
print(reviews)

                profile anime_uid rating
0        DesolatePsyche     34096      8
1             baekbeans     34599     10
2                  skrn     28891      7
3          edgewalker00      2904      9
4       aManOfCulture99      4181     10
...                 ...       ...    ...
101456           iankki     25013      9
101457              Elh      6634     10
101458     brass2themax      6634      9
101459           FrozYn      4548      2
101460        SmokyChip     34561      8

[99943 rows x 3 columns]


## Check Data Format

In [99]:
reviews['anime_uid'] = reviews['anime_uid'].astype(str)
reviews['rating'] = pd.to_numeric(reviews['rating'], errors='coerce')
reviews = reviews.dropna(subset=['rating'])

print(reviews['anime_uid'])
print("-----------------------------------------------------")
print(reviews['rating'])
print("-----------------------------------------------------")
print(reviews)

0         34096
1         34599
2         28891
3          2904
4          4181
          ...  
101456    25013
101457     6634
101458     6634
101459     4548
101460    34561
Name: anime_uid, Length: 99618, dtype: object
-----------------------------------------------------
0          8.0
1         10.0
2          7.0
3          9.0
4         10.0
          ... 
101456     9.0
101457    10.0
101458     9.0
101459     2.0
101460     8.0
Name: rating, Length: 99618, dtype: float64
-----------------------------------------------------
           uid          profile anime_uid  rating  \
0       255938   DesolatePsyche     34096     8.0   
1       259117        baekbeans     34599    10.0   
2       253664             skrn     28891     7.0   
3         8254     edgewalker00      2904     9.0   
4       291149  aManOfCulture99      4181    10.0   
...        ...              ...       ...     ...   
101456  218144           iankki     25013     9.0   
101457   29009              Elh      

## Create each user data

In [100]:
user_item_matrix = reviews.pivot_table(
    index='profile',
    columns='anime_uid',
    values='rating'
).fillna(0)

print("Matrix shape:", user_item_matrix.shape)

Matrix shape: (36544, 4036)


## Calculate Similarity

In [101]:
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)

print(item_similarity_df)

anime_uid         1      1000     10012     10014      1002     10029  \
anime_uid                                                               
1          1.000000  0.038940  0.009359  0.034822  0.012221  0.029365   
1000       0.038940  1.000000  0.000000  0.000000  0.000000  0.000000   
10012      0.009359  0.000000  1.000000  0.000000  0.013010  0.000000   
10014      0.034822  0.000000  0.000000  1.000000  0.000000  0.000000   
1002       0.012221  0.000000  0.013010  0.000000  1.000000  0.000000   
...             ...       ...       ...       ...       ...       ...   
9982       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9988       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9989       0.028265  0.032752  0.038412  0.000000  0.008034  0.003482   
9990       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9996       0.035741  0.000000  0.000000  0.194476  0.000000  0.000000   

anime_uid     1003     10030  10043     10049  ...

## Make connect map from review to animes

In [115]:
# To get anime title by using UID
anime_map = dict(zip(animes['uid'].astype(str), animes['title']))

## Recommender Function

In [118]:
def recommend_items(user_id, n=10):
    if user_id not in user_item_matrix.index:
        return "User not found"
    
    user_ratings = user_item_matrix.loc[user_id]
    scores = item_similarity_df.dot(user_ratings) / item_similarity_df.sum(axis=1)
    
    # Filter out items the user has already rated
    scores = scores[user_ratings == 0]
    
    # Take Top N
    top_scores = scores.sort_values(ascending=False).head(n)
    
    # Format recommendations nicely
    recommendations = []
    for rank, (anime_id, score) in enumerate(top_scores.items(), start=1):
        title = anime_map.get(anime_id, anime_id)
        recommendations.append(f"{rank}. {title} (Score: {score:.2f})")
    
    return "\n".join(recommendations)

# Example
recommendations = recommend_items("DesolatePsyche", 10)
print("Top 10 recommendations:\n" + recommendations)

Top 10 recommendations:
1. To LOVE-Ru Darkness 2nd Specials (Score: 1.64)
2. Kinnikuman II Sei (Score: 1.61)
3. Kodomo no Jikan OVA (Score: 1.45)
4. Magical Moe (Score: 1.30)
5. Junai Maniac (Score: 1.28)
6. Jungle Book Shounen Mowgli (Score: 1.26)
7. Yuru Yuri Nachuyachumi!+ (Score: 1.01)
8. Oni Chichi 2: Revenge (Score: 0.94)
9. Itsuka Aeru Kimi ni (Score: 0.89)
10. Happy Party Train (Score: 0.80)
