In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import SimpleImputer
import joblib

In [3]:
# Read the anime table from the CSV in the working directory, then preview
# its first row so the available columns can be inspected.
data = pd.read_csv('anime_filtered.csv')
first_row = data.iloc[0]
print("Initial data sample:", first_row, sep="\n")

Initial data sample:
anime_id                                                      11013
title                                                 Inu x Boku SS
title_english                             Inu X Boku Secret Service
title_japanese                                               妖狐×僕SS
title_synonyms                                      Youko x Boku SS
image_url         https://myanimelist.cdn-dena.com/images/anime/...
type                                                             TV
source                                                        Manga
episodes                                                         12
status                                              Finished Airing
airing                                                        False
aired_string                           Jan 13, 2012 to Mar 30, 2012
aired                    {'from': '2012-01-13', 'to': '2012-03-30'}
duration                                            24 min. per ep.
rating                     

In [4]:
# Columns that are not used as model features and are removed from `data`.
columns_to_drop = [
    # identifier, duration text, and title variants
    'anime_id', 'duration', 'title', 'title_english', 'title_japanese', 'title_synonyms',
    # image link, rank, source material, and airing information
    'image_url', 'rank', 'source', 'status', 'airing', 'aired', 'aired_string',
    # remaining descriptive / production fields
    'background', 'premiered', 'broadcast', 'related', 'producer',
    'licensor', 'opening_theme', 'ending_theme',
    # audience counters
    'members', 'favorites', 'scored_by',
]

# Rebind `data` to a copy without those columns; a missing name raises KeyError.
data = data.drop(columns_to_drop, axis='columns')
print("Data sample after dropping unnecessary columns:", data.iloc[0], sep="\n")


Data sample after dropping unnecessary columns:
type                                              TV
episodes                                          12
rating                     PG-13 - Teens 13 or older
score                                           7.63
popularity                                       231
studio                              David Production
genre         Comedy, Supernatural, Romance, Shounen
Name: 0, dtype: object


In [5]:
# Multi-hot encode the genres: each 'genre' cell is a ', '-separated list, and
# Series.str.get_dummies yields one 0/1 indicator column per distinct genre.
genre_dummies = data['genre'].str.get_dummies(sep=', ')

# Swap the raw 'genre' text column for the indicator columns, which are
# appended to the right of the remaining columns.
data = pd.concat([data.drop(columns=['genre']), genre_dummies], axis=1)

# Column groups routed to each branch of the preprocessor.
numerical_features = ['episodes', 'score', 'popularity']
categorical_features = ['type', 'rating', 'studio']

# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches. Every column not listed above is passed through
# unchanged (remainder='passthrough').
feature_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(feature_branches, remainder='passthrough')

# Cosine-distance neighbour index that returns 10 neighbours per query by default.
neighbor_index = NearestNeighbors(n_neighbors=10, metric='cosine')

# Bundle the preprocessor and the neighbour index under the step names
# 'preprocessor' and 'nearestneighbors'.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('nearestneighbors', neighbor_index),
])

# Fit the two steps by hand rather than via model_pipeline.fit(): the transformed
# matrix is kept as X_preprocessed, and the neighbour index is fitted on it.
# The pipeline holds these same objects, so its steps are fitted as a result.
X_preprocessed = preprocessor.fit_transform(data)
neighbor_index.fit(X_preprocessed)

In [6]:
# Persist the fitted pipeline and the transformed feature matrix to the working
# directory. The second call stays last so its return value remains the cell output.
joblib.dump(value=model_pipeline, filename='anime_recommendation_model.pkl')
joblib.dump(value=X_preprocessed, filename='X_preprocessed.pkl')


['X_preprocessed.pkl']