## Data Preprocessing

In [1]:
# ===============================
# 1. Imports
# ===============================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## Load and Explore Data

In [2]:
anime_df = pd.read_csv(r'D:\#Great Learning\MY\Project\Anime_Recommendation_System\anime_dataset\anime.csv')
rating_df = pd.read_csv(r'D:\#Great Learning\MY\Project\Anime_Recommendation_System\anime_dataset\rating.csv')

In [3]:
print("Dataset shapes:")
print(f"Anime: {anime_df.shape}")
print(f"Ratings: {rating_df.shape}")

Dataset shapes:
Anime: (12294, 7)
Ratings: (7813737, 3)


In [4]:
# Display first few rows
print("\n--- Anime Dataset ---")
anime_df.head()


--- Anime Dataset ---


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
print("\n--- Rating Dataset ---")
rating_df.head()


--- Rating Dataset ---


Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [7]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [8]:
# Basic statistics
print('\n---- Anime Statistics ----')
anime_df.describe()


---- Anime Statistics ----


Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [9]:
print('\n--- Rating Statistics ----')
rating_df.describe()


--- Rating Statistics ----


Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [10]:
# Check for missing values
print("\n--- Missing Values ---")
anime_df.isnull().sum()

# genre has 62, type 25, rating 230 null values


--- Missing Values ---


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [11]:
rating_df.isnull().sum()
# rating has 0 null values

user_id     0
anime_id    0
rating      0
dtype: int64

## Data Preprocessing

### Enhanced data preprocessing for anime recommendation system



### Improved anime data preprocessing     
    Key improvements:
        - Impute missing ratings with global median (more robust than mean)
        - Handle missing genres more intelligently
        - Extract year from anime names if available
        - Create genre binary matrix for better content-based filtering
        
        




In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

class AnimeDataPreprocessor:
    """Enhanced data preprocessing for anime recommendation system"""
    
    def __init__(self):
        self.genre_encoder = LabelEncoder()
        self.type_encoder = LabelEncoder()
        
    def preprocess_anime_data(self, anime_df):
        """
        Improved anime data preprocessing
        
        Key improvements:
        - Impute missing ratings with global median (more robust than mean)
        - Handle missing genres more intelligently
        - Extract year from anime names if available
        - Create genre binary matrix for better content-based filtering
        """
        anime_clean = anime_df.copy()
        
        # 1. Handle missing ratings with median (more robust to outliers)
        global_rating_median = anime_clean['rating'].median()
        anime_clean['rating'].fillna(global_rating_median, inplace=True)
        
        # 2. Intelligent genre handling
        # Instead of 'unknown', use most common genre for that type
        def impute_genre_by_type(row):
            if pd.isna(row['genre']) or row['genre'] == '':
                anime_type = row['type']
                # Get most common genre for this type
                common_genre = anime_clean[
                    (anime_clean['type'] == anime_type) & 
                    (anime_clean['genre'].notna())
                ]['genre'].mode()
                return common_genre[0] if len(common_genre) > 0 else 'Action'
            return row['genre']
        
        anime_clean['genre'] = anime_clean.apply(impute_genre_by_type, axis=1)
        anime_clean['type'].fillna('unknown', inplace=True)
        
        # 3. Create multi-hot encoded genre matrix
        anime_clean['genre_list'] = anime_clean['genre'].str.split(', ')
        
        # Get all unique genres
        all_genres = set()
        for genres in anime_clean['genre_list'].dropna():
            all_genres.update(genres)
        
        # Create binary columns for each genre
        for genre in all_genres:
            anime_clean[f'genre_{genre}'] = anime_clean['genre_list'].apply(
                lambda x: 1 if genre in x else 0
            )
        
        # 4. Extract metadata features
        anime_clean['episodes_numeric'] = pd.to_numeric(
            anime_clean['episodes'], errors='coerce'
        )
        anime_clean['episodes_numeric'].fillna(
            anime_clean['episodes_numeric'].median(), inplace=True
        )
        
        # 5. Create popularity score
        anime_clean['popularity_score'] = (
            anime_clean['rating'] * np.log1p(anime_clean['members'])
        )
        
        return anime_clean
    
    def preprocess_ratings(self, rating_df, min_user_ratings=5, min_anime_ratings=10):
        """
        Improved ratings preprocessing
        
        Key improvements:
        - Keep -1 ratings as implicit feedback (user watched but didn't rate)
        - Filter out users/anime with too few interactions
        - Create explicit vs implicit feedback flags
        """
        rating_clean = rating_df.copy()
        
        # 1. Separate explicit ratings (-1 = implicit feedback)
        rating_clean['is_explicit'] = (rating_clean['rating'] != -1).astype(int)
        rating_clean['rating_explicit'] = rating_clean['rating'].where(
            rating_clean['rating'] != -1, np.nan
        )
        
        # 2. Filter sparse users and anime
        user_counts = rating_clean.groupby('user_id').size()
        anime_counts = rating_clean.groupby('anime_id').size()
        
        valid_users = user_counts[user_counts >= min_user_ratings].index
        valid_anime = anime_counts[anime_counts >= min_anime_ratings].index
        
        rating_clean = rating_clean[
            (rating_clean['user_id'].isin(valid_users)) &
            (rating_clean['anime_id'].isin(valid_anime))
        ]
        
        # 3. Create user and anime statistics
        user_stats = rating_clean.groupby('user_id').agg({
            'rating_explicit': ['mean', 'std', 'count']
        }).reset_index()
        user_stats.columns = ['user_id', 'user_avg_rating', 'user_rating_std', 'user_num_ratings']
        
        anime_stats = rating_clean.groupby('anime_id').agg({
            'rating_explicit': ['mean', 'std', 'count']
        }).reset_index()
        anime_stats.columns = ['anime_id', 'anime_avg_rating', 'anime_rating_std', 'anime_num_ratings']
        
        rating_clean = rating_clean.merge(user_stats, on='user_id', how='left')
        rating_clean = rating_clean.merge(anime_stats, on='anime_id', how='left')
        
        # 4. Normalize ratings by user bias
        rating_clean['rating_normalized'] = (
            rating_clean['rating_explicit'] - rating_clean['user_avg_rating']
        )
        
        return rating_clean
    
    def create_train_test_split(self, rating_df, test_size=0.2, temporal_split=True):
        """
        Create train/test split with temporal awareness if possible
        
        temporal_split: If True, split by time (recent interactions in test)
                       If False, random split
        """
        if temporal_split and 'timestamp' in rating_df.columns:
            # Sort by timestamp and split
            rating_sorted = rating_df.sort_values('timestamp')
            split_idx = int(len(rating_sorted) * (1 - test_size))
            train_data = rating_sorted.iloc[:split_idx]
            test_data = rating_sorted.iloc[split_idx:]
        else:
            # Random split but maintain user representation
            from sklearn.model_selection import train_test_split
            train_data, test_data = train_test_split(
                rating_df, test_size=test_size, random_state=42, 
                stratify=rating_df['user_id'] if len(rating_df['user_id'].unique()) > 1 else None
            )
        
        return train_data, test_data

# Usage example
preprocessor = AnimeDataPreprocessor()

# Load data (assuming anime_df and rating_df are loaded)
# anime_clean = preprocessor.preprocess_anime_data(anime_df)
# rating_clean = preprocessor.preprocess_ratings(rating_df)
# train_data, test_data = preprocessor.create_train_test_split(rating_clean)

In [34]:
preprocessor = AnimeDataPreprocessor()


In [35]:

anime_clean = preprocessor.preprocess_anime_data(anime_df)
print(anime_clean.shape)
anime_clean.head()


(12294, 53)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,genre_list,genre_Fantasy,genre_Harem,...,genre_Sci-Fi,genre_Vampire,genre_Supernatural,genre_Martial Arts,genre_Shoujo,genre_Cars,genre_Mecha,genre_Horror,episodes_numeric,popularity_score
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,"[Drama, Romance, School, Supernatural]",0,0,...,0,0,1,0,0,0,0,0,1.0,114.400417
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",1,0,...,0,0,0,0,0,0,0,0,64.0,125.791711
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,"[Action, Comedy, Historical, Parody, Samurai, ...",0,0,...,1,0,0,0,0,0,0,0,51.0,107.727887
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,"[Sci-Fi, Thriller]",0,0,...,1,0,0,0,0,0,0,0,24.0,123.064625
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,"[Action, Comedy, Historical, Parody, Samurai, ...",0,0,...,1,0,0,0,0,0,0,0,51.0,109.249504


In [36]:
rating_clean = preprocessor.preprocess_ratings(rating_df)
print(rating_clean.shape)
rating_clean.head()

(7788750, 12)


Unnamed: 0,user_id,anime_id,rating,is_explicit,rating_explicit,user_avg_rating,user_rating_std,user_num_ratings,anime_avg_rating,anime_rating_std,anime_num_ratings,rating_normalized
0,1,20,-1,0,,10.0,0.0,4,7.878229,1.489839,21639,
1,1,24,-1,0,,10.0,0.0,4,8.236913,1.332307,5960,
2,1,79,-1,0,,10.0,0.0,4,7.563742,1.569705,5836,
3,1,226,-1,0,,10.0,0.0,4,8.05791,1.632539,23364,
4,1,241,-1,0,,10.0,0.0,4,6.928869,1.649199,3360,


In [37]:
train_data, test_data = preprocessor.create_train_test_split(rating_clean)

In [38]:
anime_clean.isnull().sum()

anime_id               0
name                   0
genre                  0
type                   0
episodes               0
rating                 0
members                0
genre_list             0
genre_Fantasy          0
genre_Harem            0
genre_Drama            0
genre_Magic            0
genre_Adventure        0
genre_Demons           0
genre_Shounen Ai       0
genre_Shounen          0
genre_Yuri             0
genre_Thriller         0
genre_Seinen           0
genre_Music            0
genre_School           0
genre_Game             0
genre_Historical       0
genre_Mystery          0
genre_Psychological    0
genre_Yaoi             0
genre_Kids             0
genre_Sports           0
genre_Comedy           0
genre_Hentai           0
genre_Samurai          0
genre_Military         0
genre_Parody           0
genre_Ecchi            0
genre_Josei            0
genre_Action           0
genre_Police           0
genre_Shoujo Ai        0
genre_Super Power      0
genre_Dementia         0


In [39]:
ratings_filtered = preprocessor.preprocess_ratings(
    rating_df,
    min_user_ratings=5,
    min_anime_ratings=10
)

print(ratings_filtered.shape)
ratings_filtered.head()


(7788750, 12)


Unnamed: 0,user_id,anime_id,rating,is_explicit,rating_explicit,user_avg_rating,user_rating_std,user_num_ratings,anime_avg_rating,anime_rating_std,anime_num_ratings,rating_normalized
0,1,20,-1,0,,10.0,0.0,4,7.878229,1.489839,21639,
1,1,24,-1,0,,10.0,0.0,4,8.236913,1.332307,5960,
2,1,79,-1,0,,10.0,0.0,4,7.563742,1.569705,5836,
3,1,226,-1,0,,10.0,0.0,4,8.05791,1.632539,23364,
4,1,241,-1,0,,10.0,0.0,4,6.928869,1.649199,3360,


In [40]:
# train_df, test_df = preprocessor.create_train_test_split(
#     ratings_filtered,
#     test_size=0.2,
#     temporal_split=True  # set False if no timestamp
# )

# print("Train:", train_df.shape)
# print("Test :", test_df.shape)


In [41]:
# train_final = preprocessor.add_train_statistics(
#     train_df=train_df,
#     full_df=train_df
# )

# test_final = preprocessor.add_train_statistics(
#     train_df=train_df,
#     full_df=test_df
# )


In [42]:
# train_final = train_final.merge(
#     anime_clean,
#     on='anime_id',
#     how='left'
# )

# test_final = test_final.merge(
#     anime_clean,
#     on='anime_id',
#     how='left'
# )


In [43]:
# mf_features = [
#     'user_id',
#     'anime_id',
#     'rating_normalized',
#     'confidence'
# ]


In [44]:
# rank_features = [
#     'user_avg_rating',
#     'anime_avg_rating',
#     'bayesian_rating',
#     'episodes_numeric',
#     'confidence',
#     'cold_user',
#     'cold_anime'
# ] + [c for c in train_final.columns if c.startswith('genre_')]
