<a href="https://colab.research.google.com/github/Abhiramias09/Anime_recommendation-system-by-LGBMRanker/blob/main/miniproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from lightgbm import LGBMRanker
import pickle

# Unzip the dataset
zipped_data = zipfile.ZipFile("/content/anime-recommendation-ltr-dataset.zip")

# Load the datasets
anime_info_df = pd.read_csv(zipped_data.open('anime_info.csv'))
relavence_scores = pd.read_csv(zipped_data.open('relavence_scores.csv'))
user_info = pd.read_csv(zipped_data.open('user_info.csv'))

# Define popular genres
popular_genres = ['Comedy', 'Action', 'Fantasy', 'Adventure', 'Kids', 'Drama', 'Sci-Fi', 'Music', 'Shounen', 'Slice of Life']

# Function to create genre flags
def create_genre_flags(df, popular_genres):
    df = df.dropna(subset=['Genres'])
    df['Genres'] = df['Genres'].apply(lambda x: ",".join(s.strip() for s in x.split(",")))
    mlb = MultiLabelBinarizer()
    genre_df = pd.DataFrame(mlb.fit_transform(df['Genres'].str.split(',')),
                            columns=mlb.classes_,
                            index=df.index)
    new_df = pd.concat([df['anime_id'], genre_df[popular_genres]], axis=1)
    new_df.columns = ['anime_id'] + popular_genres
    return new_df

# Create genre flags and merge with anime info
anime_genre_info_df = create_genre_flags(anime_info_df, popular_genres)
anime_info_df_final = anime_info_df.merge(anime_genre_info_df, on='anime_id')
anime_info_df_final.columns = [col if col == 'anime_id' else f"ANIME_FEATURE {col}".upper() for col in anime_info_df_final.columns]

# Rename user info columns
user_info.columns = [col if col == 'user_id' else f"USER_FEATURE {col}".upper() for col in user_info.columns]

# Merge dataframes
train_interim = relavence_scores.merge(anime_info_df_final)
train = train_interim.merge(user_info, how='inner')

# Drop columns with more than 50% missing values
na_counts = (train.isna().sum() * 100 / len(train))
train_processed = train.drop(na_counts[na_counts > 50].index, axis=1)

# Sort by user_id
train_processed.sort_values(by='user_id', inplace=True)

# Define features and target
features = ['anime_id', 'user_id', 'ANIME_FEATURE IS_TV',
       'ANIME_FEATURE YEAR_AIRED', 'ANIME_FEATURE IS_ADULT',

       'ANIME_FEATURE ABOVE_FIVE_STAR_RATINGS',
       'ANIME_FEATURE ABOVE_FIVE_STAR_RATIO', 'ANIME_FEATURE COMEDY',
       'ANIME_FEATURE ACTION', 'ANIME_FEATURE FANTASY',
       'ANIME_FEATURE ADVENTURE','ANIME_FEATURE DRAMA',
       'ANIME_FEATURE SCI-FI', 'ANIME_FEATURE MUSIC', 'ANIME_FEATURE SHOUNEN',
       'ANIME_FEATURE SLICE OF LIFE', 'USER_FEATURE REVIEW_COUNT',
       'USER_FEATURE AVG_SCORE', 'USER_FEATURE SCORE_STDDEV',
       'USER_FEATURE ABOVE_FIVE_STAR_COUNT',
       'USER_FEATURE ABOVE_FIVE_STAR_RATIO']
target = 'relavence_score'

# Function to limit the number of entries per user to 10,000
def limit_entries_per_user(df, limit=10000):
    limited_df = df.groupby('user_id').apply(lambda x: x.head(limit)).reset_index(drop=True)
    return limited_df

# Apply the limit to the training data
train_processed_limited = limit_entries_per_user(train_processed)

# Split data into training and test sets
test_size = int(1e5)
X, y = train_processed_limited[features], train_processed_limited[target].apply(lambda x: int(x * 10))
test_idx_start = len(X) - test_size

# Extract user_id before splitting
xtrain, xtest, ytrain, ytest = X.iloc[0:test_idx_start], X.iloc[test_idx_start:], y.iloc[0:test_idx_start], y.iloc[test_idx_start:]

# Function to get group sizes
def get_group_size(df):
    return df.groupby('user_id').size()

# Get group sizes
train_groups = get_group_size(train_processed_limited.iloc[0:test_idx_start])
test_groups = get_group_size(train_processed_limited.iloc[test_idx_start:])

print(sum(train_groups), sum(test_groups))
# (4764372, 100000)

# Train the model
model = LGBMRanker(objective="lambdarank")
model.fit(xtrain, ytrain, group=train_groups.tolist(), eval_set=[(xtest, ytest)], eval_group=[test_groups.tolist()], eval_metric=['ndcg'])

#save the model
with open('anime_recommendation_model.pkl', 'wb') as f:
    pickle.dump(model, f)
import numpy as np
user_2_anime_df = relavence_scores.groupby("user_id").agg({"anime_id":lambda x:list(set(x))})
user_2_anime_map = dict(zip(user_2_anime_df.index,user_2_anime_df['anime_id']))

#create candidate pool, this will be a all the animes in the database
candidate_pool = anime_info_df_final['anime_id'].unique().tolist()

#anime_id to it's name mapping
anime_id_2_name = relavence_scores.drop_duplicates(subset=["anime_id","Name"])[['anime_id',"Name"]]
anime_id_2_name_map = dict(zip(anime_id_2_name['anime_id'],anime_id_2_name['Name']))

def candidate_generation(user_id:int,candidate_pool:list,user_2_anime_map:dict,N:int):
    """
    Note: this a totally random generation, only for demo purpose
    Generates a list of N anime candidates for a given user based on their previously liked animes.

    Parameters:
        user_id (int): The user's ID.
        candidate_pool (list): A list of all possible anime candidates.
        user_2_anime_map (dict): A dictionary that maps users to their liked animes.
        N (int): The number of anime candidates to generate.

    Returns:
        already_interacted (list): List of animes which user already liked
        candidates (list): A list of N anime candidates for the user.
    """

    #get the already liked animes
    already_interacted = user_2_anime_map[user_id]

    #candidates will be rest of animes which are not exposed to user
    candidates = list(set(candidate_pool) - set(already_interacted))

    return already_interacted,np.random.choice(candidates,size=N)

def generate_predictions(user_id,user_2_anime_map,candidate_pool,feature_columns,anime_id_2_name_map,ranker,N=100):
    """
    Generates predictions for anime recommendations for a given user.

    Parameters:
        user_id (int): The user's ID.
        user_2_anime_map (dict): A dictionary that maps users to their liked animes.
        candidate_pool (list): A list of all possible anime candidates.
        feature_columns (list): A list of feature columns to use for generating predictions.
        anime_id_2_name_map (dict): A dictionary that maps anime IDs to their names.
        ranker (object): A trained model object that is used to generate predictions.
        N (int): The number of anime predictions to generate.

    Returns:
        predictions (DataFrame): A dataframe containing the top N anime recommendations for the user.
    """
    already_liked,candidates = candidate_generation(user_id,candidate_pool,user_2_anime_map,N=10000)

    #Create dataframe for candidates
    candidates_df = pd.DataFrame(data=pd.Series(candidates,name='anime_id'))

    # Merge with feature dataframe
    features = anime_info_df_final.merge(candidates_df)

    #Add user id as a feature
    features['user_id'] = user_id

    # Merge with user information
    features = features.merge(user_info)

    # If number of already liked animes is less than number of candidates
    # Extend the already liked list with -1
    already_liked = list(already_liked)
    if len(already_liked) < len(candidates):
        append_list = np.full(fill_value=-1,shape=(len(candidates)-len(already_liked)))
        already_liked.extend(list(append_list))

    #Create dataframe for predictions
    predictions = pd.DataFrame(index=candidates)
    #Add anime names
    predictions['name'] = np.array([anime_id_2_name_map.get(id_) for id_ in candidates])
    #Generate predictions
    predictions['score'] = ranker.predict(features[feature_columns])
    predictions = predictions.sort_values(by='score',ascending=False).head(N)

    predictions[f'already_liked - sample[{N}]'] = [anime_id_2_name_map.get(id_) for id_ in already_liked[0:len(predictions)]]
    return predictions

#let's generate the predictions
generate_predictions(123,user_2_anime_map,candidate_pool,feature_columns=features,anime_id_2_name_map=anime_id_2_name_map,ranker=model,N=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Genres'] = df['Genres'].apply(lambda x: ",".join(s.strip() for s in x.split(",")))


4774372 100000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.918345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2389
[LightGBM] [Info] Number of data points in the train set: 4774372, number of used features: 21


Unnamed: 0,name,score,already_liked - sample[10]
9002,True Tears: Raigomaru to Jibeta no Monogatari,4.327594,Majo no Takkyuubin
16482,Ojarumaru: Mangetsu Road Kiki Ippatsu - Tama n...,4.300468,Tenkuu no Shiro Laputa
3990,Kumo to Tulip,4.176066,Pumpkin Scissors
35204,The King of Fighters: Destiny,4.176066,Omoide Poroporo
2171,Detective Conan Movie 11: Jolly Roger in the D...,4.072907,Heisei Tanuki Gassen Ponpoko
30932,Oz no Mahoutsukai no Koutsuu Anzen no Tabi,4.072907,Tonari no Totoro
6272,Zakuro Yashiki,4.068001,Zetsuai 1989
18841,PES: Peace Eco Smile - Drive your Heart,4.062544,Monster
29764,Blend,4.062544,xxxHOLiC Kei
1711,Ryuusei Sentai Musumet,3.880011,Shounen Onmyouji


In [None]:
import pandas as pd
import zipfile

# Create anime_info DataFrame
anime_info_data = {
    'anime_id': [1, 2, 3, 4, 5],
    'Genres': ['Action, Adventure, Comedy, Drama, Sci-Fi, Space','Action, Adventure, Comedy, Drama, Sci-Fi, Space','Action, Sci-Fi, Comedy','Action, Adventure, Drama, Fantasy, Supernatural','Action, Adventure, Drama, Fantasy, Supernatural'],
    'is_tv': [1, 0, 1, 1, 0],
    'year_aired': [1998, 2001, 1998, 2003, 2004],
    'is_adult': [0] * 5,
    'above_five_star_ratings': [800, 200, 500, 150, 50],
    'above_five_star_ratio': [0.84, 0.78, 0.72, 0.65, 0.60]
}
anime_info_df = pd.DataFrame(anime_info_data)

# Create relavence_scores DataFrame
relavence_scores_data = {
    'anime_id': [1, 2, 3, 4, 5],
    'Name': ['Cowboy Bebop', 'Cowboy Bebop: The Movie', 'Trigun', 'Wolf\'s Rain', 'Wolf\'s Rain OVA'],
    'user_id': [1, 1, 2, 2, 3],
    'relavence_score': [0.8, 0.7, 0.9, 0.6, 0.8]
}
relavence_scores_df = pd.DataFrame(relavence_scores_data)

# Create user_info DataFrame
user_info_data = {
    'user_id': [1, 2, 3],
    'review_count': [200, 150, 100],
    'avg_score': [8.5, 7.8, 9.2],
    'score_stddev': [1.2, 1.5, 0.8],
    'above_five_star_count': [150, 100, 80],
    'above_five_star_ratio': [0.75, 0.80, 0.90]
}
user_info_df = pd.DataFrame(user_info_data)

# Create zip file and add CSV files
with zipfile.ZipFile('anime_dataset.zip', 'w') as zipf:
    anime_info_df.to_csv('anime_info_new.csv', index=False)
    relavence_scores_df.to_csv('relavence_scores_new.csv', index=False)
    user_info_df.to_csv('user_info_new.csv', index=False)
    zipf.write('anime_info_new.csv')
    zipf.write('relavence_scores_new.csv')
    zipf.write('user_info_new.csv')

# Optional: Remove individual CSV files after zipping
import os
os.remove('anime_info_new.csv')
os.remove('relavence_scores_new.csv')
os.remove('user_info_new.csv')

print("Dataset files created and zipped successfully.")

Dataset files created and zipped successfully.


In [None]:
import pandas as pd
import zipfile

# Step 1: Extract data from the zip file

# Specify the path to your zip file
zip_file_path = 'anime_dataset.zip'

# Create a ZipFile object
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all contents
    zip_ref.extractall()

# Step 2: Load data into Pandas DataFrames

# Load anime_info_new.csv into a DataFrame
anime_info_df = pd.read_csv('anime_info_new.csv')

# Load relavence_scores_new.csv into a DataFrame
relavence_scores_df = pd.read_csv('relavence_scores_new.csv')

# Load user_info_new.csv into a DataFrame
user_info_df = pd.read_csv('user_info_new.csv')

# Step 3: Optionally, remove extracted CSV files

# Clean up extracted CSV files
import os
os.remove('anime_info_new.csv')
os.remove('relavence_scores_new.csv')
os.remove('user_info_new.csv')

# Step 4: Verify and use the loaded DataFrames
print("anime_info_df:")
print(anime_info_df.head())

print("\nrelavence_scores_df:")
print(relavence_scores_df.head())

print("\nuser_info_df:")
print(user_info_df.head())


anime_info_df:
   anime_id                                           Genres  is_tv  \
0         1  Action, Adventure, Comedy, Drama, Sci-Fi, Space      1   
1         2  Action, Adventure, Comedy, Drama, Sci-Fi, Space      0   
2         3                           Action, Sci-Fi, Comedy      1   
3         4  Action, Adventure, Drama, Fantasy, Supernatural      1   
4         5  Action, Adventure, Drama, Fantasy, Supernatural      0   

   year_aired  is_adult  above_five_star_ratings  above_five_star_ratio  
0        1998         0                      800                   0.84  
1        2001         0                      200                   0.78  
2        1998         0                      500                   0.72  
3        2003         0                      150                   0.65  
4        2004         0                       50                   0.60  

relavence_scores_df:
   anime_id                     Name  user_id  relavence_score
0         1             Cowbo

In [None]:
import pandas as pd
import zipfile
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pickle

# Load the trained model
with open('anime_recommendation_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Get the feature names from the model
model_features = model.feature_name_

# Define the features we currently have
current_features = ['anime_id', 'user_id', 'ANIME_FEATURE_IS_TV', 'ANIME_FEATURE_YEAR_AIRED',
                    'ANIME_FEATURE_IS_ADULT', 'ANIME_FEATURE_ABOVE_FIVE_STAR_RATINGS',
                    'ANIME_FEATURE_ABOVE_FIVE_STAR_RATIO', 'ANIME_FEATURE_COMEDY',
                    'ANIME_FEATURE_ACTION', 'ANIME_FEATURE_FANTASY',
                    'ANIME_FEATURE_ADVENTURE', 'ANIME_FEATURE_DRAMA', 'ANIME_FEATURE_SCI-FI',
                    'ANIME_FEATURE_MUSIC', 'ANIME_FEATURE_SHOUNEN', 'ANIME_FEATURE_SLICE_OF_LIFE',
                    'USER_FEATURE REVIEW_COUNT', 'USER_FEATURE AVG_SCORE',
                    'USER_FEATURE SCORE_STDDEV', 'USER_FEATURE ABOVE_FIVE_STAR_COUNT',
                    'USER_FEATURE ABOVE_FIVE_STAR_RATIO']

# Identify missing features
missing_features = [feature for feature in model_features if feature not in current_features]
print(f"Missing features: {missing_features}")

# Unzip the dataset
zipped_data = zipfile.ZipFile("anime_dataset.zip")

# Load the datasets
anime_info_df = pd.read_csv(zipped_data.open('anime_info_new.csv'))
relavence_scores = pd.read_csv(zipped_data.open('relavence_scores_new.csv'))
user_info = pd.read_csv(zipped_data.open('user_info_new.csv'))

# Define popular genres
popular_genres = ['Action', 'Comedy', 'Fantasy', 'Adventure', 'Drama', 'Sci-Fi', 'Music', 'Shounen', 'Slice of Life']

# Function to create genre flags
def create_genre_flags(df, popular_genres):
    df['Genres'] = df['Genres'].fillna('')  # Replace NaN values with empty string
    df['Genres'] = df['Genres'].str.split(', ')
    mlb = MultiLabelBinarizer(classes=popular_genres)
    genre_flags = mlb.fit_transform(df['Genres'])
    genre_df = pd.DataFrame(genre_flags, columns=mlb.classes_, index=df.index)
    new_df = pd.concat([df['anime_id'], genre_df], axis=1)
    return new_df

# Create genre flags and merge with anime info
anime_genre_info_df = create_genre_flags(anime_info_df, popular_genres)
anime_info_df_final = anime_info_df.merge(anime_genre_info_df, on='anime_id')

# Ensure all popular genres columns are present
for genre in popular_genres:
    if genre not in anime_info_df_final.columns:
        anime_info_df_final[genre] = 0

# Rename columns to match the model's expected feature names
anime_info_df_final.rename(columns={
    'is_tv': 'ANIME_FEATURE_IS_TV',
    'year_aired': 'ANIME_FEATURE_YEAR_AIRED',
    'is_adult': 'ANIME_FEATURE_IS_ADULT',
    'above_five_star_ratings': 'ANIME_FEATURE_ABOVE_FIVE_STAR_RATINGS',
    'above_five_star_ratio': 'ANIME_FEATURE_ABOVE_FIVE_STAR_RATIO',
    'Action': 'ANIME_FEATURE_ACTION',
    'Comedy': 'ANIME_FEATURE_COMEDY',
    'Fantasy': 'ANIME_FEATURE_FANTASY',
    'Adventure': 'ANIME_FEATURE_ADVENTURE',
    'Drama': 'ANIME_FEATURE_DRAMA',
    'Sci-Fi': 'ANIME_FEATURE_SCI-FI',
    'Music': 'ANIME_FEATURE_MUSIC',
    'Shounen': 'ANIME_FEATURE_SHOUNEN',
    'Slice of Life': 'ANIME_FEATURE_SLICE_OF_LIFE'
}, inplace=True)

# Rename user info columns
user_info.columns = [col if col == 'user_id' else f"USER_FEATURE {col}".upper() for col in user_info.columns]

# Merge dataframes
test_interim = relavence_scores.merge(anime_info_df_final, on='anime_id')
test = test_interim.merge(user_info, on='user_id', how='inner')

# Define the function to limit entries per user (if needed)
def limit_entries_per_user(df, limit=8000):
    return df.groupby('user_id').apply(lambda x: x.head(limit)).reset_index(drop=True)

# Limit entries per user (if needed)
test_limited = limit_entries_per_user(test)

# Sort by user_id (if needed)
test_limited.sort_values(by='user_id', inplace=True)

# Define features (ensure consistency with the training data)
features = ['anime_id', 'user_id', 'ANIME_FEATURE_IS_TV',
       'ANIME_FEATURE_YEAR_AIRED', 'ANIME_FEATURE_IS_ADULT',
       'ANIME_FEATURE_ABOVE_FIVE_STAR_RATINGS',
       'ANIME_FEATURE_ABOVE_FIVE_STAR_RATIO', 'ANIME_FEATURE_COMEDY',
       'ANIME_FEATURE_ACTION', 'ANIME_FEATURE_FANTASY',
       'ANIME_FEATURE_ADVENTURE', 'ANIME_FEATURE_DRAMA', 'ANIME_FEATURE_SCI-FI',
       'ANIME_FEATURE_MUSIC', 'ANIME_FEATURE_SHOUNEN',
       'ANIME_FEATURE_SLICE_OF_LIFE', 'USER_FEATURE REVIEW_COUNT',
       'USER_FEATURE AVG_SCORE', 'USER_FEATURE SCORE_STDDEV',
       'USER_FEATURE ABOVE_FIVE_STAR_COUNT',
       'USER_FEATURE ABOVE_FIVE_STAR_RATIO']

# Ensure all necessary columns are present in test_limited
for col in missing_features:
    test_limited[col] = 0  # or np.nan, depending on the data type and model requirements

# Ensure all features are present in the correct order
all_features = features + missing_features
test_limited = test_limited[all_features]

# Define the candidate_generation function
def candidate_generation(user_id, candidate_pool, user_2_anime_map, N=8000):
    already_liked = set(user_2_anime_map.get(user_id, []))
    candidates = list(set(candidate_pool) - already_liked)
    return already_liked, candidates[:N]
# Define anime_id_2_name_map
anime_id_2_name_map = {row['anime_id']: row['Name'] for _, row in anime_info_df.iterrows()}

# Define the generate_predictions function as before
def generate_predictions(user_id, user_2_anime_map, candidate_pool, feature_columns, anime_id_2_name_map, ranker, N=10):
    already_liked, candidates = candidate_generation(user_id, candidate_pool, user_2_anime_map, N=10000)

    # Create dataframe for candidates
    candidates_df = pd.DataFrame(data=pd.Series(candidates, name='anime_id'))

    # Merge with feature dataframe
    features = anime_info_df_final.merge(candidates_df, on='anime_id')

    # Add user id as a feature
    features['user_id'] = user_id

    # Merge with user information
    features = features.merge(user_info, on='user_id')

    # If number of already liked animes is less than number of candidates
    # Extend the already liked list with -1
    already_liked = list(already_liked)
    if len(already_liked) < len(candidates):
        append_list = np.full(fill_value=-1, shape=(len(candidates) - len(already_liked)))
        already_liked.extend(list(append_list))

    # Create dataframe for predictions
    predictions = pd.DataFrame(index=candidates)
    # Add anime names
    predictions['name'] = [anime_id_2_name_map.get(id_) for id_ in candidates]
    # Generate predictions
    predictions['score'] = ranker.predict(features[feature_columns])
    predictions = predictions.sort_values(by='score', ascending=False).head(N)

    predictions[f'already_liked - sample[{N}]'] = [anime_id_2_name_map.get(id_) for id_ in already_liked[0:len(predictions)]]
    return predictions

# Define user_id, user_2_anime_map, and candidate_pool
user_id = 1  # Example user_id
user_2_anime_map = {}  # Provide the actual mapping (e.g., {1: [10, 20, 30]})
candidate_pool = anime_info_df_final['anime_id'].tolist()  # Example candidate pool

# Generate predictions
predictions = generate_predictions(user_id, user_2_anime_map, candidate_pool, features, anime_id_2_name_map, model, N=10)

# Display the top recommendations
print(predictions)







Missing features: ['USER_FEATURE_REVIEW_COUNT', 'USER_FEATURE_AVG_SCORE', 'USER_FEATURE_SCORE_STDDEV', 'USER_FEATURE_ABOVE_FIVE_STAR_COUNT', 'USER_FEATURE_ABOVE_FIVE_STAR_RATIO']




KeyError: 'Name'

In [None]:
import pandas as pd
import zipfile

# Load the datasets from the zip file
with zipfile.ZipFile("anime_dataset.zip", "r") as z:
    with z.open('anime_info_new.csv') as f1, z.open('relavence_scores_new.csv') as f2, z.open('user_info_new.csv') as f3:
        anime_info_df = pd.read_csv(f1)
        relavence_scores_df = pd.read_csv(f2)
        user_info_df = pd.read_csv(f3)

# Verify column names in relavence_scores_df
print("Column names in relavence_scores_df:", relavence_scores_df.columns)

# Define anime_id_2_name_map using the 'Name' column from relavence_scores_df
anime_id_2_name_map = {row['anime_id']: row['Name'] for _, row in relavence_scores_df.iterrows()}

# Print the anime_id_2_name_map to verify
print(anime_id_2_name_map)



Column names in relavence_scores_df: Index(['anime_id', 'Name', 'user_id', 'relavence_score'], dtype='object')
{1: 'Cowboy Bebop', 2: 'Cowboy Bebop: The Movie', 3: 'Trigun', 4: "Wolf's Rain", 5: "Wolf's Rain OVA"}


In [None]:
import pandas as pd
import zipfile
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

# Unzip the dataset
zipped_data = zipfile.ZipFile("anime_dataset.zip")

# Load the datasets
anime_info_df = pd.read_csv(zipped_data.open('anime_info_new.csv'))
relavence_scores = pd.read_csv(zipped_data.open('relavence_scores_new.csv'))
user_info = pd.read_csv(zipped_data.open('user_info_new.csv'))

# Define popular genres
popular_genres = ['Action', 'Comedy', 'Fantasy', 'Adventure', 'Drama', 'Sci-Fi', 'Music', 'Shounen', 'Slice of Life']

# Function to create genre flags
def create_genre_flags(df, popular_genres):
    df['Genres'] = df['Genres'].fillna('')  # Replace NaN values with empty string
    df['Genres'] = df['Genres'].str.split(', ')
    mlb = MultiLabelBinarizer(classes=popular_genres)
    genre_flags = mlb.fit_transform(df['Genres'])
    genre_df = pd.DataFrame(genre_flags, columns=mlb.classes_, index=df.index)
    new_df = pd.concat([df['anime_id'], genre_df], axis=1)
    return new_df

# Create genre flags and merge with anime info
anime_genre_info_df = create_genre_flags(anime_info_df, popular_genres)
anime_info_df_final = anime_info_df.merge(anime_genre_info_df, on='anime_id')

# Ensure all popular genres columns are present
for genre in popular_genres:
    if genre not in anime_info_df_final.columns:
        anime_info_df_final[genre] = 0

print(anime_info_df_final.head())  # Check the merged dataframe to see if genre columns are present

# Rename user info columns
user_info.columns = [col if col == 'user_id' else f"USER_FEATURE {col}".upper() for col in user_info.columns]

# Merge dataframes
test_interim = relavence_scores.merge(anime_info_df_final, on='anime_id')
test = test_interim.merge(user_info, on='user_id', how='inner')

# Define the function to limit entries per user (if needed)
def limit_entries_per_user(df, limit=10000):
    return df.groupby('user_id').apply(lambda x: x.head(limit)).reset_index(drop=True)

# Limit entries per user (if needed)
test_limited = limit_entries_per_user(test)

# Sort by user_id (if needed)
test_limited.sort_values(by='user_id', inplace=True)

# Define features (ensure consistency with the training data)
features = ['anime_id', 'user_id', 'is_tv',
       'year_aired', 'is_adult',
       'above_five_star_ratings',
       'above_five_star_ratio', 'Comedy',
       'Action', 'Fantasy',
       'Adventure', 'Drama', 'Sci-Fi',
       'Music', 'Shounen',
       'Slice of Life', 'USER_FEATURE REVIEW_COUNT',
       'USER_FEATURE AVG_SCORE', 'USER_FEATURE SCORE_STDDEV',
       'USER_FEATURE ABOVE_FIVE_STAR_COUNT',
       'USER_FEATURE ABOVE_FIVE_STAR_RATIO']

# Ensure all necessary columns are present in test_limited
missing_columns = [col for col in features if col not in test_limited.columns]
for col in missing_columns:
    test_limited[col] = 0  # or np.nan, depending on the data type and model requirements

# Filter to only the necessary columns
test_limited = test_limited[features]

# Load the trained model
with open('anime_recommendation_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Generate predictions for a specific user
user_id = 2  # Change this to the user_id you want recommendations for

# Define the function to generate predictions (assuming it is already defined)
# Make sure the function is adjusted to take these inputs correctly
def generate_predictions(user_id, user_2_anime_map, candidate_pool, feature_columns, anime_id_2_name_map, ranker, N=10):
    """
    Generates predictions for anime recommendations for a given user.

    Parameters:
        user_id (int): The user's ID.
        user_2_anime_map (dict): A dictionary that maps users to their liked animes.
        candidate_pool (list): A list of all possible anime candidates.
        feature_columns (list): A list of feature columns to use for generating predictions.
        anime_id_2_name_map (dict): A dictionary that maps anime IDs to their names.
        ranker (object): A trained model object that is used to generate predictions.
        N (int): The number of anime predictions to generate.

    Returns:
        predictions (DataFrame): A dataframe containing the top N anime recommendations for the user.
    """
    already_liked, candidates = candidate_generation(user_id, candidate_pool, user_2_anime_map, N=10000)

    # Create dataframe for candidates
    candidates_df = pd.DataFrame(data=pd.Series(candidates, name='anime_id'))

    # Merge with feature dataframe
    features = anime_info_df_final.merge(candidates_df, on='anime_id')

    # Add user id as a feature
    features['user_id'] = user_id

    # Merge with user information
    features = features.merge(user_info, on='user_id')

    # If number of already liked animes is less than number of candidates
    # Extend the already liked list with -1
    already_liked = list(already_liked)
    if len(already_liked) < len(candidates):
        append_list = np.full(fill_value=-1, shape=(len(candidates) - len(already_liked)))
        already_liked.extend(list(append_list))

    # Create dataframe for predictions
    predictions = pd.DataFrame(index=candidates)
    # Add anime names
    predictions['name'] = [anime_id_2_name_map.get(id_) for id_ in candidates]
    # Generate predictions
    predictions['score'] = ranker.predict(features[feature_columns])
    predictions = predictions.sort_values(by='score', ascending=False).head(N)

    predictions[f'already_liked - sample[{N}]'] = [anime_id_2_name_map.get(id_) for id_ in already_liked[0:len(predictions)]]
    return predictions

# Candidate generation function
def candidate_generation(user_id, candidate_pool, user_2_anime_map, N=100):
    """
    Generates a list of N anime candidates for a given user based on their previously liked animes.

    Parameters:
        user_id (int): The user's ID.
        candidate_pool (list): A list of all possible anime candidates.
        user_2_anime_map (dict): A dictionary that maps users to their liked animes.
        N (int): The number of anime candidates to generate.

    Returns:
        already_interacted (list): List of animes which user already liked.
        candidates (list): A list of N anime candidates for the user.
    """
    # Get the already liked animes
    already_interacted = user_2_anime_map.get(user_id, [])

    # Candidates will be the rest of animes which are not exposed to user
    candidates = list(set(candidate_pool) - set(already_interacted))

    return already_interacted, np.random.choice(candidates, size=N)

# Create candidate pool and mappings
candidate_pool = anime_info_df_final['anime_id'].unique().tolist()
anime_id_2_name = relavence_scores.drop_duplicates(subset=["anime_id", "Name"])[['anime_id', "Name"]]
anime_id_2_name_map = dict(zip(anime_id_2_name['anime_id'], anime_id_2_name['Name']))

# Create user to anime map
user_2_anime_df = relavence_scores.groupby("user_id").agg({"anime_id": lambda x: list(set(x))})
user_2_anime_map = dict(zip(user_2_anime_df.index, user_2_anime_df['anime_id']))

# Generate predictions
predictions = generate_predictions(user_id, user_2_anime_map, candidate_pool, features, anime_id_2_name_map, model, N=10)

# Display the top recommendations
print(predictions)





   anime_id                                             Genres  is_tv  \
0         1  [Action, Adventure, Comedy, Drama, Sci-Fi, Space]      1   
1         2  [Action, Adventure, Comedy, Drama, Sci-Fi, Space]      0   
2         3                           [Action, Sci-Fi, Comedy]      1   
3         4  [Action, Adventure, Drama, Fantasy, Supernatural]      1   
4         5  [Action, Adventure, Drama, Fantasy, Supernatural]      0   

   year_aired  is_adult  above_five_star_ratings  above_five_star_ratio  \
0        1998         0                      800                   0.84   
1        2001         0                      200                   0.78   
2        1998         0                      500                   0.72   
3        2003         0                      150                   0.65   
4        2004         0                       50                   0.60   

   Action  Comedy  Fantasy  Adventure  Drama  Sci-Fi  Music  Shounen  \
0       1       1        0          1 