In [118]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from src.cleaning import remove_mature_content

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [3]:
scraped_data = pd.read_csv(r'..\..\data\raw\AnimeData_300724.csv')
scraped_data.columns = scraped_data.columns.str.lower().str.replace(' ', '_')
scraped_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26794 entries, 0 to 26793
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         26794 non-null  object 
 1   episodes      13552 non-null  float64
 2   release_year  26794 non-null  int64  
 3   status        26389 non-null  object 
 4   genres        21414 non-null  object 
 5   studio        25663 non-null  object 
 6   source        23896 non-null  object 
 7   demographic   10142 non-null  object 
 8   themes        0 non-null      float64
 9   synopsis      26785 non-null  object 
 10  voters        13078 non-null  float64
 11  rating        646 non-null    float64
dtypes: float64(4), int64(1), object(7)
memory usage: 2.5+ MB


In [4]:
impute_data = pd.read_csv(r"..\..\data\raw\anime-dataset-2023.csv")
impute_data.columns = impute_data.columns.str.lower().str.replace(' ', '_')
impute_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   anime_id      24905 non-null  int64 
 1   name          24905 non-null  object
 2   english_name  24905 non-null  object
 3   other_name    24905 non-null  object
 4   score         24905 non-null  object
 5   genres        24905 non-null  object
 6   synopsis      24905 non-null  object
 7   type          24905 non-null  object
 8   episodes      24905 non-null  object
 9   aired         24905 non-null  object
 10  premiered     24905 non-null  object
 11  status        24905 non-null  object
 12  producers     24905 non-null  object
 13  licensors     24905 non-null  object
 14  studios       24905 non-null  object
 15  source        24905 non-null  object
 16  duration      24905 non-null  object
 17  rating        24905 non-null  object
 18  rank          24905 non-null  object
 19  popu

In [5]:
scraped_columns = set(scraped_data.columns)
impute_columns = set(impute_data.columns)
scraped_columns.intersection(impute_columns)

{'episodes', 'genres', 'rating', 'source', 'status', 'synopsis'}

In [6]:
scraped_data.nunique()

title           26757
episodes          251
release_year      106
status              2
genres            966
studio           1026
source             16
demographic         5
themes              0
synopsis        21732
voters            971
rating             92
dtype: int64

In [7]:
impute_data.nunique()

anime_id        24905
name            24901
english_name    10134
other_name      23796
score             567
genres           1006
synopsis        20113
type                7
episodes          252
aired           15213
premiered         244
status              3
producers        4423
licensors         265
studios          1547
source             17
duration          331
rating              7
rank            15198
popularity      18363
favorites        1814
scored_by        8281
members         10996
image_url       24720
dtype: int64

In [8]:
scraped_data['synopsis'].value_counts()[:2]

synopsis
(No synopsis yet.)                                                                               4772
No synopsis has been added for this series yet.\r\n\r\nClick here to update this information.      25
Name: count, dtype: int64

In [9]:
scraped_data = scraped_data.replace(
    '(No synopsis yet.)', np.nan).replace(
        'No synopsis has been added for this series yet.\r\n\r\nClick here to update this information.', np.nan)

In [96]:
def merge_anime_datasets(
    df1: pd.DataFrame, 
    df2: pd.DataFrame, 
    df1_title_column: str,
    df2_title_column,
    avg_episode_duration=23) -> pd.DataFrame:
    """
    Merges two anime datasets based on anime titles and applies custom rules
    for handling common columns.
    
    :param df1: First dataset (with 'title' as the anime name).
    :param df2: Second dataset (with 'name' as the anime name).
        
    :returns: Merged dataset with common columns handled according to custom logic.
    :rtype: pd.DataFrame
    """
    df1[df1_title_column] = df1[df1_title_column].str.strip().str.lower()
    df2[df2_title_column] = df2[df2_title_column].str.strip().str.lower()
    
    merged_df = pd.merge(df1, df2, left_on=df1_title_column, right_on=df2_title_column, how='outer')

    merged_df['episodes'] = pd.to_numeric(merged_df['episodes_x'], errors='coerce').combine(
        pd.to_numeric(merged_df['episodes_y'], errors='coerce'), max, fill_value=np.nan)
    
    # Handle 'genres': concatenate only if they are exclusive
    def combine_genres(genre1, genre2):
        if pd.isna(genre1):
            return genre2
        if pd.isna(genre2):
            return genre1
        genre1_set = set(genre1.split())
        genre2_set = set(genre2.split())
        if genre1_set == genre2_set:  # If the sets are the same, no need to concatenate
            return genre1
        else:
            combined_genres = " ".join(sorted(genre1_set.union(genre2_set)))
            return combined_genres
    
    merged_df['genres'] = merged_df.apply(lambda row: combine_genres(row['genres_x'], row['genres_y']), axis=1)
    
    merged_df['rating'] = pd.to_numeric(merged_df['rating_x'], errors='coerce').combine(
        pd.to_numeric(merged_df['rating_y'], errors='coerce'), max, fill_value=np.nan)
    
    merged_df['popularity'] = pd.to_numeric(merged_df['popularity'], errors='coerce').combine(
        pd.to_numeric(merged_df['rank'], errors='coerce'), max, fill_value=np.nan)
    
    merged_df['source'] = merged_df['source_y']
    
    merged_df['status'] = merged_df['status_x'].combine_first(merged_df['status_y'])
    
    merged_df['synopsis'] = merged_df['synopsis_x'].fillna('') + ' ' + merged_df['synopsis_y'].fillna('')
    merged_df['synopsis'] = merged_df['synopsis'].str.strip()
    
     # Calculate total duration based on episode count * average episode duration (in minutes)
    merged_df['total_duration_minutes'] = merged_df['episodes'] * avg_episode_duration
    
    # Convert total duration from minutes to hours
    merged_df['total_duration_hours'] = merged_df['total_duration_minutes'] / 60
    
    # Create duration-based categories
    def classify_total_duration(duration_hours):
        if pd.isna(duration_hours):
            return 'Unknown'
        elif duration_hours <= 2:
            return 'Very Short (Movie/Special)'
        elif duration_hours <= 6:
            return 'Short (2-6 hours)'
        elif duration_hours <= 12:
            return 'Medium (6-12 hours)'
        elif duration_hours <= 30:
            return 'Long (12-30 hours)'
        elif duration_hours <= 100:
            return 'Very Long (30-100 hours)'
        else:
            return 'Epic (>100 hours)'
    
    # Apply the function to classify based on total duration
    merged_df['duration_category'] = merged_df['total_duration_hours'].apply(classify_total_duration)
    
    merged_df = merged_df.drop(columns=['episodes_x', 'episodes_y', 'genres_x', 'genres_y',
                                        'rating_x', 'rating_y', 'source_x', 'status_x', 
                                        'status_y', 'synopsis_x', 'synopsis_y', 'name'])
    return merged_df

In [97]:
df = merge_anime_datasets(scraped_data, impute_data, 'title', 'name')

In [98]:
df.shape

(27601, 33)

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27601 entries, 0 to 27600
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   title                   26799 non-null  object 
 1   release_year            26799 non-null  float64
 2   studio                  25668 non-null  object 
 3   demographic             10143 non-null  object 
 4   themes                  0 non-null      float64
 5   voters                  13079 non-null  float64
 6   anime_id                24932 non-null  float64
 7   english_name            24932 non-null  object 
 8   other_name              24932 non-null  object 
 9   score                   24932 non-null  object 
 10  type                    24932 non-null  object 
 11  aired                   24932 non-null  object 
 12  premiered               24932 non-null  object 
 13  producers               24932 non-null  object 
 14  licensors               24932 non-null

In [100]:
df[['duration_category']].value_counts()

duration_category         
Unknown                       14047
Short (2-6 hours)              4762
Very Short (Movie/Special)     3136
Medium (6-12 hours)            2725
Long (12-30 hours)             2264
Very Long (30-100 hours)        566
Epic (>100 hours)               101
Name: count, dtype: int64

In [101]:
df.isnull().sum()

title                       802
release_year                802
studio                     1933
demographic               17458
themes                    27601
voters                    14522
anime_id                   2669
english_name               2669
other_name                 2669
score                      2669
type                       2669
aired                      2669
premiered                  2669
producers                  2669
licensors                  2669
studios                    2669
source_y                   2669
duration                   2669
rank                       2669
popularity                 2669
favorites                  2669
scored_by                  2669
members                    2669
image_url                  2669
episodes                  14047
genres                      784
rating                    26955
source                     2669
status                      281
synopsis                      0
total_duration_minutes    14047
total_du

In [102]:
df = df.drop(columns=['rating', 'source_y', 'rank', 'premiered'])

In [103]:
df['duration_category'].unique()

array(['Unknown', 'Very Short (Movie/Special)', 'Short (2-6 hours)',
       'Medium (6-12 hours)', 'Long (12-30 hours)',
       'Very Long (30-100 hours)', 'Epic (>100 hours)'], dtype=object)

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27601 entries, 0 to 27600
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   title                   26799 non-null  object 
 1   release_year            26799 non-null  float64
 2   studio                  25668 non-null  object 
 3   demographic             10143 non-null  object 
 4   themes                  0 non-null      float64
 5   voters                  13079 non-null  float64
 6   anime_id                24932 non-null  float64
 7   english_name            24932 non-null  object 
 8   other_name              24932 non-null  object 
 9   score                   24932 non-null  object 
 10  type                    24932 non-null  object 
 11  aired                   24932 non-null  object 
 12  producers               24932 non-null  object 
 13  licensors               24932 non-null  object 
 14  studios                 24932 non-null

In [105]:
df.isnull().sum()

title                       802
release_year                802
studio                     1933
demographic               17458
themes                    27601
voters                    14522
anime_id                   2669
english_name               2669
other_name                 2669
score                      2669
type                       2669
aired                      2669
producers                  2669
licensors                  2669
studios                    2669
duration                   2669
popularity                 2669
favorites                  2669
scored_by                  2669
members                    2669
image_url                  2669
episodes                  14047
genres                      784
source                     2669
status                      281
synopsis                      0
total_duration_minutes    14047
total_duration_hours      14047
duration_category             0
dtype: int64

In [106]:
def preprocess_text(text: str) -> str:
    """
    Tokenize, remove stopwords, and apply stemming to the text.

    :param text: Input text to preprocess.
    :return: Preprocessed text as a single string.
    :rtype: str
    """
    tokens = word_tokenize(text.lower())
    processed = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(processed)

In [108]:
df['synopsis'] = df['synopsis'].apply(preprocess_text)

In [117]:
df['genres'].sample(5)

23397          Action, Fantasy
3791     Comedy, Slice of Life
11489                  UNKNOWN
828                     Hentai
20748                  UNKNOWN
Name: genres, dtype: object

In [120]:
df[['score','popularity']]

Unnamed: 0,score,popularity
0,6.01,14667.0
1,4.65,8712.0
2,UNKNOWN,14939.0
3,UNKNOWN,22924.0
4,6.86,4675.0
...,...,...
27596,,
27597,,
27598,3.75,12635.0
27599,UNKNOWN,0.0


In [121]:
def calculate_hype(df: pd.DataFrame):
    df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(1) 
    df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce').fillna(1)
    
    df['hype'] = df['score'] * df['popularity']
    
    return df

In [122]:
df = calculate_hype(df)

In [123]:
df[['score','popularity','hype']]

Unnamed: 0,score,popularity,hype
0,6.01,14667.0,88148.67
1,4.65,8712.0,40510.80
2,1.00,14939.0,14939.00
3,1.00,22924.0,22924.00
4,6.86,4675.0,32070.50
...,...,...,...
27596,1.00,1.0,1.00
27597,1.00,1.0,1.00
27598,3.75,12635.0,47381.25
27599,1.00,0.0,0.00


In [124]:
df.to_csv(r'..\..\data\final\AnimeData_25092024.csv', index=False)