In [106]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

Here, I have made a custom dataset of descriptions mapped with their labels of category of anime

In [42]:
data = {
    'description': [
        'A romantic comedy about a girl who hides her real appearance.',
        'A thrilling fantasy about a group of warriors fighting monsters.',
        'A slice-of-life romance set in high school with love triangles.',
        'An action-packed story about a superhero saving the world.',
        'A fantasy adventure where magic rules the kingdom.',
        'A mystery drama that follows a detective solving crimes.',
        'A romantic tale set in a magical universe with talking animals.',
        'A comedy about friends navigating college life and love.',
        'A science-fiction tale of humans exploring distant galaxies.',
        'A historical drama about ancient kings and their romances.',
        'A heartwarming romance between childhood friends reunited.',
        'A dark fantasy about a hero battling against a corrupt kingdom.',
        'An adventure-filled story of treasure hunters in a fantasy land.',
        'A supernatural romance between a human and a ghost.',
        'A thrilling action film about a group of friends uncovering a dark secret.',
        'An action drama about a martial artist seeking revenge against an evil lord.',
        'A magical realism story about a girl who can communicate with animals in a fantasy realm.',
        'A romantic drama about two star-crossed lovers from different worlds.',
        'An epic fantasy featuring dragons and ancient prophecies of love.',
        'A comedy about an aspiring musician chasing his dreams and finding love.',
        'A sci-fi story about time travel and the romantic consequences of changing the past.',
        'A romantic fantasy set in a world where love can control magic.',
        'An action-packed sequel where heroes fight to save their kingdom.',
        'A sci-fi adventure exploring the effects of love on alien planets.',
        'A fantasy tale of friendship and romance in an enchanted forest.',
        'A romantic thriller where secrets from the past threaten their future.',
        'An action-packed story of a spy who falls in love while on a mission.',
        'A fantasy story about a love potion gone wrong in a magical town.',
        'A comedic tale of love and rivalry in a sci-fi academy.',
        'A romantic comedy set in the virtual world of a gaming universe.',
        'A thrilling fantasy where a forbidden romance sparks a war.',
        'A sci-fi adventure where love conquers time and space.',
        'An action-packed journey of a hero who fights for love and justice.',
        'A fantasy saga about a young wizard discovering his powers and true love.',
        'A romantic drama about two musicians falling in love amidst their rivalry.',
        'A suspenseful action film where love leads to unexpected twists.',
        'A magical tale of love that bridges two different realms.',
        'A sci-fi thriller exploring the ethical dilemmas of love in a virtual world.',
        'A heartwarming story about finding love in the most unexpected places.',
        'An epic fantasy battle where love and friendship prevail against evil forces.',
        'A romantic adventure of two explorers who find love in an ancient ruin.',
        'A fast-paced action story about a hero who risks everything for love.',
        'A sci-fi romance set in a dystopian future where love is forbidden.',
        'A fantasy comedy about a mischievous fairy who causes romantic chaos.',
        'A chilling thriller about a love that becomes an obsession.',
        'A romantic fantasy about a princess and her secret admirer.',
        'An action-filled epic where warriors fight for love and honor.',
        'A sci-fi comedy where love blossoms between two alien beings.',
        'A sweet romance about a baker and a food critic in a culinary competition.',
        'A magical fantasy adventure about siblings who discover their family secret.',
        'A tragic romance set against the backdrop of a brewing war.',
        'An action-packed saga of a hero who fights against time to save his love.',
        'A heartwarming story of friendship that turns into unexpected romance.',
        'A sci-fi thriller about an AI developing feelings for its creator.',
        'A whimsical fantasy about a love that transcends dimensions.'
    ],
    'category': [
        'romance', 'fantasy', 'romance', 'action', 'fantasy',
        'action', 'romance', 'comedy', 'sci-fi', 'historical',
        'romance', 'fantasy', 'adventure', 'romance', 'thriller',
        'action', 'fantasy', 'romance', 'fantasy', 'comedy',
        'sci-fi', 'romance', 'action', 'sci-fi', 'fantasy',
        'romance', 'action', 'fantasy', 'comedy', 'romance',
        'fantasy', 'sci-fi', 'action', 'fantasy', 'romance',
        'suspense', 'fantasy', 'sci-fi', 'romance', 'fantasy',
        'action', 'sci-fi', 'comedy', 'thriller', 'romance',
        'action', 'sci-fi', 'fantasy', 'romance', 'fantasy',
        'sci-fi', 'action', 'romance', 'sci-fi', 'fantasy'
    ]
}

In [59]:
# These are the specific categories of the anime
category_mapping = {
    'romance': 'romance',
    'fantasy': 'fantasy',
    'mystery': 'mystery',
    'comedy': 'comedy',
    'sci-fi': 'sci-fi',
    'historical': 'drama',
    'adventure': 'adventure',
    'thriller': 'thriller',
    'action': 'action',
    'dark fantasy': 'fantasy',
    'magical realism': 'fantasy',
    'slice-of-life': 'romance',
    'heartwarming': 'romance',
    'tragic': 'romance',
    'supernatural': 'fantasy',
}

Since dataset is small and custom, I needed to balance all the data hence I grouped some anime categories to a broader set of 3 categories: romance, fantasy and action
If some new category is detected, it is mapped as fantasy for now, for a larger dataset, I would have left the categories as is

In [75]:
def consolidate_category(cat):
    if cat in ['romance', 'heartwarming', 'tragic', 'slice-of-life']:
        return 'romance'
    elif cat in ['fantasy', 'dark fantasy', 'magical realism']:
        return 'fantasy'
    elif cat in ['action', 'adventure', 'thriller', 'sci-fi']:
        return 'action'
    else:
        return 'fantasy'

Making a data frame out of the custom data

In [76]:
df = pd.DataFrame(data)
df['category'] = df['category'].map(consolidate_category)

In [77]:
# Checking the class distribution
print(df['category'].value_counts())

category
fantasy    21
action     20
romance    14
Name: count, dtype: int64


In [78]:
# Text preprocessing and TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['description'])
y = df['category']

# Stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


After testing, DecisionTree Model gave best results, hence I have used it in the final model

In [79]:
model = DecisionTreeClassifier(class_weight='balanced')

# Hyperparameter tuning
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [98]:
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)


In [105]:
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 92.73%


In [102]:
# Example descriptions for prediction
new_descriptions = [
    "A young hero discovers ancient powers while fighting dark forces.",
    "A heartwarming story of two friends who fall in love during summer vacation.",
]

X_new = vectorizer.transform(new_descriptions)

predictions = best_model.predict(X_new)

# Print the predictions
for description, category in zip(new_descriptions, predictions):
    print(f"Description: {description}\nPredicted Category: {category}\n")


Description: A young hero discovers ancient powers while fighting dark forces.
Predicted Category: action

Description: A heartwarming story of two friends who fall in love during summer vacation.
Predicted Category: romance

