In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the story metadata from the SQLite dump into a DataFrame.
# The path is relative, so the database file must sit next to this notebook.
db_path = 'metadata-110mil.sqlite'

# Table to load; it should be one of the names printed from sqlite_master below.
# (SQL parameters cannot bind identifiers, so the name is interpolated -- keep it a
# hard-coded constant, never user input.)
table_name = 'metadata'  # Replace with your actual table name

conn = sqlite3.connect(db_path)
try:
    # Check available tables in the database (optional)
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
    print("Available tables:", tables)

    # Load the whole table into memory
    df = pd.read_sql_query(f"SELECT * FROM {table_name};", conn)
finally:
    # Always release the connection, even when one of the queries above raises.
    conn.close()

# Display the first few rows of the DataFrame
df.head()


Available tables:        name
0  metadata


Unnamed: 0,Path,Title,Author,Category,Genre,Language,Status,Published,Updated,Packaged,Rating,Chapters,Words,Publisher,Story URL,Author URL,Summary,word_count,chapter_count,story_id
0,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,The Distant Strains of Triumph,SpartanDelta2,"Halo, Mass Effect","Drama, Sci-Fi",English,Completed,2013-07-15,2013-07-15,2013-10-28 09:08:08,M,1,1180,www.fanfiction.net,http://www.fanfiction.net/s/9493934/1/,http://www.fanfiction.net/u/2727455/SpartanDelta2,"A project for my class, a challenge from my te...",1180,1,9493934
1,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Mass Effect: Beyond,Cunningham-Hughes,"Halo, Mass Effect","Adventure, Sci-Fi",English,Completed,2012-10-01,2012-10-01,2013-07-21 09:44:53,T,1,9971,www.fanfiction.net,http://www.fanfiction.net/s/8573953/1/,http://www.fanfiction.net/u/4262625/Cunningham...,A hardcore Mass Effect fan's retelling of the ...,9971,1,8573953
2,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Chaos Infinitium,Sysero of Cain,"Halo, Mass Effect","Adventure, Sci-Fi",English,Completed,2011-03-11,2011-11-17,2014-11-23 22:48:57,T,5,19447,www.fanfiction.net,https://www.fanfiction.net/s/6816070/1/,https://www.fanfiction.net/u/2362265/Sysero-of...,First Contact never seems to go well. But thin...,19447,5,6816070
3,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Tools of Defiance,Magisking,"Halo, Mass Effect","Angst, Tragedy",English,Completed,2013-12-22,2013-12-22,2014-02-07 23:48:12,T,1,997,www.fanfiction.net,https://www.fanfiction.net/s/9946669/1/,https://www.fanfiction.net/u/5244687/Magisking,A One-shot that takes place in the Defiance un...,997,1,9946669
4,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Mass Effect New Origins V2,erttheking,"Halo, Mass Effect","Romance, Sci-Fi",English,Completed,2010-10-05,2012-06-09,2014-05-25 15:31:13,T,109,442687,www.fanfiction.net,https://www.fanfiction.net/s/6376514/1/,https://www.fanfiction.net/u/1835782/erttheking,"On the dawn of the 27th century, the UNSC disc...",442687,109,6376514


In [3]:
# Take the first row (positional index 0) as a Series so every column of one record can be inspected.
test = df.iloc[0]
print(test)

Path             Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...
Title                               The Distant Strains of Triumph
Author                                               SpartanDelta2
Category                                         Halo, Mass Effect
Genre                                                Drama, Sci-Fi
Language                                                   English
Status                                                   Completed
Published                                               2013-07-15
Updated                                                 2013-07-15
Packaged                                       2013-10-28 09:08:08
Rating                                                           M
Chapters                                                         1
Words                                                        1,180
Publisher                                       www.fanfiction.net
Story URL                   http://www.fanfiction.net/s/949393

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Set the font to a universal one like Noto Sans
#rcParams['font.sans-serif'] = ['Noto Sans', 'DejaVu Sans', 'Arial Unicode MS']  # Ensure Noto Sans is installed
#rcParams['axes.unicode_minus'] = False  # Avoid issues with minus signs

# Load your data (assuming it's already in a DataFrame `df`)
# df = pd.read_csv('your_file.csv')  # Uncomment this if you're loading from a CSV file

# Split categories by commas and stack them to get a single column of all categories
#all_categories = df['Category'].str.split(',').explode()

# Strip any extra whitespace from each category (important if there are spaces after commas)
#all_categories = all_categories.str.strip()

# Count occurrences of each unique category
#category_counts = all_categories.value_counts()

# Filter categories to include only those with at least 10000 occurrences
#filtered_category_counts = category_counts[category_counts >= 10000]

# Plot as a bar chart
#plt.figure(figsize=(20, 6))
#filtered_category_counts.plot(kind='bar', color='skyblue')
#plt.title('Distribution of Categories (at least 10000 entries)')
#plt.xlabel('Category')
#plt.ylabel('Count')
#plt.xticks(rotation=90, ha='right')
#plt.tight_layout()
#plt.show()


![category_distribution](category_distribution.png)

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Take a random sample of 100,000 rows (adjust the number as needed for performance)
#sample_df = df['word_count'].sample(100000, random_state=42)

#plt.figure(figsize=(10, 6))
#sns.histplot(sample_df, bins=50, kde=True)

#plt.xlabel("Word Count")
#plt.ylabel("Frequency")
#plt.title("Distribution of Word Count (Sampled)")

#plt.show()


![word distribution](word_distribution.png)

<h2> Cleaning and Formatting Data: </h2>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704321 entries, 0 to 6704320
Data columns (total 20 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Path           object
 1   Title          object
 2   Author         object
 3   Category       object
 4   Genre          object
 5   Language       object
 6   Status         object
 7   Published      object
 8   Updated        object
 9   Packaged       object
 10  Rating         object
 11  Chapters       object
 12  Words          object
 13  Publisher      object
 14  Story URL      object
 15  Author URL     object
 16  Summary        object
 17  word_count     object
 18  chapter_count  object
 19  story_id       object
dtypes: object(20)
memory usage: 1023.0+ MB


In [7]:
# Rows whose Publisher is the empty string. Filtering and taking len() counts the matching
# rows directly; DataFrame.value_counts() would count *distinct* rows instead (and has to
# hash every column), which only gives the same number when no two rows are identical.
# TODO: clean these rows -- they appear to carry no value besides Path.
missing_all_values = df[df["Publisher"] == ""]
len(missing_all_values)

79

In [8]:
# Per-column tally of real nulls (NaN/None); empty strings are not counted by this check.
rows_with_null = df.isna().sum()
rows_with_null

Path             0
Title            0
Author           0
Category         0
Genre            0
Language         0
Status           0
Published        0
Updated          0
Packaged         0
Rating           0
Chapters         0
Words            0
Publisher        0
Story URL        0
Author URL       0
Summary          0
word_count       0
chapter_count    0
story_id         0
dtype: int64

In [9]:
# Number of rows where at least one column is either null or an empty string.
rows_with_missing_values = (
    (df.isna() | df.eq(""))
    .any(axis="columns")
    .sum()
)
rows_with_missing_values

652658

In [10]:
rows_with_missing_values / len(df) * 100

9.734885904180304

In [11]:
# Per-column count of cells holding an empty string.
all_missing = df.eq("").sum()
all_missing

Path                  0
Title               142
Author               83
Category            120
Genre            652471
Language             79
Status               79
Published            81
Updated              81
Packaged             81
Rating               79
Chapters             79
Words                79
Publisher            79
Story URL            79
Author URL           79
Summary             210
word_count           79
chapter_count        79
story_id             79
dtype: int64

In [12]:
# Convert the word and chapter counts from text to nullable integers.
# Values that cannot be parsed as numbers are coerced to missing (<NA>).
for count_column in ('word_count', 'chapter_count'):
    parsed_counts = pd.to_numeric(df[count_column], errors='coerce')
    df[count_column] = parsed_counts.astype('Int64')

In [13]:
# Build the working frame `df_usable` from the raw `df`.

# Remove redundant and unimportant columns
df_usable = df.drop(columns=['Chapters', 'Words', 'Path', 'Story URL', 'Author URL'])

# Remove rows without a usable word count (this includes the rows where every value is
# missing). word_count has already been converted with pd.to_numeric(errors='coerce'),
# so formerly-empty strings are <NA> by now; comparing the column with '' can no longer
# identify them, hence the explicit notna() test.
df_usable = df_usable[df_usable['word_count'].notna()]

# Remove rows where summary, category or genre is an empty string
has_text_fields = (
    (df_usable['Summary'] != '')
    & (df_usable['Category'] != '')
    & (df_usable['Genre'] != '')
)
df_usable = df_usable[has_text_fields]

# Remove extreme word count values (keep 100 < x < 2,000,000)
df_usable = df_usable[(df_usable['word_count'] > 100) & (df_usable['word_count'] < 2000000)]


In [14]:
# Packaging timestamp and publisher site are not needed for the later steps.
df_usable = df_usable.drop(['Packaged', 'Publisher'], axis='columns')

In [15]:
# Discard the remaining bookkeeping columns.
df_usable = df_usable.drop(
    ['Status', 'Published', 'Updated', 'Rating', 'story_id'],
    axis='columns',
)


In [16]:
# Keep only the stories whose Language is exactly 'English'.
df_usable = df_usable[df_usable['Language'].eq('English')]

In [17]:
# Language is constant after the English-only filter, so the column carries no information.
df_usable = df_usable.drop(['Language'], axis='columns')

In [18]:
# The numeric counts and the author name are no longer needed from here on.
df_usable = df_usable.drop(['word_count', 'chapter_count', 'Author'], axis='columns')

In [19]:
# Number of stories per distinct Category value.
category_counts = df_usable["Category"].value_counts()

# Category values that occur more than 1000 times.
categories_to_keep = category_counts.loc[category_counts.gt(1000)].index

# Keep only the stories belonging to one of those categories.
in_frequent_category = df_usable["Category"].isin(categories_to_keep)
df_usable = df_usable[in_frequent_category]

len(df_usable)

4599826

In [20]:
# Counting after cleaning all the missing
missing = (df_usable == "").sum()
missing

Title       27
Category     0
Genre        0
Summary      0
dtype: int64

In [21]:
df_usable.columns

Index(['Title', 'Category', 'Genre', 'Summary'], dtype='object')

In [22]:
len(df_usable)

4599826

In [23]:
# Drop stories whose Genre string or Category occurs fewer than `min_samples` times.
min_samples = 2

# Occurrences of each distinct Genre string and each distinct Category.
genre_counts = df_usable['Genre'].value_counts()
category_counts = df_usable['Category'].value_counts()

common_genres = genre_counts.loc[genre_counts.ge(min_samples)].index
common_categories = category_counts.loc[category_counts.ge(min_samples)].index

genre_is_common = df_usable['Genre'].isin(common_genres)
category_is_common = df_usable['Category'].isin(common_categories)
df_usable = df_usable[genre_is_common & category_is_common]

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

import lightgbm as lgb

In [25]:
# Split the ', '-separated Genre string into a list of individual genre names per row.
df_usable['Genre_List'] = df_usable['Genre'].str.split(', ')

In [26]:
df_usable

Unnamed: 0,Title,Category,Genre,Summary,Genre_List
484,Wasted Time,Tomb Raider,"Friendship, Romance",One shot of Sam and Lara in University. Lara d...,"[Friendship, Romance]"
485,Tomb Raider: Let the Rain Fall,Tomb Raider,Drama,Short Story; Lara gets a call from an old frie...,[Drama]
486,Yes,Tomb Raider,"Friendship, Romance","After Alex's death, Lara ponders over her own ...","[Friendship, Romance]"
488,The mask of Tak,Tomb Raider,"Adventure, Romance",1991: Lara goes to a lonely island to find the...,"[Adventure, Romance]"
490,From Death Comes Life,Tomb Raider,"Family, Hurt-Comfort","Lara has always been a people pleaser, but to ...","[Family, Hurt-Comfort]"
...,...,...,...,...,...
6704254,What Once Was Mine,Tangled,"Adventure, Drama",Before Rapunzel and Eugene Fitzherbert came al...,"[Adventure, Drama]"
6704255,Rapunzel's Rebellion,Tangled,"Adventure, Family","Gothel never kidnapped Rapunzel, but life as C...","[Adventure, Family]"
6704256,Michaela Ever After,Tangled,"Adventure, Romance","When Michaela's musical powers are discovered,...","[Adventure, Romance]"
6704257,Tangled Up in Ficlets,Tangled,"Humor, Romance","An ongoing collection of drabbles and ficlets,...","[Humor, Romance]"


In [27]:
# Build the model's text field: title and summary joined by a single space.
# A few rows have an empty Title (see the missing-value count above); their text starts with the space.
df_usable['combined_text'] = df_usable['Title'] + ' ' + df_usable['Summary']

In [28]:
#df_usable = pd.read_csv("filtered_usable_df.csv", sep=";")
#print(df_usable.shape)

In [29]:
df_usable

Unnamed: 0,Title,Category,Genre,Summary,Genre_List,combined_text
484,Wasted Time,Tomb Raider,"Friendship, Romance",One shot of Sam and Lara in University. Lara d...,"[Friendship, Romance]",Wasted Time One shot of Sam and Lara in Univer...
485,Tomb Raider: Let the Rain Fall,Tomb Raider,Drama,Short Story; Lara gets a call from an old frie...,[Drama],Tomb Raider: Let the Rain Fall Short Story; La...
486,Yes,Tomb Raider,"Friendship, Romance","After Alex's death, Lara ponders over her own ...","[Friendship, Romance]","Yes After Alex's death, Lara ponders over her ..."
488,The mask of Tak,Tomb Raider,"Adventure, Romance",1991: Lara goes to a lonely island to find the...,"[Adventure, Romance]",The mask of Tak 1991: Lara goes to a lonely is...
490,From Death Comes Life,Tomb Raider,"Family, Hurt-Comfort","Lara has always been a people pleaser, but to ...","[Family, Hurt-Comfort]",From Death Comes Life Lara has always been a p...
...,...,...,...,...,...,...
6704254,What Once Was Mine,Tangled,"Adventure, Drama",Before Rapunzel and Eugene Fitzherbert came al...,"[Adventure, Drama]",What Once Was Mine Before Rapunzel and Eugene ...
6704255,Rapunzel's Rebellion,Tangled,"Adventure, Family","Gothel never kidnapped Rapunzel, but life as C...","[Adventure, Family]",Rapunzel's Rebellion Gothel never kidnapped Ra...
6704256,Michaela Ever After,Tangled,"Adventure, Romance","When Michaela's musical powers are discovered,...","[Adventure, Romance]",Michaela Ever After When Michaela's musical po...
6704257,Tangled Up in Ficlets,Tangled,"Humor, Romance","An ongoing collection of drabbles and ficlets,...","[Humor, Romance]",Tangled Up in Ficlets An ongoing collection of...


In [30]:
print(df_usable["Category"].value_counts())

Category
Harry Potter        498367
Naruto              260357
Twilight            162406
Glee                 92669
Inuyasha             88590
                     ...  
Gone                  1008
Crossing Jordan       1005
Kamen Rider           1005
Treasure Planet       1004
Samurai Champloo      1003
Name: count, Length: 553, dtype: int64


In [31]:
#df_usable_2 = pd.read_csv("filtered_usable_2_df.csv", sep=";")
#print(df_usable_2.shape)

In [32]:
#df_usable_2

In [33]:
# Stories per Category as a plain dict.
# NOTE: this rebinds `category_counts`, which held a pandas Series in earlier cells, to a dict.
category_counts = df_usable["Category"].value_counts().to_dict()
category_counts

{'Harry Potter': 498367,
 'Naruto': 260357,
 'Twilight': 162406,
 'Glee': 92669,
 'Inuyasha': 88590,
 'Hetalia - Axis Powers': 88037,
 'Supernatural': 84702,
 'Pokémon': 68114,
 'Kingdom Hearts': 63856,
 'Bleach': 59236,
 'Yu-Gi-Oh': 57242,
 'Percy Jackson and the Olympians': 52823,
 'Doctor Who': 51781,
 'Sherlock': 44561,
 'Lord of the Rings': 41337,
 'Hunger Games': 35965,
 'Fullmetal Alchemist': 35860,
 'Wrestling': 35213,
 'Dragon Ball Z': 34618,
 'Avatar: Last Airbender': 34386,
 'Buffy: The Vampire Slayer': 34177,
 'Teen Titans': 33935,
 'Final Fantasy VII': 32793,
 'Digimon': 32042,
 'Fairy Tail': 31812,
 'NCIS': 31619,
 'Vampire Diaries': 31345,
 'Star Wars': 31117,
 'Gundam Wing/AC': 30924,
 'Sonic the Hedgehog': 29558,
 'Sailor Moon': 28941,
 'Once Upon a Time': 28172,
 'Avengers': 26650,
 'Death Note': 26322,
 'Criminal Minds': 24748,
 'CSI': 23186,
 'Legend of Zelda': 23051,
 'Katekyo Hitman Reborn!': 21759,
 'Stargate: SG-1': 21588,
 'Transformers/Beast Wars': 21518,
 'Yu

In [34]:
genre_dict = {}

In [35]:
# Count how many stories carry each individual genre (a story can list several,
# separated by ', ').
# The dict is (re)created here so that re-running this cell starts from zero instead of
# adding to the counts of a previous run. Keys stay in first-seen order, which later
# cells rely on for the order of the per-genre columns.
genre_dict = {}
for genre_string in df_usable["Genre"]:
    for genre_name in genre_string.split(', '):
        genre_dict[genre_name] = genre_dict.get(genre_name, 0) + 1

genre_dict

{'Friendship': 417007,
 'Romance': 2541191,
 'Drama': 831868,
 'Adventure': 606753,
 'Family': 289416,
 'Hurt-Comfort': 502046,
 'Suspense': 93899,
 'Fantasy': 128166,
 'Sci-Fi': 76237,
 'Humor': 994363,
 'Angst': 634898,
 'Supernatural': 127426,
 'Tragedy': 184707,
 'Spiritual': 22877,
 'Poetry': 68009,
 'Parody': 85545,
 'Mystery': 98724,
 'Horror': 95062,
 'Crime': 33802,
 'Western': 2512}

In [36]:
genre_dict.keys()

dict_keys(['Friendship', 'Romance', 'Drama', 'Adventure', 'Family', 'Hurt-Comfort', 'Suspense', 'Fantasy', 'Sci-Fi', 'Humor', 'Angst', 'Supernatural', 'Tragedy', 'Spiritual', 'Poetry', 'Parody', 'Mystery', 'Horror', 'Crime', 'Western'])

In [37]:
# Add one 0/1 indicator column per genre (column name = genre name).
# The Genre strings are split once up front; splitting inside the loop would repeat the
# same work for every genre.
genre_lists = df_usable['Genre'].map(lambda value: str(value).split(', '))
for genre in genre_dict.keys():
    # `genre=genre` binds the current loop value into the lambda explicitly.
    df_usable[genre] = genre_lists.apply(lambda parts, genre=genre: 1 if genre in parts else 0)

In [38]:
df_usable

Unnamed: 0,Title,Category,Genre,Summary,Genre_List,combined_text,Friendship,Romance,Drama,Adventure,...,Angst,Supernatural,Tragedy,Spiritual,Poetry,Parody,Mystery,Horror,Crime,Western
484,Wasted Time,Tomb Raider,"Friendship, Romance",One shot of Sam and Lara in University. Lara d...,"[Friendship, Romance]",Wasted Time One shot of Sam and Lara in Univer...,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
485,Tomb Raider: Let the Rain Fall,Tomb Raider,Drama,Short Story; Lara gets a call from an old frie...,[Drama],Tomb Raider: Let the Rain Fall Short Story; La...,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
486,Yes,Tomb Raider,"Friendship, Romance","After Alex's death, Lara ponders over her own ...","[Friendship, Romance]","Yes After Alex's death, Lara ponders over her ...",1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
488,The mask of Tak,Tomb Raider,"Adventure, Romance",1991: Lara goes to a lonely island to find the...,"[Adventure, Romance]",The mask of Tak 1991: Lara goes to a lonely is...,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
490,From Death Comes Life,Tomb Raider,"Family, Hurt-Comfort","Lara has always been a people pleaser, but to ...","[Family, Hurt-Comfort]",From Death Comes Life Lara has always been a p...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6704254,What Once Was Mine,Tangled,"Adventure, Drama",Before Rapunzel and Eugene Fitzherbert came al...,"[Adventure, Drama]",What Once Was Mine Before Rapunzel and Eugene ...,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
6704255,Rapunzel's Rebellion,Tangled,"Adventure, Family","Gothel never kidnapped Rapunzel, but life as C...","[Adventure, Family]",Rapunzel's Rebellion Gothel never kidnapped Ra...,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6704256,Michaela Ever After,Tangled,"Adventure, Romance","When Michaela's musical powers are discovered,...","[Adventure, Romance]",Michaela Ever After When Michaela's musical po...,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
6704257,Tangled Up in Ficlets,Tangled,"Humor, Romance","An ongoing collection of drabbles and ficlets,...","[Humor, Romance]",Tangled Up in Ficlets An ongoing collection of...,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# The raw genre string and its list form are superseded by the per-genre indicator columns.
df_usable = df_usable.drop(['Genre', 'Genre_List'], axis='columns')

In [40]:
df_usable

Unnamed: 0,Title,Category,Summary,combined_text,Friendship,Romance,Drama,Adventure,Family,Hurt-Comfort,...,Angst,Supernatural,Tragedy,Spiritual,Poetry,Parody,Mystery,Horror,Crime,Western
484,Wasted Time,Tomb Raider,One shot of Sam and Lara in University. Lara d...,Wasted Time One shot of Sam and Lara in Univer...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
485,Tomb Raider: Let the Rain Fall,Tomb Raider,Short Story; Lara gets a call from an old frie...,Tomb Raider: Let the Rain Fall Short Story; La...,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
486,Yes,Tomb Raider,"After Alex's death, Lara ponders over her own ...","Yes After Alex's death, Lara ponders over her ...",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
488,The mask of Tak,Tomb Raider,1991: Lara goes to a lonely island to find the...,The mask of Tak 1991: Lara goes to a lonely is...,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
490,From Death Comes Life,Tomb Raider,"Lara has always been a people pleaser, but to ...",From Death Comes Life Lara has always been a p...,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6704254,What Once Was Mine,Tangled,Before Rapunzel and Eugene Fitzherbert came al...,What Once Was Mine Before Rapunzel and Eugene ...,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6704255,Rapunzel's Rebellion,Tangled,"Gothel never kidnapped Rapunzel, but life as C...",Rapunzel's Rebellion Gothel never kidnapped Ra...,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6704256,Michaela Ever After,Tangled,"When Michaela's musical powers are discovered,...",Michaela Ever After When Michaela's musical po...,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6704257,Tangled Up in Ficlets,Tangled,"An ongoing collection of drabbles and ficlets,...",Tangled Up in Ficlets An ongoing collection of...,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
df_usable.head()

Unnamed: 0,Title,Category,Summary,combined_text,Friendship,Romance,Drama,Adventure,Family,Hurt-Comfort,...,Angst,Supernatural,Tragedy,Spiritual,Poetry,Parody,Mystery,Horror,Crime,Western
484,Wasted Time,Tomb Raider,One shot of Sam and Lara in University. Lara d...,Wasted Time One shot of Sam and Lara in Univer...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
485,Tomb Raider: Let the Rain Fall,Tomb Raider,Short Story; Lara gets a call from an old frie...,Tomb Raider: Let the Rain Fall Short Story; La...,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
486,Yes,Tomb Raider,"After Alex's death, Lara ponders over her own ...","Yes After Alex's death, Lara ponders over her ...",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
488,The mask of Tak,Tomb Raider,1991: Lara goes to a lonely island to find the...,The mask of Tak 1991: Lara goes to a lonely is...,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
490,From Death Comes Life,Tomb Raider,"Lara has always been a people pleaser, but to ...",From Death Comes Life Lara has always been a p...,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [42]:
def clean_text(text):
    """Normalise a string to lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. delete apostrophes (so "don't" becomes "dont" rather than "don t");
      2. replace every character that is not an ASCII letter with a space
         (digits, punctuation and non-ASCII letters such as "é" are all replaced);
      3. collapse runs of whitespace and trim both ends;
      4. lowercase the result.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The cleaned text; an empty string if no ASCII letters remain.
    """
    without_apostrophes = re.sub("\'", "", text)
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    words = letters_only.split()
    return ' '.join(words).lower()

In [43]:
# Normalise every combined title+summary string with clean_text.
df_usable["combined_text"] = df_usable["combined_text"].map(clean_text)

In [44]:
from nltk.corpus import stopwords

In [45]:
# English stop words as a set (fast membership tests).
# The stop-word corpus is not bundled with nltk: on an environment where it has never
# been downloaded, stopwords.words() raises LookupError, so fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk  # local import: only needed for the one-off corpus download

    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop_words` removed.

    The remaining tokens are re-joined with single spaces. Matching is exact, so
    the text is expected to be in the same case as the entries of `stop_words`.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stop words from every cleaned text.
df_usable["combined_text"] = df_usable["combined_text"].map(remove_stopwords)

In [46]:
# Report the shape of the cleaned frame. It is printed explicitly because a notebook only
# displays the *last* expression of a cell; a bare `df_usable.shape` here is discarded.
print("Shape:", df_usable.shape)

# Specify the output file
output_file = "filtered_usable_df.csv"

# Save to CSV (';'-separated, UTF-8); index=False leaves the row index out of the file
df_usable.to_csv(output_file, sep=";", encoding="utf-8", index=False)

print(f"DataFrame successfully written to {output_file}")

DataFrame successfully written to filtered_usable_df.csv


<h2> Build a model </h2>

In [47]:
#df_usable = pd.read_csv("filtered_usable_df.csv", sep=";")
#print(df_usable.shape)

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# Assuming df_usable is already loaded

# Step 1: Features and target variables
# One binary indicator column per genre; together they form the multi-label target.
genre_columns = [
    'Friendship', 'Romance', 'Drama', 'Adventure', 'Family', 'Hurt-Comfort',
    'Suspense', 'Fantasy', 'Sci-Fi', 'Humor', 'Angst', 'Supernatural', 'Tragedy',
    'Spiritual', 'Poetry', 'Parody', 'Mystery', 'Horror', 'Crime', 'Western',
]
X = df_usable['combined_text'].str.lower()  # Text features
y = df_usable[genre_columns]

# Step 2: Split into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Step 3: Vectorize the text using TF-IDF
# max_features caps the vocabulary at 10000 terms; max_df=0.8 ignores terms that occur in
# more than 80% of the documents.
vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8)  # Limit features for simplicity
# Fit on the training texts only; the test texts are transformed with that fitted
# vocabulary so nothing is learned from the test set.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [50]:
import joblib

# Save the fitted TF-IDF vectorizer to disk so it can be reloaded without refitting
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [51]:
#!pip install scikit-multilearn

In [52]:
y_train

Unnamed: 0,Friendship,Romance,Drama,Adventure,Family,Hurt-Comfort,Suspense,Fantasy,Sci-Fi,Humor,Angst,Supernatural,Tragedy,Spiritual,Poetry,Parody,Mystery,Horror,Crime,Western
5556194,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
219063,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6107821,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1454461,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
165253,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1606838,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2239331,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3186847,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6272778,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [53]:
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier

In [54]:
# Step 4: Train a multi-label classification model

# LightGBM base estimator; it is cloned and fitted once per genre by the wrapper below
lgb_model = LGBMClassifier(n_estimators=1000, max_depth=20, random_state=42, verbose=0)

#model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))
#model = MultiOutputClassifier(lgb_model)


# NOTE(review): OneVsRestClassifier fits one independent binary classifier per label
# column, so it does not model correlations between genres.
model = OneVsRestClassifier(lgb_model)

#model.fit(X_resampled, y_resampled)
model.fit(X_train_tfidf, y_train)



In [55]:
from sklearn.metrics import hamming_loss, classification_report

In [56]:
# Step 5: Evaluate the model
#y_pred = model.predict(X_test_tfidf)

# Predict per-genre scores (one column per target column of y)
y_prob = model.predict_proba(X_test_tfidf)

# A genre counts as predicted when its score reaches the threshold
threshold = 0.5
y_pred = (y_prob >= threshold).astype(int)


# Step 6: Classification report
print("Hamming Loss:", hamming_loss(y_test, y_pred))
print("Classification Report:")
# zero_division=0 reports 0 for metrics that are undefined (e.g. a sample or label with
# no predicted positives). That is the value the default "warn" setting already uses,
# but without emitting an UndefinedMetricWarning into the cell output.
print(classification_report(y_test, y_pred, target_names=y.columns, zero_division=0))

Hamming Loss: 0.06662438244933232
Classification Report:
              precision    recall  f1-score   support

  Friendship       0.62      0.04      0.07     83008
     Romance       0.76      0.76      0.76    508224
       Drama       0.58      0.02      0.03    166975
   Adventure       0.68      0.33      0.44    121646
      Family       0.63      0.13      0.21     58016
Hurt-Comfort       0.60      0.06      0.10    100653
    Suspense       0.44      0.00      0.00     18708
     Fantasy       0.48      0.02      0.03     25656
      Sci-Fi       0.54      0.10      0.17     15150
       Humor       0.73      0.29      0.42    198557
       Angst       0.65      0.11      0.18    126917
Supernatural       0.48      0.04      0.08     25582
     Tragedy       0.51      0.04      0.07     36697
   Spiritual       0.27      0.02      0.03      4560
      Poetry       0.72      0.50      0.59     13646
      Parody       0.68      0.18      0.28     16991
     Mystery       0.54 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [57]:
from sklearn.preprocessing import MultiLabelBinarizer

In [58]:
# Free-text examples to classify with the trained model
custom_text = [
    "A Love Divided Two people from rival families fall in love, but their relationship is tested by betrayal, secrets, and the expectations of their families.",
    "Once upon a time there was a boy called harry potter who found out he has a love life",
]  # ["Harry the wizard, goes on funny"]

# Apply the same preprocessing as the training data: clean first, then drop stop words
custom_filtered = [remove_stopwords(clean_text(raw_text)) for raw_text in custom_text]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
custom_text_tfidf = vectorizer.transform(custom_filtered)

# One row of 0/1 genre flags per input text
custom_pred = model.predict(custom_text_tfidf)

#custom_pred = (custom_pred >= 0.3).astype(int)

# Show each input together with the names of the genres flagged as 1
for position, (text, genre_flags) in enumerate(zip(custom_text, custom_pred), start=1):
    predicted_genres = [
        genre_name
        for genre_name, flag in zip(y.columns, genre_flags)
        if flag == 1
    ]
    print(f"Custom Text {position}: {text}")
    print("Predicted Genres:", predicted_genres)
    print()

Custom Text 1: A Love Divided Two people from rival families fall in love, but their relationship is tested by betrayal, secrets, and the expectations of their families.
Predicted Genres: ['Romance', 'Drama']

Custom Text 2: Once upon a time there was a boy called harry potter who found out he has a love life
Predicted Genres: ['Romance']



In [59]:
from joblib import load
from joblib import dump

In [60]:
# Save the fitted OneVsRestClassifier (with its LightGBM estimators) to disk
dump(model, 'LightGBM_good_3.joblib')

['LightGBM_good_3.joblib']

In [61]:
#model = load('LightGBM_good_1.joblib')

In [62]:
import joblib

# Save the vectorizer
# NOTE(review): this writes the same 'tfidf_vectorizer.pkl' as the joblib.dump right after
# the TF-IDF step; the vectorizer is not refitted in between, so this looks redundant.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']