In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Connect to the SQLite database (since it's in the same folder, you can just use the filename)
db_path = 'metadata-110mil.sqlite'
conn = sqlite3.connect(db_path)

# Check available tables in the database (optional)
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print("Available tables:", tables)

# Specify the table you want to load
table_name = 'metadata'  # Replace with your actual table name

# Load the table into a DataFrame
df = pd.read_sql_query(f"SELECT * FROM {table_name};", conn)

# Close the database connection
conn.close()

# Display the first few rows of the DataFrame
df.head()


Available tables:        name
0  metadata


Unnamed: 0,Path,Title,Author,Category,Genre,Language,Status,Published,Updated,Packaged,Rating,Chapters,Words,Publisher,Story URL,Author URL,Summary,word_count,chapter_count,story_id
0,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,The Distant Strains of Triumph,SpartanDelta2,"Halo, Mass Effect","Drama, Sci-Fi",English,Completed,2013-07-15,2013-07-15,2013-10-28 09:08:08,M,1,1180,www.fanfiction.net,http://www.fanfiction.net/s/9493934/1/,http://www.fanfiction.net/u/2727455/SpartanDelta2,"A project for my class, a challenge from my te...",1180,1,9493934
1,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Mass Effect: Beyond,Cunningham-Hughes,"Halo, Mass Effect","Adventure, Sci-Fi",English,Completed,2012-10-01,2012-10-01,2013-07-21 09:44:53,T,1,9971,www.fanfiction.net,http://www.fanfiction.net/s/8573953/1/,http://www.fanfiction.net/u/4262625/Cunningham...,A hardcore Mass Effect fan's retelling of the ...,9971,1,8573953
2,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Chaos Infinitium,Sysero of Cain,"Halo, Mass Effect","Adventure, Sci-Fi",English,Completed,2011-03-11,2011-11-17,2014-11-23 22:48:57,T,5,19447,www.fanfiction.net,https://www.fanfiction.net/s/6816070/1/,https://www.fanfiction.net/u/2362265/Sysero-of...,First Contact never seems to go well. But thin...,19447,5,6816070
3,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Tools of Defiance,Magisking,"Halo, Mass Effect","Angst, Tragedy",English,Completed,2013-12-22,2013-12-22,2014-02-07 23:48:12,T,1,997,www.fanfiction.net,https://www.fanfiction.net/s/9946669/1/,https://www.fanfiction.net/u/5244687/Magisking,A One-shot that takes place in the Defiance un...,997,1,9946669
4,Fanfiction/Halo_ Mass Effect/Completed/Halo_ M...,Mass Effect New Origins V2,erttheking,"Halo, Mass Effect","Romance, Sci-Fi",English,Completed,2010-10-05,2012-06-09,2014-05-25 15:31:13,T,109,442687,www.fanfiction.net,https://www.fanfiction.net/s/6376514/1/,https://www.fanfiction.net/u/1835782/erttheking,"On the dawn of the 27th century, the UNSC disc...",442687,109,6376514


In [3]:
test = df.iloc[0]
print(test)

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Set the font to a universal one like Noto Sans
#rcParams['font.sans-serif'] = ['Noto Sans', 'DejaVu Sans', 'Arial Unicode MS']  # Ensure Noto Sans is installed
#rcParams['axes.unicode_minus'] = False  # Avoid issues with minus signs

# Load your data (assuming it's already in a DataFrame `df`)
# df = pd.read_csv('your_file.csv')  # Uncomment this if you're loading from a CSV file

# Split categories by commas and stack them to get a single column of all categories
#all_categories = df['Category'].str.split(',').explode()

# Strip any extra whitespace from each category (important if there are spaces after commas)
#all_categories = all_categories.str.strip()

# Count occurrences of each unique category
#category_counts = all_categories.value_counts()

# Filter categories to include only those with at least 500 occurrences
#filtered_category_counts = category_counts[category_counts >= 10000]

# Plot as a bar chart
#plt.figure(figsize=(20, 6))
#filtered_category_counts.plot(kind='bar', color='skyblue')
#plt.title('Distribution of Categories (at least 10000 entries)')
#plt.xlabel('Category')
#plt.ylabel('Count')
#plt.xticks(rotation=90, ha='right')
#plt.tight_layout()
#plt.show()


![category_distribution](category_distribution.png)

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

# Take a random sample of 100,000 rows (adjust the number as needed for performance)
#sample_df = df['word_count'].sample(100000, random_state=42)

#plt.figure(figsize=(10, 6))
#sns.histplot(sample_df, bins=50, kde=True)

#plt.xlabel("Word Count")
#plt.ylabel("Frequency")
#plt.title("Distribution of Word Count (Sampled)")

#plt.show()


![word distribution](word_distribution.png)

<h2> Cleaning and Formating Data: </h2>

In [9]:
df.info()

In [10]:
missing_all_values = df[df["Publisher"] == ""].value_counts()#.index.tolist()
len(missing_all_values) # TODO: Clean all that are missing. 79 missing is rows that dont hae any value beside path

79

In [11]:
rows_with_null = (df.isnull()).sum()
rows_with_null

Path             0
Title            0
Author           0
Category         0
Genre            0
Language         0
Status           0
Published        0
Updated          0
Packaged         0
Rating           0
Chapters         0
Words            0
Publisher        0
Story URL        0
Author URL       0
Summary          0
word_count       0
chapter_count    0
story_id         0
dtype: int64

In [12]:
rows_with_missing_values = (df.isnull() | (df == "")).any(axis=1).sum()
rows_with_missing_values

652658

In [13]:
rows_with_missing_values / len(df) * 100

9.734885904180304

In [14]:
all_missing = (df == "").sum()
all_missing

Path                  0
Title               142
Author               83
Category            120
Genre            652471
Language             79
Status               79
Published            81
Updated              81
Packaged             81
Rating               79
Chapters             79
Words                79
Publisher            79
Story URL            79
Author URL           79
Summary             210
word_count           79
chapter_count        79
story_id             79
dtype: int64

In [15]:
# Changing data type for chapter and word count into int type

df['word_count'] = pd.to_numeric(df['word_count'], errors='coerce').astype('Int64')
df['chapter_count'] = pd.to_numeric(df['chapter_count'], errors='coerce').astype('Int64')

In [16]:
#df_usable = df.copy(deep=True) 

# Remove redundant  and unimportant columns

df_usable = df.drop(columns=['Chapters', 'Words','Path','Story URL','Author URL'])

# remove rows where all values are missing (79 of those)

df_usable = df_usable[df_usable['word_count'] != '']

# All where summary, genre and category is missing

df_usable = df_usable[df_usable['Summary'] != '']
df_usable = df_usable[df_usable['Category'] != '']
df_usable = df_usable[df_usable['Genre'] != '']

# Removing extreme word count values  (100 < x < 2,000,000)

df_usable = df_usable[(df_usable['word_count'] > 100) & (df_usable['word_count'] < 2000000)]


In [17]:
df_usable = df_usable.drop(columns=['Packaged', 'Publisher'])

In [18]:
df_usable = df_usable.drop(columns=['Status', 'Published', 'Updated', 'Rating', 'story_id'])


In [19]:
df_usable=df_usable[(df_usable['Language'] == 'English')]

In [20]:
df_usable = df_usable.drop(columns=['Language'])

In [21]:
category_counts = df_usable["Category"].value_counts()

# Find categories with more than 1000 occurrences
categories_to_keep = category_counts[category_counts > 1000].index

# Filter the DataFrame
df_usable = df_usable[df_usable["Category"].isin(categories_to_keep)]

len(df_usable)

4599826

In [22]:
# Counting after cleaning all the missing
missing = (df_usable == "").sum()
missing

Title            27
Author            2
Category          0
Genre             0
Summary           0
word_count        0
chapter_count     0
dtype: Int64

In [23]:
df_usable.columns

Index(['Title', 'Author', 'Category', 'Genre', 'Summary', 'word_count',
       'chapter_count'],
      dtype='object')

In [24]:
len(df_usable)

4599826

In [25]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Ensure nltk resources are downloaded
import nltk
nltk.download('punkt')  # Tokenizer data
nltk.download('wordnet')  # Lemmatizer data
nltk.download('stopwords')  # Stopwords data
nltk.download('omw-1.4')  # WordNet data
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
custom_stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabet characters
    text = text.lower()  # Lowercase text
    tokens = word_tokenize(text)  # Tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in custom_stop_words]  # Lemmatize and remove stopwords
    return tokens

# Apply preprocessing to Title and Summary
df_usable['processed_title'] = df_usable['Title'].apply(preprocess_text)
df_usable['processed_summary'] = df_usable['Summary'].apply(preprocess_text)

# Combine title and summary
df_usable['combined_text'] = df_usable['processed_title'] + df_usable['processed_summary']

# Group by Category and Genre
grouped_data = df_usable.groupby(['Category', 'Genre'])

# Collect unique words for each group
unique_words_by_group = defaultdict(list)

for (category, genre), group in grouped_data:
    all_words = [word for text in group['combined_text'] for word in text]
    unique_words = set(all_words)  # Find unique words
    unique_words_by_group[(category, genre)] = unique_words

# Convert to DataFrame for easier analysis
unique_words_df = pd.DataFrame([
    {'Category': k[0], 'Genre': k[1], 'Unique_Words': list(v)} 
    for k, v in unique_words_by_group.items()
])

In [27]:
unique_words_df

Unnamed: 0,Category,Genre,Unique_Words
0,.hack/SIGN,Adventure,"[angel, follows, try, salvation, played, reeni..."
1,.hack/SIGN,"Adventure, Angst","[chap, admit, real, deadly, fall, second, abus..."
2,.hack/SIGN,"Adventure, Drama","[angel, question, follows, try, including, fin..."
3,.hack/SIGN,"Adventure, Family","[edited, finished, preinfection, real, centric..."
4,.hack/SIGN,"Adventure, Fantasy","[angel, try, warp, finished, series, alongside..."
...,...,...,...
69989,xxxHOLiC,Supernatural,"[stalk, question, ramshackle, wish, karma, wak..."
69990,xxxHOLiC,"Supernatural, Suspense","[decide, affect, watanuki, dragged, coulda, li..."
69991,xxxHOLiC,"Supernatural, Tragedy","[love, open, want, wish, happy, nayuki, witch,..."
69992,xxxHOLiC,Suspense,"[unusual, tendency, lead, line, little, realit..."


In [28]:
unique_words_df[unique_words_df["Unique_Words"].apply(len) == 0]

Unnamed: 0,Category,Genre,Unique_Words
36275,Marching Band,"Angst, Mystery",[]


In [29]:
df_usable.shape
# Specify the output file
output_file = "filtered_usable_df.csv"

# Save to CSV
df_usable.to_csv(output_file, sep=";", encoding="utf-8", index=False)  # Set index=False to avoid saving the index

print(f"DataFrame successfully written to {output_file}")

DataFrame successfully written to filtered_usable_df.csv


In [30]:
len(df_usable)

4599826

In [31]:
output_file = "unique_words_df.csv"

# Save to CSV
unique_words_df.to_csv(output_file, sep=";", encoding="utf-8", index=False)  # Set index=False to avoid saving the index

print(f"DataFrame successfully written to {output_file}")

DataFrame successfully written to unique_words_df.csv


In [33]:
len(df_usable)

4599826

<h2> Build a model </h2>

In [None]:
df_usable2 = pd.read_csv("filtered_usable_df.csv", sep=";")
print(df_usable2.shape)

In [None]:
df_usable["Category"].value_counts()

In [None]:
df_usable["word_count"].sum()

In [None]:
numerical_columns = df[['word_count', 'chapter_count']].copy()

# Convert these columns to numeric (if needed)
numerical_columns = numerical_columns.apply(pd.to_numeric, errors='coerce')

# Drop any rows with NaN values in numerical columns to avoid calculation issues
numerical_columns = numerical_columns.dropna()

# Calculate the correlation matrix
correlation_matrix = numerical_columns.corr()

# Plot the correlation matrix
plt.figure(figsize=(8, 6))
plt.matshow(correlation_matrix, cmap='coolwarm', fignum=1)
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.colorbar()
plt.title("Correlation Matrix", pad=20)
plt.show()

correlation_matrix