# Importing libraries and extracting data

In [246]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score
from sklearn.svm import SVC

books_df = pd.read_csv('../data/processed/Amazon_Books_Processed.csv', index_col=0)
books_df.dropna(inplace=True)

# Preparing trainnig and testing data

In [247]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X = vectorizer.fit_transform(books_df['Title'])
Y = books_df['Main Genre'] + ', ' + books_df['Sub Genre']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Trainning and testing SVC model

In [248]:
svc_model = SVC(kernel='rbf', C=1, gamma=1)  
svc_model.fit(X_train, y_train)

accuracy = svc_model.score(X_test, y_test)
print(f"Model accuracy: {accuracy}")

svc_train_pred = svc_model.predict(X_train)
svc_test_pred = svc_model.predict(X_test)

f1 = f1_score(y_test, svc_test_pred, average='weighted')
recall = recall_score(y_test, svc_test_pred, average='weighted')

print(f"F1 Score: {f1}")
print(f"Recall: {recall}")

Model accuracy: 0.22778143515470706
F1 Score: 0.24180629537588444
Recall: 0.22778143515470706


# Testing model with input text

In [250]:

title = "Attack on Titan Vol. 3"
vectorized_title = vectorizer.transform([title])

predicted_genre = svc_model.predict(vectorized_title)

print(f"The predicted genre is: {predicted_genre[0]}")


The predicted genre is: Comics & Mangas, Mangas


# Save trained model and vectorizer

In [251]:
import joblib

joblib.dump(svc_model, '../models/svm_model.pkl')
joblib.dump(vectorizer, '../models/vectorizer.pkl')

['../models/vectorizer.pkl']