In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from joblib import dump, load
import pandas as pd
import spacy
import numpy as np

In [11]:
nlp = spacy.load("en_core_web_sm")

In [52]:
# Preprocessing function
# Define a function to process text by removing stopwords, punctuation, and lemmatizing
def preprocess(text):
    docs = nlp(text.lower())  # Convert to lowercase
    tokens = [
        token.lemma_ for token in docs 
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    return " ".join(tokens)

In [61]:
# Load dataset
df = pd.read_csv("assignment_1.4.csv")

# Fix capitalization & whitespace
df['genre'] = df['genre'].str.strip().str.lower()

# Apply encoding safely
df['genre_numerical'] = df['genre'].apply(lambda x: 0 if x == 'horror' else (1 if x == 'romance' else np.nan))

# Drop NaN rows if they exist
df = df.dropna(subset=['genre_numerical'])

# Convert genre_numerical to integer
df['genre_numerical'] = df['genre_numerical'].astype(int)

# Verify Encoding
print(df[['genre', 'genre_numerical']].value_counts())




genre    genre_numerical
horror   0                  672
romance  1                  672
Name: count, dtype: int64


In [62]:
# Apply preprocessing
df['preprocessed_description'] = df['description'].apply(preprocess)

# Print first five rows to verify preprocessing
print(df.head())

     genre                                        description  \
0   horror   When six friends fly off on a weekend getaway...   
1   horror   The story is about a young girl who was touch...   
2  romance   A young woman named Anna has always longed fo...   
3   horror   A London couple moves to a large country hous...   
4   horror   In a small college in North Carolina, only a ...   

   genre_numerical                           preprocessed_description  
0                0  friend fly weekend getaway suddenly plague eng...  
1                0  story young girl touch spirit cause death try ...  
2                1  young woman name anna long love fail relations...  
3                0  london couple move large country house fresh s...  
4                0  small college north carolina select student le...  


In [63]:
# Split data into train and test sets; condider X as `preprocessed_description` column and Y as `genre_numerical` column

# code goes here
X = df['preprocessed_description']
Y = df['genre_numerical']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [64]:
# Create TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
# create vectors of x train using fit_transform function.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# create vectors of x test using transform function as we have already fitted the data.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [65]:
# print length of vocabulary

# code goes 
print(len(tfidf_vectorizer.vocabulary_))

10274


In [66]:
# print size of y train
# code goes here
print(Y_train.size)

1075


In [67]:
# print size of X_train_tfidf
# code goes here
print(X_train_tfidf.shape)

(1075, 10274)


In [68]:
# Calculate mean vectors for each genre using X_train_tfidf
# Ensure Y_train is a NumPy array for proper indexing
Y_train = np.array(Y_train)

# Create boolean masks
horror_indices = Y_train == 0  # Horror is mapped to 0
romance_indices = Y_train == 1  # Romance is mapped to 1

# Convert sparse matrix to dense NumPy array before applying mean operation
horror_mean_vector = np.mean(X_train_tfidf[horror_indices].toarray(), axis=0)
romance_mean_vector = np.mean(X_train_tfidf[romance_indices].toarray(), axis=0)

In [69]:
# Train Naive Bayes classifier
clf = MultinomialNB()
# fit the data in classifier
clf.fit(X_train_tfidf, Y_train)

In [70]:
# Predict and evaluate the classifier using classification report
clf_predictions = clf.predict(X_test_tfidf)
# code goes here
print(classification_report(Y_test, clf_predictions))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       144
           1       0.93      0.91      0.92       125

    accuracy                           0.93       269
   macro avg       0.93      0.93      0.93       269
weighted avg       0.93      0.93      0.93       269



In [71]:
# Prediction function

# Prediction function (Fill this in)
def predict(user_input):
    # Preprocess the input text
    preprocessed_input = preprocess(user_input)
    # Convert the preprocessed input into TF-IDF vector
    input_vector = tfidf_vectorizer.transform([preprocessed_input])

    # Predict the genre using the trained model
    prediction = clf.predict(input_vector)

    # Return the genre
    return "horror" if prediction == 0 else "romance"

In [72]:
# Show two predictions
print(predict("A scary vampire was sucking blood"))
print(predict("A beautiful Love story"))

horror
romance


In [73]:
# Save the model, vectorizer, and mean vectors separately
dump(clf, "model.joblib")
# code goes here
dump(tfidf_vectorizer, "vectorizer.joblib")

dump(horror_mean_vector, "horror_mean_vector.joblib")
dump(romance_mean_vector, "romance_mean_vector.joblib")

['romance_mean_vector.joblib']

In [77]:
# Load the saved model, vectorizer

model = load('model.joblib')
vectorizer = load('vectorizer.joblib')

In [78]:
# Prediction function for saved model

def predict_from_loaded_model(user_input):
   # Preprocess the input text
   preprocessed_input = preprocess(user_input)

   # Convert the preprocessed input into TF-IDF vector
   input_vector = vectorizer.transform([preprocessed_input])

    # Predict the genre using the trained model
   prediction = model.predict(input_vector)

    # Return the genre
   return "horror" if prediction == 0 else "romance"

In [79]:
# Show two predictions

print(predict_from_loaded_model("A scary vampire was sucking blood"))

horror
