### Logistic Regression

In [11]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
nlp = spacy.load('en_core_web_sm')

In [62]:
# Token lists available for filtering cleaned text.
# Digits are stored as one-character strings: the text pipeline works on
# string tokens, so integer entries (0..9) could never compare equal to one.
# NOTE(review): `removed` is not referenced by clean_text below; confirm
# whether any other cell still depends on it.
stopwords_list = stopwords.words('english')
punctuation_list = list(string.punctuation)
numeric_list = list(string.digits)
removed = stopwords_list + punctuation_list + numeric_list
def clean_text(text):
    """Normalise a raw description into a space-separated string of lemmas.

    Steps, in order: lowercase the text; expand common English contractions;
    replace non-word characters and digits with spaces; tokenise with NLTK's
    ``word_tokenize``; lemmatise with the spaCy pipeline ``nlp``; drop every
    lemma that appears in ``stopwords_list``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text as one string, tokens joined by single spaces.
    """
    # Ordered (pattern, replacement) pairs. Order matters because the
    # patterns overlap (e.g. "can't" must be handled before the generic "n't").
    # All patterns are raw strings so that \W, \s and \d reach the regex
    # engine without triggering invalid-escape-sequence warnings.
    substitutions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # NOTE(review): this rule can never match, because the "'s" rule above
        # has already replaced every "'s". It is kept in its original position
        # so the output stays unchanged; confirm against the preprocessing
        # used at training time before reordering.
        (r"\'scuse", " excuse "),
        (r"\W", " "),
        (r"\s+", " "),
        (r"\d+", " "),
    )

    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.strip(' ')

    # Round-trip through the NLTK tokenizer so spaCy receives tokens
    # separated by exactly one space.
    text = ' '.join(word_tokenize(text))

    # Set membership is O(1) per lemma, versus a linear scan of the list.
    stop_words = set(stopwords_list)
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

In [7]:
# Lookup table: numeric label returned by the logistic regression model -> genre name
label_mapping_logreg = {
    1: 'Romance',
    2: 'Horror',
    3: 'Comedy',
    4: 'Action',
}

In [65]:
import pickle, os
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the model from the pickle file.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# model files that come from a trusted source.
model_file_path = os.path.join('Model', 'LogisticRegression.pickle')

with open(model_file_path, 'rb') as file:
    model = pickle.load(file)

# Load the vectorizer used during training (same trust caveat as above)
vectorizer_file_path = os.path.join('Model', 'vectorizer.pickle')
with open(vectorizer_file_path, 'rb') as file:
    vectorizer = pickle.load(file)

text = input("Type your movie description here (Logistic Regression model).")

X_new = clean_text(text)

if not X_new:
    # Cleaning removed everything (blank input, or nothing but stop words,
    # digits and punctuation), so there is nothing meaningful to classify.
    predicted_label = None
    predicted_category = None
    print("No usable words found in the description; nothing to classify.")
else:
    # The vectorizer expects an iterable of documents, hence the one-item list
    X_new_transformed = vectorizer.transform([X_new])

    # Make predictions (single sample, so take the first and only result)
    predicted_label = model.predict(X_new_transformed)[0]

    # Fall back to a visible message instead of silently printing None when
    # the model returns a label that is missing from the mapping.
    predicted_category = label_mapping_logreg.get(
        predicted_label, f"Unknown label ({predicted_label!r})"
    )

    # Print the predicted category
    print("Predicted category:", predicted_category)


Predicted category: Action


### BERT

In [43]:
# Lookup table: class index chosen from the BERT model's output -> genre name
label_mapping_bert = {
    0: 'Romance',
    1: 'Horror',
    2: 'Comedy',
    3: 'Action',
}

In [52]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer

# Tokenizer: the 'bert-base-uncased' vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Classifier: sequence-classification model restored from the local
# 'Model/BERTmodel' checkpoint directory
modelBERT = TFBertForSequenceClassification.from_pretrained('Model/BERTmodel')

Some layers from the model checkpoint at Model/BERTmodel were not used when initializing TFBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at Model/BERTmodel.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [66]:
text = input("Type your movie description here (BERT model).")

textBERT = clean_text(text)

if not textBERT:
    # Cleaning removed everything (blank input, or nothing but stop words,
    # digits and punctuation), so there is nothing meaningful to classify.
    predicted_class = None
    predicted_category = None
    print("No usable words found in the description; nothing to classify.")
else:
    # Tokenize input text: special tokens added, padded/truncated to 128
    # tokens, returned as TensorFlow tensors
    tokens = tokenizer.encode_plus(
        textBERT,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

    # Make predictions; the first element of the model output is used as the logits
    logits = modelBERT(tokens.input_ids, attention_mask=tokens.attention_mask)[0]

    # Convert logits to probabilities
    probabilities = tf.nn.softmax(logits, axis=-1)

    # Get predicted class (single sample, so take the first and only entry)
    predicted_class = tf.argmax(probabilities, axis=1).numpy()[0]

    # Fall back to a visible message instead of silently printing None when
    # the predicted index is missing from the mapping.
    predicted_category = label_mapping_bert.get(
        predicted_class, f"Unknown label ({predicted_class!r})"
    )

    print("Predicted class:", predicted_category)


Predicted class: Romance
