# 1. Imports

In [22]:
pip install googletrans==3.1.0a0



In [23]:
pip install tensorflow



In [20]:
import re
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from googletrans import Translator
from collections import Counter


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# 2. Data set load

In [16]:
# read the training file and extract genre and plot

input_file = '/content/sample_data/train.txt'

genres = []
plots = []

with open(input_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        # Split the line by tabs
        parts = line.split('\t')

        # Extract genre and plot
        genres.append(parts[2])
        plots.append(parts[4])

# 3. Train test split and Data augmentation

In [17]:
X_train, X_test, Z_train, Z_test = train_test_split(plots, genres, test_size=0.1, random_state=33)

translator = Translator()
aug_X_train = []
aug_Z_train = []
drama_counter = 0
counter = 0
for i in range(len(X_train)):
        counter += 1
        if counter % 1000 == 0:
          print (counter)
        # Split the line by tabs

        if Z_train[i] == 'sci-fi':
            # Perform back translation
            translated = translator.translate(X_train[i], src='en', dest='es').text
            back_translated = translator.translate(translated, src='es', dest='en').text

            # Perform synonym replacement
            translated2 = translator.translate(X_train[i], src='en', dest='fr').text
            back_translated2 = translator.translate(translated2, src='fr', dest='en').text

            # Append plots
            aug_X_train.append(X_train[i])
            aug_Z_train.append(Z_train[i])
            aug_X_train.append(back_translated)
            aug_Z_train.append(Z_train[i])
            aug_X_train.append(back_translated2)
            aug_Z_train.append(Z_train[i])


        elif Z_train[i] in {'crime', 'animation'}:
            # Perform back translation
            translated = translator.translate(X_train[i], src='en', dest='es').text
            back_translated = translator.translate(translated, src='es', dest='en').text

            # Append both plots to the specified file
            aug_X_train.append(X_train[i])
            aug_Z_train.append(Z_train[i])
            aug_X_train.append(back_translated)
            aug_Z_train.append(Z_train[i])


        elif (Z_train[i] == 'drama'):
            if (drama_counter < 1000):
                drama_counter += 1
                aug_X_train.append(X_train[i])
                aug_Z_train.append(Z_train[i])

        else:
            aug_X_train.append(X_train[i])
            aug_Z_train.append(Z_train[i])

1000
2000
3000
4000
5000
6000
7000


# 4. Pre processing

In [18]:
#porter_stemmer=PorterStemmer()
lemmatizer = WordNetLemmatizer()

#Function to get WordNet POS tag
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if no tag match

#Function to lemmatize sentence with POS tagging
def processSentence(s):
    words = re.split("\s+", s.lower())  # Lowercasing and tokenizing
    pos_tags = nltk.pos_tag(words)  # Get POS tags for each word
    lemmed_words = [lemmatizer.lemmatize(w, get_wordnet_pos(pos)) for w, pos in pos_tags]
    return ' '.join(lemmed_words)

def filter_stopwords(tokens):
    aux = [word for word in tokens if word.isalpha() and word not in stop_words] # watch out for isalpha()
    return ' '.join(aux)

#lemmatization and Lowercasing
lemmed_train_plots = [processSentence(plot) for plot in aug_X_train]
lemmed_test_plots = [processSentence(plot) for plot in X_test]
lemmed_og_train_plots = [processSentence(plot) for plot in X_train]


#Tokenizing
tokenized_train_plots = [nltk.wordpunct_tokenize(sp) for sp in lemmed_train_plots]
tokenized_test_plots = [nltk.wordpunct_tokenize(sp) for sp in lemmed_test_plots]
tokenized_og_train_plots = [nltk.wordpunct_tokenize(sp) for sp in lemmed_og_train_plots]

stop_words = set(stopwords.words('english'))

filtered_train_plots = [filter_stopwords(tp) for tp in tokenized_train_plots]
filtered_test_plots = [filter_stopwords(tp) for tp in tokenized_test_plots]
filtered_og_train_plots = [filter_stopwords(tp) for tp in tokenized_og_train_plots]



# 5. Tokenization

In [39]:

# Initialize the Tokenizer
tokenizer_og = Tokenizer(num_words=5000)  # Limit to top 5000 most frequent words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(filtered_train_plots)
tokenizer.fit_on_texts(filtered_og_train_plots)


# Convert text to sequences of integers
X = tokenizer.texts_to_sequences(filtered_train_plots)
X_og = tokenizer.texts_to_sequences(filtered_og_train_plots)

# 6. Padding Sequences

In [40]:
from keras.preprocessing.sequence import pad_sequences

# Pad the sequences (assuming max length of 100 words per plot)
X = pad_sequences(X, maxlen=100)
X_og = pad_sequences(X_og, maxlen=100)

# 7. Encoding the Labels

In [41]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert genres to numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(aug_Z_train)

# Optionally convert to one-hot encoded format
y = to_categorical(y)


# 8. Building LSTM

In [42]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model_og = Sequential()

# Embedding layer (input_dim: vocabulary size, output_dim: embedding dimensions)
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model_og.add(Embedding(input_dim=5000, output_dim=128, input_length=100))

# LSTM layer (return_sequences=False for classification)
model.add(LSTM(units=128))
model_og.add(LSTM(units=128))


# Optional Dropout layer to prevent overfitting
model.add(Dropout(0.5))
model_og.add(Dropout(0.5))

# Fully connected layer for classification (adjust units for number of classes)
model.add(Dense(units=9, activation='softmax'))  # Assuming 8 genres
model_og.add(Dense(units=9, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_og.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])





# 9. Train the model

In [43]:
# Train the model
model.fit(X, y, epochs=3, batch_size=32, validation_split=0.2)
model_og.fit(X_og, y, epochs=3, batch_size=32, validation_split=0.2)


Epoch 1/3
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 183ms/step - accuracy: 0.1910 - loss: 2.0678 - val_accuracy: 0.2192 - val_loss: 1.8716
Epoch 2/3
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 181ms/step - accuracy: 0.4084 - loss: 1.4841 - val_accuracy: 0.3969 - val_loss: 1.5632
Epoch 3/3
[1m202/202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 180ms/step - accuracy: 0.6687 - loss: 0.9126 - val_accuracy: 0.5183 - val_loss: 1.4423
Epoch 1/3
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 176ms/step - accuracy: 0.1762 - loss: 2.1833 - val_accuracy: 0.0000e+00 - val_loss: 2.2348
Epoch 2/3
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 181ms/step - accuracy: 0.1897 - loss: 2.1312 - val_accuracy: 0.0656 - val_loss: 2.2659
Epoch 3/3
[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 179ms/step - accuracy: 0.4076 - loss: 1.7946 - val_accuracy: 0.0981 - val_loss: 2.4623
[1m26/26[0

NameError: name 'labels' is not defined

In [46]:
# Pre processed
test_tokenized = tokenizer.texts_to_sequences(filtered_test_plots)
test_padded = pad_sequences(test_tokenized, maxlen=100)

labels = np.unique(genres).tolist()

prediction = model.predict(test_padded)
genre_index = prediction.argmax(axis=-1)

predicted_genre = label_encoder.inverse_transform(genre_index)

print("==== Pre processed ====")

print(classification_report(y_pred=predicted_genre, y_true = Z_test, labels = labels, zero_division=1.))

# Original
test_tokenized_og = tokenizer.texts_to_sequences(filtered_test_plots)
test_padded_og = pad_sequences(test_tokenized_og, maxlen=100)


prediction_og = model_og.predict(test_padded)
genre_index_og = prediction_og.argmax(axis=-1)

predicted_genre_og = label_encoder.inverse_transform(genre_index_og)

print("==== Original ====")

print(classification_report(y_pred=predicted_genre_og, y_true = Z_test, labels = labels, zero_division=1.))


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step
==== Pre processed ====
              precision    recall  f1-score   support

      action       0.45      0.45      0.45       113
   animation       0.73      0.53      0.62        60
      comedy       0.40      0.32      0.36       119
       crime       0.35      0.42      0.38        57
       drama       0.40      0.42      0.41       165
      horror       0.53      0.74      0.62       101
     romance       0.48      0.44      0.46        95
      sci-fi       0.56      0.26      0.36        19
     western       0.78      0.78      0.78        76

    accuracy                           0.49       805
   macro avg       0.52      0.49      0.49       805
weighted avg       0.49      0.49      0.49       805

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 92ms/step
==== Original ====
              precision    recall  f1-score   support

      action       0.08      0.06      0.07      