<a href="https://colab.research.google.com/github/opsabarsec/NLP--film-genres-from-synopsis/blob/main/radix_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automatic assignment of genres from movie synopsis using supervised machine learning

## 1. Import libraries and load data

In [165]:
#packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# NLP libraries

from textblob import TextBlob, Word
import nltk
import re
import csv
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Deep learning libraries

from keras.models import Model
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, Input, LSTM, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [166]:
# load data
train= pd.read_csv('./data/train.csv')
test=pd.read_csv('./data/test.csv')

## 2. DATA PREPARATION 

In [167]:
nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [168]:
# function for text cleaning 
def preprocess_text(text): #funsi yang berguna untuk  melakukan cleaning terhadap dataset
    text = text.lower() # lowercase
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'\n", " ", text) #line breaks
    #text = re.sub(r"\'\xa0", " ", text) # xa0 Unicode representing spaces
    #text = re.sub('\s+', ' ', text) # one or more whitespace characters
    text = text.strip(' ') # spaces
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    #lemmatize and remove stopwords
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    text = ' '.join(no_stopword_text) 
        
    return text

train['clean_plot'] = train['synopsis'].apply(lambda x: preprocess_text(x))
test['clean_plot'] = test['synopsis'].apply(lambda x: preprocess_text(x))

In [169]:
def lemma(text): # Lemmatization of cleaned body (avoid redundancy and improve the accuracy of analysis.)
        sent = TextBlob(text)
        tag_dict = {"J": 'a', 
                    "N": 'n', 
                    "V": 'v', 
                    "R": 'r'}
        words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]    
        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
        seperator=' '
        lemma = seperator.join(lemmatized_list) 
        return lemma

In [170]:
train['lemma'] = train['clean_plot'].apply(lambda x: lemma(x))
test['lemma'] = test['clean_plot'].apply(lambda x: lemma(x))

## 3. Variables preparation 

In [171]:

X_train = train['lemma']
X_test = test['lemma']    

### 3.1 Target variable one hot encoding

In [172]:
#apply the onehot transformation for the genres vector
y_train = train['genres']
one_hot = MultiLabelBinarizer() # encoder for the  tags 
y_onehot = one_hot.fit_transform(y_train.str.split(' ')) 
y_bin = pd.DataFrame(y_onehot, columns=one_hot.classes_ ) # transform it to Pandas object

In [173]:
# tokenize
max_features = 5000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1 # Tambahan vocab_size

In [174]:
#fix max comment lenght 
maxlen = 200 # Diubah dari 100 jadi 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen, padding='post') # Ditambah padding post
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen, padding='post') # Ditambah padding post

## 4.The Model

In [175]:
#initialize parameters
inp = Input(shape=(maxlen, )) #maxlen defined earlier
embed_size = 128

In [176]:
# Full dirubah
# Neural network backbone
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=maxlen)) 
model.add(LSTM(64, return_sequences=True,name='lstm_layer'))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.1))
model.add(Dense(50, activation="relu"))
model.add(Dense(len(y_bin.columns), activation="softmax"))

In [177]:
# build the model

model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [178]:
# train the model
batch_size = 32 # Diubah dari 16 ke 32
epochs = 10 # Diubah dari 3 ke 10
model.fit(X_t,y_onehot, batch_size=batch_size, epochs=epochs, validation_split=0.1)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1eb92ff1460>

## 5.The prediction

In [230]:
filename = f'model-{epochs}-{batch_size}.h5'
model.save(filename)

In [251]:
from keras.models import load_model
model = load_model(filename)
y_pred = model.predict(X_te, batch_size=batch_size, verbose=1)
print(y_pred.shape)

(5250, 19)


In [252]:
df_probs_all = pd.DataFrame(y_pred,columns=y_bin.columns)
df_probs_all.head()

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.146796,0.092593,0.031915,0.004001,0.032126,0.012677,0.019039,0.111241,0.037293,0.0003587175,0.046427,0.007879,0.002079,0.033372,0.009807,0.284483,0.121613,0.00503,0.001269605
1,0.116461,0.017779,0.00662,0.000689,0.026571,0.035989,0.069362,0.093029,0.006255,0.0001405759,0.034881,0.003609,0.000981,0.037603,0.002486,0.392034,0.154,0.001432,7.904846e-05
2,4e-06,6e-06,2.1e-05,1e-06,2.1e-05,9e-06,0.999741,6.9e-05,2e-06,1.168255e-10,4e-06,8e-06,8e-05,2e-06,4e-06,2e-06,1e-06,2.5e-05,1.533174e-08
3,0.012232,0.002466,0.001057,0.00021,0.004618,0.000467,7.9e-05,0.002055,0.004178,2.10685e-06,0.883959,0.000142,1.1e-05,0.001706,0.000142,0.058418,0.028169,2.4e-05,6.521006e-05
4,0.016626,0.002418,0.000841,0.000345,0.035209,0.299958,0.000834,0.255778,0.000966,0.01421195,0.01331,9.6e-05,0.000384,0.080258,0.00948,0.006386,0.261924,0.000172,0.0008007935


In [195]:
def top_5_predictions(df):
    N = 5
    cols = df.columns[:-1].tolist()
    a = df[cols].to_numpy().argsort()[:, :-N-1:-1]
    c = np.array(cols)[a]
    d = df[cols].to_numpy()[np.arange(a.shape[0])[:, None], a]
    df1 = pd.DataFrame(c).rename(columns=lambda x : f'max_{x+1}_col')

    predicted_genres = df1["max_1_col"] + ', ' + df1["max_2_col"]+ ', ' +df1["max_3_col"]+ ', ' + df1["max_4_col"]+ ', '+df1["max_5_col"]
    return predicted_genres

In [183]:
pred_gen = top_5_predictions(df_probs_all)

In [184]:
submission = pd.DataFrame(data= {'movie_id':test.movie_id,'predicted_genres':pred_gen})

In [185]:
submission.head()

Unnamed: 0,movie_id,predicted_genres
0,10827,"Drama, Comedy, Thriller, Crime, War"
1,51768,"Sci-Fi, Action, Animation, Thriller, Mystery"
2,36064,"Documentary, Musical, Drama, IMAX, Comedy"
3,33763,"Horror, Fantasy, Thriller, Action, Drama"
4,1146,"Crime, Thriller, Drama, Mystery, Horror"


In [186]:
submission.to_csv('submission.csv',index=False)

In [253]:
new_synopsis_input = [input("Input the synopsis")]
synopsis = new_synopsis_input
new_synopsis_input = tokenizer.texts_to_sequences(new_synopsis_input)
new_synopsis_input = pad_sequences(new_synopsis_input, padding='post', maxlen=maxlen)
predict = model.predict(new_synopsis_input, batch_size=batch_size, verbose=1)
print(predict.shape)

result = pd.DataFrame(predict,columns=y_bin.columns)
print("The synopsis you entered is :\n", synopsis)
print("Genres : ", top_5_predictions(result))

(1, 19)
The synopsis you entered is :
 ['An Indian soldier is assigned to eliminate his former mentor and he must keep his wits about him if he is to be successful in his mission; when the two men collide, it results in a barrage of battles and bullets.']
Genres :  0    Action, Drama, Adventure, Thriller, Crime
dtype: object
