In [2]:
!pip install simpletransformers



In [3]:
from transformers import logging

logging.set_verbosity_warning()

## Dataset cleaning

In [4]:
import pandas as pd

df = pd.read_csv('../input/movies-yts/movie_plots_yts.csv',index_col=0)
df.dropna(subset=['Plot'], inplace=True)
df.drop_duplicates(subset=['Title','Plot'], inplace=True)

In [5]:
df.head()

Unnamed: 0,Title,Plot,Genres
0,Who is Amos Otis?,"After assassinating the President, Amos Otis p...",Drama
1,Wheel of Time,Wheel of Time is Werner Herzog's photographed ...,Documentary
3,'B' Girl Rhapsody,Burlesque beauties performing their signature ...,Drama
5,The Brass Bottle,After being released from his bottle by Harold...,"Comedy, Fantasy"
6,The Morning After,The Morning After is a feature film that consi...,"Comedy, Drama"


In [6]:
def prep_genre(genre):
    
    dict_genres = {
        "Musical":"Music",
        "Crime":"Thriller",
        "Film-Noir":"Thriller",
        "Biography":"Documentary",
        "Sci-Fi":"Sci-Fi & Fantasy",
        "Fantasy":"Sci-Fi & Fantasy",
        "Action":"Action & Adventure",
        "Adventure":"Action & Adventure"
    }
        
    genre = genre.replace(" ","")
    
    if len(genre.split(",")) > 0:
        genres = genre.split(",")
        genres = ",".join(list(set([ dict_genres[genre] if genre in dict_genres.keys() else genre for genre in genres ])))
        
    return genres

In [7]:
df['Genres'] = df['Genres'].apply(prep_genre)
df['Genres']

0                                                    Drama
1                                              Documentary
3                                                    Drama
5                                  Comedy,Sci-Fi & Fantasy
6                                             Drama,Comedy
                               ...                        
38044                     Horror,Action & Adventure,Comedy
38045              Drama,Action & Adventure,Romance,Comedy
38046              Drama,Action & Adventure,Romance,Comedy
38047    Thriller,Sci-Fi & Fantasy,Horror,Action & Adve...
38048                            Action & Adventure,Comedy
Name: Genres, Length: 33808, dtype: object

## Preprocessing

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

X = df['Plot'].values
y = df['Genres'].apply(lambda x : x.split(','))

mlb = MultiLabelBinarizer()
yt = mlb.fit_transform(y)
# Getting a sense of how the tags data looks like
print(yt[23324])
print(mlb.inverse_transform(yt[23324].reshape(1,-1)))
print(mlb.classes_)

[1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
[('Action & Adventure', 'Comedy')]
['Action & Adventure' 'Animation' 'Comedy' 'Documentary' 'Drama' 'Family'
 'History' 'Horror' 'Music' 'Romance' 'Sci-Fi & Fantasy' 'Sport'
 'Thriller' 'War' 'Western']


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, yt, test_size=0.25, shuffle=True, random_state=42)

In [10]:
(train_inputs, validation_inputs,
 train_labels, validation_labels) = train_test_split(X_train, y_train,
                                                     random_state=42,
                                                     test_size=0.20)
train_df = pd.DataFrame({'text':train_inputs,'label':train_labels.tolist()})
validation_df = pd.DataFrame({'text':validation_inputs,'label':validation_labels.tolist()})
train_df,validation_df

(                                                    text  \
 0      Pre-teen Jeliza-Rose's parents are hopeless dr...   
 1      A determined prosecutor becomes consumed with ...   
 2      Nineteen-year old Leon returns home to take ca...   
 3      The story of high school soccer prodigy Sara D...   
 4      When Straight-A college student Jeff Chang's t...   
 ...                                                  ...   
 20279  The number One Cylon brothers Cavil organize a...   
 20280  Kevin is throwing a celebration party, but gue...   
 20281  While most of the residents of a small Argenti...   
 20282  A pair of weed-loving, free lance video journa...   
 20283  Straight Up: Helicopters in Action will take a...   
 
                                                label  
 0      [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]  
 1      [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]  
 2      [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
 3      [1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 

## Training

In [11]:
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

# Optional model configuration
model_args = MultiLabelClassificationArgs()
model_args.num_train_epochs = 4
model_args.overwrite_output_dir = True

n_labels = len(set([g for row in df.Genres for g in row.split(",")]))
n_labels

15

In [12]:
# Create a ClassificationModel
model = MultiLabelClassificationModel("bert", "bert-base-uncased",args=model_args,num_labels=n_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiLabelSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiLabelSequenceClassification were not 

In [13]:
model.train_model(train_df,eval_df=validation_df,model_args=model_args)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/20284 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2536 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2536 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 2 of 4:   0%|          | 0/2536 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2536 [00:00<?, ?it/s]

(10144, 0.20033750665662017)

## Evaluation

In [14]:
test_df = pd.DataFrame({'text':X_test,'label':y_test.tolist()})

In [17]:
#results = model.evaluate(test_df,'test')
result, model_outputs, wrong_predictions = model.eval_model(
    test_df
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/8452 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1057 [00:00<?, ?it/s]

In [29]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

y_pred = model_outputs.round()
print(y_pred[0])
print("Evaluation results:",result)
print("F1_score:",f1_score(y_test,y_pred, average="micro"))

[1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
Evaluation results: {'LRAP': 0.8383974248418663, 'eval_loss': 0.2676147881793096}
F1_score: 0.7264124923089978


## Predictions check

In [34]:
df_pred = pd.read_csv("../input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt",index_col=0, sep=':::', names=['Title','Plot'], header=None)
df_pred.head()

Unnamed: 0,Title,Plot
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),Before he was known internationally as a mart...


In [108]:
film_plots = df_pred['Plot'].values[-7:].tolist()
preds, outputs = model.predict(film_plots)

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [109]:
for plot, genre in zip(film_plots, preds):
    genre = mlb.inverse_transform(np.array([genre]))
    print("Film plot --> {}\nGenre --> {}\n".format(plot, genre))

Film plot -->  Jump right in to drawing animals along with Chuck McLachlan. Designed to get you drawing now, no time is wasted on proportion or perspective; just straight to making some of your favorite animals come to life. Learn to draw chickens, birds, lions, hippos, giraffes, moose, and squirrels. Directly from the wild to your television. Journey with Chuck today!
Genre --> [('Action & Adventure', 'Documentary')]

Film plot -->  Slender Existence is the first-ever personal film about surviving anorexia nervosa. Filmmaker Laura Murray interweaves her own fractured recollections of starvation with those of her family and closest high school friend - all of whom rally to her support. This intimate and touching self-portrait shows a family dealing with a child's eating disorder - and that child growing into a healthy adult.
Genre --> [('Documentary',)]

Film plot -->  Covering multiple genres, Tales of Light & Dark is an anthology web series by Light & Dark Productions that explores t