In [None]:
!pip install transformers

In [None]:
!pip install xformers

In [None]:
!pip install gradio

In [4]:
import ast

import torch
import pandas as pd
import numpy as np
import gradio as gr
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

# Labeling

In [5]:
# Load the raw dialogue table and discard every row that has a missing value.
df = pd.read_csv('/content/films_data_to_label.csv').dropna()
df.head(3)

Unnamed: 0,movieID,genders,lines,char_names,movie_title,genres,text_length
0,m0,"['m', 'f', 'm']","['Why?', 'Unsolved mystery. She used to be re...","['CAMERON', 'BIANCA', 'CAMERON']",10 things i hate about you,"['comedy', 'romance']",151
1,m0,"['f', 'm', 'f', 'm']","['do you listen to this crap?', 'What crap?', ...","['BIANCA', 'CAMERON', 'BIANCA', 'CAMERON']",10 things i hate about you,"['comedy', 'romance']",162
2,m0,"['f', 'f', 'f']","[""You're ruining my life' Because you won't b...","['BIANCA', 'KAT', 'BIANCA']",10 things i hate about you,"['comedy', 'romance']",190


In [6]:
# (rows, columns) of the table as it stands at this point.
df.shape

(9517, 7)

In [7]:
# Each cell holds a list written out as text. ast.literal_eval parses Python
# literals only, so unlike eval() it cannot execute code embedded in the file.
df['lines'] = df['lines'].apply(ast.literal_eval)

In [8]:
# Each cell holds a list written out as text. ast.literal_eval parses Python
# literals only, so unlike eval() it cannot execute code embedded in the file.
df['genres'] = df['genres'].apply(ast.literal_eval)

In [9]:
# Each cell holds a list written out as text. ast.literal_eval parses Python
# literals only, so unlike eval() it cannot execute code embedded in the file.
df['char_names'] = df['char_names'].apply(ast.literal_eval)

In [10]:
# Parse the textual list with ast.literal_eval (literals only, no code
# execution) instead of eval().
df['genders'] = df['genders'].apply(ast.literal_eval)
# Replace any gender list that contains the unknown marker '?' with NaN, which
# flags the whole dialogue as missing for a subsequent dropna().
df['genders'] = df['genders'].apply(lambda x: x if '?' not in x else np.nan)

In [11]:
# Remove the rows that now contain a NaN in any column.
df = df.dropna()

In [12]:
# Spell out the gender codes: 'm' / 'M' become 'Man', every other code 'Woman'.
df['genders'] = df['genders'].apply(
    lambda codes: ['Man' if code in ('m', 'M') else 'Woman' for code in codes]
)

In [13]:
# Draw a reproducible (seeded) sample of 300 dialogues and renumber its rows
# from 0, discarding the original index.
df_to_label = df.sample(n=300, random_state=42).reset_index(drop=True)

In [14]:
# Preview the first three sampled dialogues.
df_to_label.head(3)

Unnamed: 0,movieID,genders,lines,char_names,movie_title,genres,text_length
0,m505,"[Man, Woman, Man, Woman, Man]","[And it's sexier in the dark., Dewey you and I...","[DEWEY, GALE, DEWEY, GALE, DEWEY]",scream 2,"[horror, mystery, thriller]",176
1,m426,"[Man, Woman, Man, Woman, Man]","[Plenty of time., What are these?, I don't kno...","[PETER, MAYA, PETER, MAYA, PETER]",lost souls,"[drama, horror, thriller, romance]",178
2,m455,"[Man, Man, Man, Man, Man, Man, Man, Man, Man]","['You Will Anderson?, Who're you?, I'm Duff., ...","[DUFF, WILL, DUFF, WILL, DUFF, WILL, DUFF, WIL...",nothing but a man,"[drama, romance]",192


In [16]:
# Pretrained sentiment classifier used for labeling:
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Load tokenizer, config and classification model from the same checkpoint.
tokenizer, config, model = (
    loader.from_pretrained(MODEL)
    for loader in (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification)
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Use the first GPU when CUDA is available; otherwise stay on the CPU so this
# cell does not raise on a machine without a GPU.
model.to('cuda:0' if torch.cuda.is_available() else 'cpu')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [18]:
def convert_to_dataset_torch(data: str, max_length: int = 256):
    """Tokenize a single text into tensors ready to feed to the model.

    Uses the module-level ``tokenizer``. The text is truncated or padded so the
    encoding is exactly ``max_length`` tokens long.

    Args:
        data: The text to encode.
        max_length: Fixed token length of the encoding. Defaults to 256, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of ``torch.long`` tensors, as
        returned by the tokenizer with ``return_tensors='pt'``
        (presumably of shape ``(1, max_length)`` - confirm against the
        tokenizer in use).
    """
    # Calling the tokenizer directly is the current API for what encode_plus
    # did for a single text.
    encoded = tokenizer(data,
                        max_length=max_length,
                        padding='max_length',
                        return_attention_mask=True,
                        return_tensors='pt',
                        truncation=True)

    # Tensor.to() returns a new tensor instead of converting in place, so its
    # result has to be kept; the previous code called it and dropped the result.
    input_ids = encoded['input_ids'].to(dtype=torch.long)
    attention_masks = encoded['attention_mask'].to(dtype=torch.long)

    return input_ids, attention_masks

In [19]:
# Predict a class index for every utterance of every sampled dialogue.
# preds_all[d][u] is the predicted class of utterance u in dialogue d.
preds_all = []
# Send inputs to whichever device the model currently lives on instead of
# hard-coding 'cuda:0'.
device = model.device
# Make sure dropout is disabled for inference.
model.eval()
# Inference only: without no_grad() every forward pass would record an
# autograd graph, wasting time and memory.
with torch.no_grad():
    for dialogue in tqdm(df_to_label['lines']):
        preds_dial = []
        for utterance in dialogue:
            input_ids, attention_mask = convert_to_dataset_torch(utterance)
            logits = model(input_ids=input_ids.to(device),
                           attention_mask=attention_mask.to(device)).logits
            # Softmax is monotonic, so the argmax of the raw logits is the
            # same class the softmax probabilities would select.
            preds_dial.append(torch.argmax(logits, dim=1).item())
        preds_all.append(preds_dial)

100%|██████████| 300/300 [00:42<00:00,  7.01it/s]


In [20]:
# Add the predictions as a 'preds' column. The Series carries a default
# 0..n-1 index and pandas aligns it to the frame's index on assignment.
df_to_label = df_to_label.assign(preds=pd.Series(preds_all))

In [21]:
# Copy of the labeled sample without the 'text_length' column.
df_labeled = df_to_label.drop(labels='text_length', axis='columns')

In [22]:
# Preview the labeled sample; 'preds' still holds numeric class indices here.
df_labeled.head(3)

Unnamed: 0,movieID,genders,lines,char_names,movie_title,genres,preds
0,m505,"[Man, Woman, Man, Woman, Man]","[And it's sexier in the dark., Dewey you and I...","[DEWEY, GALE, DEWEY, GALE, DEWEY]",scream 2,"[horror, mystery, thriller]","[2, 0, 0, 0, 0]"
1,m426,"[Man, Woman, Man, Woman, Man]","[Plenty of time., What are these?, I don't kno...","[PETER, MAYA, PETER, MAYA, PETER]",lost souls,"[drama, horror, thriller, romance]","[1, 1, 1, 1, 1]"
2,m455,"[Man, Man, Man, Man, Man, Man, Man, Man, Man]","['You Will Anderson?, Who're you?, I'm Duff., ...","[DUFF, WILL, DUFF, WILL, DUFF, WILL, DUFF, WIL...",nothing but a man,"[drama, romance]","[1, 1, 1, 1, 1, 1, 1, 1, 2]"


In [23]:
# Class index -> label name, in index order.
# NOTE(review): presumably this mirrors config.id2label of the loaded model - confirm.
labels = ['negative','neutral','positive']
mapping = dict(enumerate(labels))
# mapping.get(p, p) leaves values that are not class indices (i.e. labels that
# were already converted) untouched, so re-running this cell is harmless; a
# plain mapping.get would turn them all into None on a second run.
df_to_label['preds'] = df_to_label['preds'].apply(
    lambda class_ids: [mapping.get(p, p) for p in class_ids]
)

In [24]:
# Display the labeled sample, now with label names in 'preds'.
df_to_label

Unnamed: 0,movieID,genders,lines,char_names,movie_title,genres,text_length,preds
0,m505,"[Man, Woman, Man, Woman, Man]","[And it's sexier in the dark., Dewey you and I...","[DEWEY, GALE, DEWEY, GALE, DEWEY]",scream 2,"[horror, mystery, thriller]",176,"[positive, negative, negative, negative, negat..."
1,m426,"[Man, Woman, Man, Woman, Man]","[Plenty of time., What are these?, I don't kno...","[PETER, MAYA, PETER, MAYA, PETER]",lost souls,"[drama, horror, thriller, romance]",178,"[neutral, neutral, neutral, neutral, neutral]"
2,m455,"[Man, Man, Man, Man, Man, Man, Man, Man, Man]","['You Will Anderson?, Who're you?, I'm Duff., ...","[DUFF, WILL, DUFF, WILL, DUFF, WILL, DUFF, WIL...",nothing but a man,"[drama, romance]",192,"[neutral, neutral, neutral, neutral, neutral, ..."
3,m265,"[Woman, Woman]",[We can't. We haven't left the house since th...,"[BARBARA, LYDIA]",beetle juice,"[comedy, fantasy]",154,"[negative, negative]"
4,m565,"[Woman, Man, Woman, Man, Woman]",[Get back inside the town and see if the Engli...,"[JEANNE, AULON, JEANNE, AULON, JEANNE]",the messenger,"[drama, romance, war]",167,"[neutral, negative, neutral, positive, neutral]"
...,...,...,...,...,...,...,...,...
295,m14,"[Man, Man]",[George uh... you're gonna break his little ch...,"[SYKES, JETSON]",alien nation,"[crime, drama, sci-fi, thriller]",167,"[negative, negative]"
296,m441,"[Woman, Man]","[Now don't fuss Paul., Why would I run away? I...","[ANNIE, PAUL]",misery,[thriller],157,"[neutral, positive]"
297,m2,"[Man, Man]",[So tabloids don't have to do re enactments. ...,"[EMIL, EDDIE]",15 minutes,"[action, crime, drama, thriller]",152,"[neutral, negative]"
298,m202,"[Man, Man, Man, Man, Man, Man]",[I'm telling you baby you always double down o...,"[TRENT, MIKE, TRENT, MIKE, TRENT, MIKE]",swingers,"[comedy, drama]",189,"[neutral, neutral, positive, neutral, positive..."


In [25]:
# Save the labeled sample to the working directory without the row index.
# List-valued columns end up as text in the CSV and have to be parsed again
# after reading the file back.
df_to_label.to_csv('films_data_labeled.csv', index=False)

# Gradio visualization

In [26]:
# Read the labeled file from the same relative path it was saved under, rather
# than a hard-coded '/content/' prefix that is only correct when the working
# directory happens to be /content.
df = pd.read_csv('films_data_labeled.csv')

In [27]:
# Display the reloaded table; the list columns are plain strings at this point.
df

Unnamed: 0,movieID,genders,lines,char_names,movie_title,genres,text_length,preds
0,m505,"['Man', 'Woman', 'Man', 'Woman', 'Man']","[""And it's sexier in the dark."", 'Dewey you an...","['DEWEY', 'GALE', 'DEWEY', 'GALE', 'DEWEY']",scream 2,"['horror', 'mystery', 'thriller']",176,"['positive', 'negative', 'negative', 'negative..."
1,m426,"['Man', 'Woman', 'Man', 'Woman', 'Man']","['Plenty of time.', 'What are these?', ""I don'...","['PETER', 'MAYA', 'PETER', 'MAYA', 'PETER']",lost souls,"['drama', 'horror', 'thriller', 'romance']",178,"['neutral', 'neutral', 'neutral', 'neutral', '..."
2,m455,"['Man', 'Man', 'Man', 'Man', 'Man', 'Man', 'Ma...","[""'You Will Anderson?"", ""Who're you?"", ""I'm Du...","['DUFF', 'WILL', 'DUFF', 'WILL', 'DUFF', 'WILL...",nothing but a man,"['drama', 'romance']",192,"['neutral', 'neutral', 'neutral', 'neutral', '..."
3,m265,"['Woman', 'Woman']","[""We can't. We haven't left the house since t...","['BARBARA', 'LYDIA']",beetle juice,"['comedy', 'fantasy']",154,"['negative', 'negative']"
4,m565,"['Woman', 'Man', 'Woman', 'Man', 'Woman']",['Get back inside the town and see if the Engl...,"['JEANNE', 'AULON', 'JEANNE', 'AULON', 'JEANNE']",the messenger,"['drama', 'romance', 'war']",167,"['neutral', 'negative', 'neutral', 'positive',..."
...,...,...,...,...,...,...,...,...
295,m14,"['Man', 'Man']","[""George uh... you're gonna break his little c...","['SYKES', 'JETSON']",alien nation,"['crime', 'drama', 'sci-fi', 'thriller']",167,"['negative', 'negative']"
296,m441,"['Woman', 'Man']","[""Now don't fuss Paul."", ""Why would I run away...","['ANNIE', 'PAUL']",misery,['thriller'],157,"['neutral', 'positive']"
297,m2,"['Man', 'Man']","[""So tabloids don't have to do re enactments. ...","['EMIL', 'EDDIE']",15 minutes,"['action', 'crime', 'drama', 'thriller']",152,"['neutral', 'negative']"
298,m202,"['Man', 'Man', 'Man', 'Man', 'Man', 'Man']","[""I'm telling you baby you always double down ...","['TRENT', 'MIKE', 'TRENT', 'MIKE', 'TRENT', 'M...",swingers,"['comedy', 'drama']",189,"['neutral', 'neutral', 'positive', 'neutral', ..."


In [28]:
# The list columns come back from the CSV as text. Parse them with
# ast.literal_eval, which accepts Python literals only and therefore, unlike
# eval(), cannot execute code embedded in the file.
for column in ['genders', 'lines', 'char_names', 'genres', 'preds']:
    df[column] = df[column].apply(ast.literal_eval)

In [29]:
# Flatten the per-dialogue genre lists and keep each distinct genre once.
all_genres = pd.unique(df['genres'].explode()).tolist()

In [45]:
def get_stat_for_genre(genre):
    """Plot, per speaker gender, the share of each predicted sentiment in a genre.

    Reads the module-level ``df`` without modifying it.

    Args:
        genre: Genre to analyse; a dialogue is included when its ``genres``
            list contains this value.

    Returns:
        A matplotlib figure with two side-by-side axes; one pie chart is drawn
        for each gender found in the selected dialogues (at most two).
    """
    # Select rows with a local boolean mask. The previous version stored the
    # mask as a 'stat_genre' column on the shared global df on every call.
    in_genre = df['genres'].apply(lambda genres: genre in genres)
    # One row per utterance, so every (gender, prediction) pair is counted.
    per_utterance = df.loc[in_genre, ['genders', 'preds']].explode(['genders', 'preds'])
    # Relative frequency of each prediction within each gender.
    shares = per_utterance.groupby('genders')['preds'].value_counts(normalize=True)

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    for ax, gender in zip(axes, shares.index.get_level_values('genders').unique()):
        gender_shares = shares[gender]
        ax.pie(gender_shares, labels=gender_shares.index.tolist(), autopct='%1.1f%%', textprops={'color':"b"})
        ax.set_xlabel(gender, color='white')

    # Unregister the figure from pyplot so repeated calls from the UI do not
    # pile up open figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig

In [46]:
def statistics():
    """Build the Gradio interface: a genre dropdown whose selection is passed
    to ``get_stat_for_genre`` and whose result is shown as a plot."""
    return gr.Interface(
        fn=get_stat_for_genre,
        inputs=gr.Dropdown(all_genres, label='genre'),
        outputs=gr.Plot(label='Statistics'),
        theme=gr.themes.Soft(),
    )

In [47]:
# Build the interface and start serving it.
statistics().launch()

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

