In [1]:
import collections
import json
import re
import string

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import (preprocess_string, remove_stopwords,
                                          strip_punctuation, strip_tags)
from nltk.corpus import stopwords
from nltk.util import ngrams  # function for making ngrams
from numpy import asarray, save, savez_compressed
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# torch is used by the embedding helpers further down; the bare expression
# below makes the notebook display whether a CUDA device is visible.
import torch
torch.cuda.is_available()

In [3]:
# Read mpst.csv (path relative to the working directory) into a DataFrame
# and display it as the cell output.
mpstDF= pd.read_csv("mpst.csv")
mpstDF


Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb
5,tt1315981,A Single Man,George Falconer (Colin Firth) approaches a car...,"romantic, queer, flashback",val,imdb
6,tt0249380,Baise-moi,Baise-moi tells the story of Nadine and Manu w...,"gothic, cruelty, violence, cult, revenge, sadist",train,wikipedia
7,tt0408790,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback",train,imdb
8,tt0021079,Little Caesar,Small-time Italian-American criminals Caesar E...,violence,train,imdb
9,tt1615065,Savages,The movie begins with a video being shot of me...,"revenge, neo noir, murder, violence, flashback",train,imdb


In [4]:
def decontracted(phrase):
    """Expand English contractions in ``phrase`` and return the new string.

    The substitutions run one after another in a fixed order: irregular forms
    first (e.g. "won't" -> "will not"), then generic apostrophe suffixes
    (e.g. "'re" -> " are"). Matching is case-sensitive and only recognises the
    ASCII apostrophe. Every "'s" is rewritten to " is", so possessives are
    expanded as well.
    """
    # Order matters: the irregular forms must be consumed before the generic
    # suffix rules get a chance to split them (e.g. "won't" vs. "n't").
    substitutions = (
        # irregular forms
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"couldn't", "could not"),
        (r"wouldn't", "would not"),
        (r"shouldn't", "should not"),
        (r"don't", "do not"),
        (r"doesn't", "does not"),
        (r"haven't", "have not"),
        (r"hasn't", "has not"),
        (r"ain't", "not"),
        (r"hadn't", "had not"),
        (r"didn't", "did not"),
        (r"wasn't", "was not"),
        (r"aren't", "are not"),
        (r"isn't", "is not"),
        # generic suffixes
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in substitutions:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

# stop_words = stopwords.words('english')


In [5]:
# Work on a copy so the frame loaded from disk stays untouched.
mpstDF_processsed = mpstDF.copy()

# Each derived column is: gensim filters -> whitespace re-join -> decontracted.
#   Type 1: decontracted text; punctuation and stop words are still present.
#   Type 2: decontracted text with stop words removed by gensim.
# NOTE(review): decontracted() runs AFTER remove_stopwords, so expansions such
# as "'s" -> " is" put stop words back into the Type 2 text - confirm intended.
_lowercase = str.lower
_synopsis_filters = {
    "processed_synopsis_t1": [_lowercase, strip_tags],
    "processed_synopsis_t2": [_lowercase, strip_tags, remove_stopwords],
}
for _column, _filters in _synopsis_filters.items():
    mpstDF_processsed[_column] = mpstDF_processsed["plot_synopsis"].apply(
        lambda synopsis, _filters=_filters: decontracted(
            " ".join(preprocess_string(synopsis, _filters))
        )
    )

In [6]:
# Display the frame, which now carries the two processed_synopsis_* columns.
mpstDF_processsed

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,processed_synopsis_t1,processed_synopsis_t2
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb,note: this synopsis is for the orginal italian...,note: synopsis orginal italian release segment...
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb,"two thousand years ago, nhagruul the foul, a s...","thousand years ago, nhagruul foul, sorcerer re..."
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb,"matuschek is, a gift store in budapest, is the...","matuschek is, gift store budapest, workplace a..."
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb,"glenn holland, not a morning person by anyone ...","glenn holland, morning person anyone is standa..."
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb,"in may 1980, a cuban man named tony montana (a...","1980, cuban man named tony montana (al pacino)..."
5,tt1315981,A Single Man,George Falconer (Colin Firth) approaches a car...,"romantic, queer, flashback",val,imdb,george falconer (colin firth) approaches a car...,george falconer (colin firth) approaches car a...
6,tt0249380,Baise-moi,Baise-moi tells the story of Nadine and Manu w...,"gothic, cruelty, violence, cult, revenge, sadist",train,wikipedia,baise-moi tells the story of nadine and manu w...,baise-moi tells story nadine manu violent spre...
7,tt0408790,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback",train,imdb,kyle pratt (jodie foster) is a propulsion engi...,kyle pratt (jodie foster) propulsion engineer ...
8,tt0021079,Little Caesar,Small-time Italian-American criminals Caesar E...,violence,train,imdb,small-time italian-american criminals caesar e...,small-time italian-american criminals caesar e...
9,tt1615065,Savages,The movie begins with a video being shot of me...,"revenge, neo noir, murder, violence, flashback",train,imdb,the movie begins with a video being shot of me...,movie begins video shot men hands tied backs. ...


In [7]:
# from transformers import XLNetConfig, XLNetModel, XLNetTokenizer
from transformers import AutoModel,AutoConfig,AutoTokenizer


  from ._conv import register_converters as _register_converters


In [8]:
# Load the pretrained XLNet tokenizer and encoder. Both objects are bound as
# default arguments of get_Encodings, so this cell must run before that
# function is defined.
from transformers import AutoModel,AutoConfig,AutoTokenizer

_xlnet_checkpoint = 'xlnet-base-cased'
tokenizer = AutoTokenizer.from_pretrained(_xlnet_checkpoint)
model = AutoModel.from_pretrained(_xlnet_checkpoint)



In [9]:
def get_Encodings(text,tokenizer=tokenizer,model=model, verbose=False, max_length=250):
    """Embed ``text`` as the hidden state at the last sequence position.

    The text is tokenized with ``tokenizer.encode``. If the result is longer
    than ``max_length`` ids, it is cut to the first ``max_length - 2`` ids
    followed by the tokenizer's separator and classifier ids, so the sequence
    keeps the ``... <sep> <cls>`` ending this function reads its embedding from.

    Args:
        text: String to embed.
        tokenizer: Object exposing ``encode``, ``sep_token_id`` and
            ``cls_token_id``. Defaults to the notebook-level tokenizer.
        model: ``torch.nn.Module`` called as
            ``model(input_ids, attention_mask=...)`` whose first output is
            indexed as ``[batch, position, feature]``. Defaults to the
            notebook-level model.
        verbose: When True, print the intermediate ids, tensors and outputs.
        max_length: Maximum number of token ids (special tokens included)
            passed to the model.

    Returns:
        A 1-D CPU tensor: ``outputs[0][0, -1]``.

    Raises:
        ValueError: If ``max_length`` is smaller than 2, which would leave no
            room for the two trailing special tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2")

    if verbose:
        print("Text:")
        print(text[:20] + "....")

    encoded = tokenizer.encode(text)

    # Truncate only when the budget is really exceeded. A sequence that already
    # fits ends in <sep> <cls>; cutting it and re-appending the special tokens
    # would duplicate the separator.
    if len(encoded) > max_length:
        encoded = encoded[:max_length - 2] + [tokenizer.sep_token_id, tokenizer.cls_token_id]
    if verbose:
        print("Encoded",encoded)
        print("length of Encoded", len(encoded))

    # Build the inputs on the model's device so the call also works when the
    # model has been moved to a GPU.
    device = next(model.parameters()).device
    text_encoding_tensor = torch.tensor([encoded], device=device)
    if verbose:
        print("text_encoding_tensor:")
        print(text_encoding_tensor)
        print("shape:")
        print(text_encoding_tensor.shape)

    # Single unpadded sequence: every position is attended to.
    attention_mask_tensor = torch.ones_like(text_encoding_tensor)
    if verbose:
        print("attention_mask_tensor:")
        print(attention_mask_tensor)
        print("shape:")
        print(attention_mask_tensor.shape)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(text_encoding_tensor, attention_mask=attention_mask_tensor)

    if verbose:
        print("outputs:")
        print(outputs)
        print("Lenght of outputs",len(outputs))
        print("outputs[0]:")
        print(outputs[0])
        print("outputs[0].shape:")
        print(outputs[0].shape)
        # A second output is not always returned; only inspect it when present.
        if len(outputs) > 1:
            print("outputs[1]:")
            print(outputs[1])
            print("Length Ooutputstput[1]:")
            print(len(outputs[1]))
            print("Sample from Output[1], first hidden layer:")
            print(outputs[1][0])
            print("Sample shape, first hidden layer")
            print(getattr(outputs[1][0], "shape", None))

    # Explicit [batch 0, last position] indexing; unlike squeeze() it cannot
    # collapse the wrong axis when a dimension happens to have size 1.
    last_position = outputs[0][0, -1].cpu()
    if verbose:
        print("getting the last tensor for XLNet")
        print(last_position)
    return last_position

In [10]:
def getXLNetEmbeddings(df,column,verbose=False):
    """Embed every entry of ``df[column]`` and stack the results.

    Args:
        df: Mapping-like object (e.g. a DataFrame) such that ``df[column]``
            is an iterable of strings.
        column: Key of the text column to embed.
        verbose: Forwarded to ``get_Encodings``.

    Returns:
        A NumPy array with one row per text, in iteration order. An empty
        column yields an empty 1-D float32 array.
    """
    # Convert each embedding tensor straight to NumPy and stack once, instead
    # of round-tripping through a Python list handed to torch.tensor.
    vectors = [
        get_Encodings(text, verbose=verbose).detach().cpu().numpy()
        for text in df[column]
    ]
    if not vectors:
        # np.stack rejects an empty list; return an empty array instead.
        return np.empty((0,), dtype=np.float32)
    return np.stack(vectors)

In [11]:
# This test-mode function was created first and later merged into the getXLNetEmbeddings function; kept here for reference only.
# def getXLNetEmbeddings_testMode(df,column,verbose=False):
#     embeddings = np.array(torch.tensor([np.array(get_Encodings(x,verbose=verbose)) for x in df[column]]))
#     return embeddings

**Testing the encoder on a single input** (pass `verbose=True` to `get_Encodings` to print the intermediate tensors)

In [12]:
# Smoke test: embed one hard-coded synopsis excerpt with the default
# tokenizer/model and print the resulting vector.
sample_text="In May 1980, a Cuban man named Tony Montana (Al Pacino) claims asylum, in Florida, USA, and is in search of the \"American Dream\" after departing Cuba in the Mariel boatlift of 1980. When questioned by three tough-talking INS officials, they notice a tattoo on Tony's left arm of a black heart with a pitchfork through it, which identifies him as a hitman, and detain him in a camp called 'Freedomtown' with other Cubans, including Tony's best friend and former Cuban Army buddy Manolo \"Manny Ray\" Ribiera (Steven Bauer), under the local I-95 expressway while the government evaluates their visa petitions.After 30 days of governmental dithering and camp rumors, Manny receives an offer from the Cuban Mafia which he quickly relays to Tony. If they kill Emilio Rebenga (Roberto Contreras) a former aide to Fidel Castro who is now detained in Freedomtown, they will receive green cards. Tony agrees, and kills Rebenga during a riot at Freedomtown."
test_output=get_Encodings(sample_text)
print(test_output)

tensor([-2.1453e-01,  9.8637e-01, -1.3569e+00,  4.9745e-02, -2.0580e-01,
         2.0028e-01, -1.1084e+00, -2.4160e-01,  5.4918e-01,  6.1399e-01,
         2.2983e-01, -2.1084e+00,  4.5196e-01, -4.5227e-01,  1.9634e+00,
        -7.0501e-01,  7.6946e-01,  1.3087e+00,  4.6494e-01, -1.6070e+00,
         8.9788e-01, -1.1989e+00, -4.9773e-02, -7.3024e-01, -4.6916e-01,
         6.4692e-01, -1.2097e+00,  9.8949e-01, -1.8701e-01, -1.4202e+00,
         3.2042e-01, -2.4061e-02,  1.2414e+00,  7.3338e-02, -8.1281e-01,
        -9.1461e-01, -1.1381e+00,  3.1420e-01,  8.6016e-01,  1.2998e-02,
         5.2342e-01,  5.1991e-01, -1.0325e+00, -3.0562e-01, -2.7785e-01,
        -5.8735e-01,  1.1165e+00,  1.5297e+00, -6.8228e-01,  3.2196e-01,
        -3.0288e-01, -6.1153e-01, -1.8400e+00, -6.7133e-01,  6.4503e-01,
         7.3167e-01,  2.7735e-01, -1.7589e+00,  4.9479e-01,  1.1035e+00,
         2.3829e-03,  1.0375e+00, -2.1944e-01, -1.5443e+00,  4.3127e-01,
         1.3444e+00, -1.4885e+00,  4.8591e-01,  6.0

Next, test the helper function on input taken from a DataFrame to check that it produces the output we need.

In [13]:
# Build a small test frame: the first three rows of both processed columns,
# each text cut to its first 1000 characters. The frame is produced in one
# expression (no column assignment on a slice of mpstDF_processsed), which
# avoids pandas' SettingWithCopyWarning and leaves the source frame untouched.
sampleDF = (
    mpstDF_processsed[["processed_synopsis_t1", "processed_synopsis_t2"]]
    .head(3)
    .apply(lambda column: column.str[:1000])
)
display(sampleDF)

Unnamed: 0,processed_synopsis_t1,processed_synopsis_t2
0,note: this synopsis is for the orginal italian...,note: synopsis orginal italian release segment...
1,"two thousand years ago, nhagruul the foul, a s...","thousand years ago, nhagruul foul, sorcerer re..."
2,"matuschek is, a gift store in budapest, is the...","matuschek is, gift store budapest, workplace a..."


In [14]:
# Embed the sample texts and check the shape invariant explicitly:
# one row per input text, one column per hidden unit of the model.
sample_embeddings=getXLNetEmbeddings(sampleDF,"processed_synopsis_t1")
assert sample_embeddings.shape == (len(sampleDF), model.config.hidden_size), sample_embeddings.shape
print("sample_embeddings.shape",sample_embeddings.shape)
print(sample_embeddings)


sample_embeddings.shape (3, 768)
[[-1.5467551   0.41237417 -1.3485146  ... -1.2315856   0.12427528
  -0.17078573]
 [-0.71728706  0.6656481   0.51812106 ... -0.5695448   2.5583012
  -1.0299454 ]
 [-1.7686012  -0.73729557 -1.3424098  ... -0.13613474 -0.68697166
   0.581978  ]]


Check: the output shape must be (3, 768), where 3 is the number of texts sent to the model and 768 is the embedding length of the chosen model.

Now we run the model on every text in our dataset.

In [15]:
# Record the wall-clock start; the final cell of the notebook reports the
# elapsed time relative to this value.
import time
start_time = time.time()

In [None]:
# Type 1 embeddings: embed every row of the "processed_synopsis_t1" column
# (decontracted text with stop words and punctuation kept).
xlnet_embeddings_t1=getXLNetEmbeddings(mpstDF_processsed,"processed_synopsis_t1")

In [None]:
# Report the shape and contents of the Type 1 embedding array.
print("Shape: ",xlnet_embeddings_t1.shape)
print("XL Embedding for Type 1")
print(xlnet_embeddings_t1)

In [None]:
# Write the Type 1 embeddings to their own .npz file in the working directory.
np.savez("xl_embeddings_type1.npz",xlnet_embeddings_t1)

In [None]:
# Type 2 embeddings: embed every row of the "processed_synopsis_t2" column
# (the variant that went through gensim's remove_stopwords).
xlnet_embeddings_t2=getXLNetEmbeddings(mpstDF_processsed,"processed_synopsis_t2")


In [None]:
# Report the shape and contents of the Type 2 embedding array.
print("Shape: ",xlnet_embeddings_t2.shape)
print("Embedding")
print(xlnet_embeddings_t2)

In [None]:
# Write the Type 2 embeddings to their own file. The target must differ from
# "xl_embeddings_type1.npz", otherwise the Type 1 archive saved earlier is
# silently overwritten with Type 2 data.
np.savez("xl_embeddings_type2.npz",xlnet_embeddings_t2)

The embeddings above were saved to separate files.
Now we also save both sets together in a single file.

In [None]:
# Store both embedding arrays in one archive under the keys "t1" and "t2".
np.savez("xl_embeddings.npz",t1=xlnet_embeddings_t1,t2=xlnet_embeddings_t2)

In [None]:
# Reload the combined archive and print both arrays with their shapes to
# confirm the round trip. The context manager closes the underlying file
# handle once the check is done instead of leaving it open in the kernel.
with np.load("xl_embeddings.npz") as em_check:
    for key in ("t1", "t2"):
        stored = em_check[key]  # each lookup reads the array from disk; do it once
        print(key)
        print(stored)
        print(stored.shape)


In [None]:
# Wall-clock seconds elapsed since start_time was recorded above.
print("--- %s seconds ---" % (time.time() - start_time))