In [1]:
import collections
import json
import re
import string

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import (preprocess_string, remove_stopwords,
                                          strip_punctuation, strip_tags)
from nltk.corpus import stopwords
from nltk.util import ngrams  # function for making ngrams
from numpy import asarray, save, savez_compressed
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
mpstDF= pd.read_csv("mpst.csv")
mpstDF


Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb
5,tt1315981,A Single Man,George Falconer (Colin Firth) approaches a car...,"romantic, queer, flashback",val,imdb
6,tt0249380,Baise-moi,Baise-moi tells the story of Nadine and Manu w...,"gothic, cruelty, violence, cult, revenge, sadist",train,wikipedia
7,tt0408790,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback",train,imdb
8,tt0021079,Little Caesar,Small-time Italian-American criminals Caesar E...,violence,train,imdb
9,tt1615065,Savages,The movie begins with a video being shot of me...,"revenge, neo noir, murder, violence, flashback",train,imdb


In [3]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can't", "can not", phrase)
    phrase = re.sub(r"couldn't", "could not", phrase)
    phrase = re.sub(r"wouldn't", "would not", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"don't", "do not", phrase)
    phrase = re.sub(r"doesn't", "does not", phrase)
    phrase = re.sub(r"haven't", "have not", phrase)
    phrase = re.sub(r"hasn't", "has not", phrase)
    phrase = re.sub(r"ain't", "not", phrase)
    phrase = re.sub(r"hadn't", "had not", phrase)
    phrase = re.sub(r"didn't", "did not", phrase)
    phrase = re.sub(r"wasn't", "was not", phrase)
    phrase = re.sub(r"aren't", "are not", phrase)
    phrase = re.sub(r"isn't", "is not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# stop_words = stopwords.words('english')


In [4]:
mpstDF_processsed=mpstDF.copy()
# Type 1: Decontracted Text, The puncutation and stop words are still there
mpstDF_processsed["processed_synopsis_t1"]=mpstDF_processsed["plot_synopsis"].apply(lambda x: decontracted(" ".join(preprocess_string(x, [lambda x: x.lower(), strip_tags]))))
# Type 2 Decontracted Text Stop Words Removed
mpstDF_processsed["processed_synopsis_t2"]=mpstDF_processsed["plot_synopsis"].apply(lambda x: decontracted(" ".join(preprocess_string(x, [lambda x: x.lower(), strip_tags,remove_stopwords]))))

In [5]:
mpstDF_processsed

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,processed_synopsis_t1,processed_synopsis_t2
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb,note: this synopsis is for the orginal italian...,note: synopsis orginal italian release segment...
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb,"two thousand years ago, nhagruul the foul, a s...","thousand years ago, nhagruul foul, sorcerer re..."
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb,"matuschek is, a gift store in budapest, is the...","matuschek is, gift store budapest, workplace a..."
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb,"glenn holland, not a morning person by anyone ...","glenn holland, morning person anyone is standa..."
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb,"in may 1980, a cuban man named tony montana (a...","1980, cuban man named tony montana (al pacino)..."
5,tt1315981,A Single Man,George Falconer (Colin Firth) approaches a car...,"romantic, queer, flashback",val,imdb,george falconer (colin firth) approaches a car...,george falconer (colin firth) approaches car a...
6,tt0249380,Baise-moi,Baise-moi tells the story of Nadine and Manu w...,"gothic, cruelty, violence, cult, revenge, sadist",train,wikipedia,baise-moi tells the story of nadine and manu w...,baise-moi tells story nadine manu violent spre...
7,tt0408790,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback",train,imdb,kyle pratt (jodie foster) is a propulsion engi...,kyle pratt (jodie foster) propulsion engineer ...
8,tt0021079,Little Caesar,Small-time Italian-American criminals Caesar E...,violence,train,imdb,small-time italian-american criminals caesar e...,small-time italian-american criminals caesar e...
9,tt1615065,Savages,The movie begins with a video being shot of me...,"revenge, neo noir, murder, violence, flashback",train,imdb,the movie begins with a video being shot of me...,movie begins video shot men hands tied backs. ...


In [6]:
mpstDF_processsed["plot_synopsis"][1]

'Two thousand years ago, Nhagruul the Foul, a sorcerer who reveled in corrupting the innocent and the spread of despair, neared the end of his mortal days and was dismayed. Consumed by hatred for the living, Nhagruul sold his soul to the demon Lords of the abyss so that his malign spirit would survive. In an excruciating ritual, Nhagrulls skin was flayed into pages, his bones hammered into a cover, and his diseased blood became the ink to pen a book most vile. Creatures vile and depraved rose from every pit and unclean barrow to partake in the fever of destruction. The kingdoms of Karkoth were consumed by this plague of evil until an order of holy warriors arose from the ashes. The Knights of the New Sun swore an oath to resurrect hope in the land. The purity of their hearts was so great that Pelor, the God of Light, gave the Knights powerful amulets with which to channel his power. Transcendent with divine might, the Knights of the New Sun pierced the shadow that had darkened the land

In [8]:
!pip install sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4b/0add07b1eebbbe83e77fb5ac4e72e87046c3fc2c9cb16f7d1cd8c6921a1d/sentence-transformers-0.3.7.2.tar.gz (59kB)
[K    100% |████████████████████████████████| 61kB 2.1MB/s ta 0:00:011
Collecting torch>=1.2.0 (from sentence-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/38/53/914885a93a44b96c0dd1c36f36ff10afe341f091230aad68f7228d61db1e/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl (748.8MB)
[K    100% |████████████████████████████████| 748.8MB 62kB/s  eta 0:00:01 0% |▎                               | 5.7MB 30.6MB/s eta 0:00:25                | 212.1MB 53.8MB/s eta 0:00:10    33% |██████████▋                     | 247.3MB 27.2MB/s eta 0:00:19    74% |███████████████████████▊        | 555.2MB 33.2MB/s eta 0:00:06    74% |███████████████████████▉        | 558.7MB 33.9MB/s eta 0:00:06��████████▏     | 611.3MB 32.4MB/s eta 0:00:05
Building wheels for collected packages: sent

In [9]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

  from ._conv import register_converters as _register_converters
I1002 13:49:42.285541 140624986228544 SentenceTransformer.py:38] Load pretrained SentenceTransformer: distilbert-base-nli-mean-tokens
I1002 13:49:42.286511 140624986228544 SentenceTransformer.py:42] Did not find folder distilbert-base-nli-mean-tokens. Assume to download model from server.
I1002 13:49:42.288116 140624986228544 SentenceTransformer.py:63] Downloading sentence transformer model from https://sbert.net/models/distilbert-base-nli-mean-tokens.zip and saving it at /root/.cache/torch/sentence_transformers/sbert.net_models_distilbert-base-nli-mean-tokens
100%|██████████| 245M/245M [00:31<00:00, 7.84MB/s]   
I1002 13:50:20.532265 140624986228544 SentenceTransformer.py:92] Load SentenceTransformer from folder: /root/.cache/torch/sentence_transformers/sbert.net_models_distilbert-base-nli-mean-tokens
I1002 13:50:22.339137 140624986228544 SentenceTransformer.py:116] Use pytorch device: cpu


getting transformer for one sentence first

In [10]:
sentence_embeddings = model.encode([mpstDF_processsed["plot_synopsis"][1]])

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [11]:
sentence_embeddings

array([[ 2.00769797e-01, -9.11870301e-01,  7.45868862e-01,
        -8.69752526e-01, -5.72261155e-01, -2.49493554e-01,
        -4.35942784e-02, -1.73006475e-01,  1.02805507e+00,
         3.39111567e-01,  1.04070878e+00,  6.70556843e-01,
        -4.23018456e-01,  1.20352006e+00,  2.48973727e-01,
        -1.04121752e-01,  9.62612748e-01, -1.62691414e-01,
        -1.96168661e-01,  3.53351608e-02, -7.67696798e-01,
         6.06840432e-01,  1.87216759e-01,  1.72411036e+00,
        -8.94729614e-01,  1.45611256e-01,  4.15587157e-01,
        -5.73871061e-02,  5.40117919e-01,  4.60784733e-01,
         9.81056869e-01,  8.15373659e-02, -3.35227728e-01,
        -3.86883527e-01,  5.83323359e-01, -2.33469397e-01,
         5.19055068e-01, -7.40753651e-01,  7.54358292e-01,
        -2.56629676e-01, -7.53906786e-01, -1.19499728e-01,
         5.51820219e-01,  1.14514792e+00, -1.61765620e-01,
        -5.59474528e-01, -8.42856020e-02,  2.21697435e-01,
        -6.45271599e-01, -9.05689150e-02, -5.68897314e-0

In [16]:
embeddings_type1=model.encode(list(mpstDF_processsed["processed_synopsis_t1"]))

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=464.0), HTML(value='')))




In [17]:
np.savez("embeddings_type1.npz",embeddings_type1)

In [18]:
embeddings_type2=model.encode(list(mpstDF_processsed["processed_synopsis_t2"]))

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=464.0), HTML(value='')))




In [19]:
np.savez("embeddings_type2.npz",embeddings_type2)

In [23]:
embeddings_type1

array([[-0.23652047, -0.46730292,  0.62800884, ..., -0.3478354 ,
         0.17314139, -0.6017166 ],
       [ 0.2007698 , -0.9118703 ,  0.74586886, ..., -0.7119921 ,
         0.29034543, -0.35151947],
       [-0.54077786,  0.09937867,  0.61585516, ..., -0.38611546,
         0.10635425, -0.26011148],
       ...,
       [-0.23731072, -0.38172564,  0.07030658, ...,  0.16959587,
         0.52745634, -1.0168482 ],
       [-0.3484214 , -0.721707  ,  0.16562451, ..., -0.51939166,
         0.4619452 , -0.33824992],
       [-0.44174305, -0.1110604 , -0.01783825, ..., -0.34670058,
         0.34106705, -0.15129046]], dtype=float32)

In [22]:
embeddings_type2

array([[-0.40418392, -0.47878775,  0.7580885 , ..., -0.47598928,
         0.10172412, -0.33533862],
       [ 0.2230832 , -0.48024565,  0.75589806, ..., -0.75046986,
         0.44050843, -0.9165367 ],
       [-0.70129013,  0.2842256 ,  0.7363471 , ..., -0.50889647,
         0.14786258, -0.09948996],
       ...,
       [-0.26456812, -0.44883907,  0.15373003, ..., -0.0242345 ,
         0.7692909 , -1.2379875 ],
       [-0.2578964 , -0.7907244 ,  0.48381078, ..., -0.5863774 ,
         0.67240703, -0.13996066],
       [-0.4584682 , -0.02483118, -0.0885824 , ..., -0.4353699 ,
         0.48302737, -0.12356079]], dtype=float32)

So Finally the embeddings have been generated and saved to corresponding files. Caution: Takes almost 1 hour to execute and generate both sets of embedding.

Checking if the files are correctly Saved

In [29]:
t1_em_check= np.load("embeddings_type1.npz")
t2_em_check= np.load("embeddings_type2.npz")
print(np.array(t1_em_check))
print(np.array(t2_em_check))

<numpy.lib.npyio.NpzFile object at 0x7fe4c9d1bd30>
<numpy.lib.npyio.NpzFile object at 0x7fe4c9ee5518>


Saving both embeddings in single file

In [30]:
np.savez("embeddings.npz",t1=embeddings_type1,t2=embeddings_type2)

In [31]:
np.load("embeddings.npz")

<numpy.lib.npyio.NpzFile at 0x7fe4c9e38b70>

In [35]:
em_check=np.load("embeddings.npz")
print("t1")
print(em_check["t1"])
print("t2")
print(em_check["t2"])

t1
[[-0.23652047 -0.46730292  0.62800884 ... -0.3478354   0.17314139
  -0.6017166 ]
 [ 0.2007698  -0.9118703   0.74586886 ... -0.7119921   0.29034543
  -0.35151947]
 [-0.54077786  0.09937867  0.61585516 ... -0.38611546  0.10635425
  -0.26011148]
 ...
 [-0.23731072 -0.38172564  0.07030658 ...  0.16959587  0.52745634
  -1.0168482 ]
 [-0.3484214  -0.721707    0.16562451 ... -0.51939166  0.4619452
  -0.33824992]
 [-0.44174305 -0.1110604  -0.01783825 ... -0.34670058  0.34106705
  -0.15129046]]
t2
[[-0.40418392 -0.47878775  0.7580885  ... -0.47598928  0.10172412
  -0.33533862]
 [ 0.2230832  -0.48024565  0.75589806 ... -0.75046986  0.44050843
  -0.9165367 ]
 [-0.70129013  0.2842256   0.7363471  ... -0.50889647  0.14786258
  -0.09948996]
 ...
 [-0.26456812 -0.44883907  0.15373003 ... -0.0242345   0.7692909
  -1.2379875 ]
 [-0.2578964  -0.7907244   0.48381078 ... -0.5863774   0.67240703
  -0.13996066]
 [-0.4584682  -0.02483118 -0.0885824  ... -0.4353699   0.48302737
  -0.12356079]]


Hence confirmed that the data is correctly saved.