In [1]:
import numpy as np  
import pandas as pd 
import re           
from keras.preprocessing.text import Tokenizer 
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings

In [2]:
# Read the synopsis dataset, discard the genre column, and rename the
# misspelled "sypnopsis" header so later cells can refer to `Synopsis`.
animeDf = (
    pd.read_csv('./anime_with_synopsis.csv')
    .drop(columns="Genres")
    .rename(columns={"sypnopsis": "Synopsis"})
)
animeDf.head()

Unnamed: 0,MAL_ID,Name,Score,Synopsis
0,1,Cowboy Bebop,8.78,"In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,It is the dark century and the people are suff...


In [13]:
# Titles consulted by the sentence splitter: a word found in this set is not
# treated as the end of a sentence even though it contains a period.
honorifics = {
    'Mr.', 'Mrs.', 'Ms.', 'Dr.', 'Prof.', 'Rev.', 'Esq.', 'Jr.', 'Rt.',
    'Mx', 'Adv',
    'Capt.', 'Col.', 'Cmdr.', 'Brgdr.',
    'Lt.-Col.', 'Lt.-Cmdr.', 'Maj.-Gen.',
    'The Hon.', 'Flt. Lt.', 'Wng. Cmdr.', 'Group Capt.', 'Rear Admrl.',
}
# Lower-case English stopwords, one per line. The text is split on ANY
# whitespace (not just '\n') so a stray tab or space between two words can
# never fuse them into a single entry that matches nothing -- previously
# "ours" and "ourselves" shared a line and were therefore never filtered.
stopwords = '''a
about
above
after
again
against
all
am
an
and
any
are
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can't
cannot
could
couldn't
did
didn't
do
does
doesn't
doing
don't
down
during
each
few
for
from
further
had
hadn't
has
hasn't
have
haven't
having
he
he'd
he'll
he's
her
here
here's
hers
herself
him
himself
his
how
how's
i
i'd
i'll
i'm
i've
if
in
into
is
isn't
it
it's
its
itself
let's
me
more
most
mustn't
my
myself
no
nor
not
of
off
on
once
only
or
other
ought
our
ours
ourselves
out
over
own
same
shan't
she
she'd
she'll
she's
should
shouldn't
so
some
such
than
that
that's
the
their
theirs
them
themselves
then
there
there's
these
they
they'd
they'll
they're
they've
this
those
through
to
too
under
until
up
very
was
wasn't
we
we'd
we'll
we're
we've
were
weren't
what
what's
when
when's
where
where's
which
while
who
who's
whom
why
why's
with
won't
would
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves'''.split()

In [14]:
import re
import string
def removeStopWords(sent, stopWords=None):
    """Return `sent` with stopwords removed, remaining words joined by single spaces.

    Parameters
    ----------
    sent : str
        Text to filter. It is tokenised by splitting on runs of non-word
        characters, so punctuation is discarded along the way.
    stopWords : iterable of str, optional
        Lower-case words to drop. Defaults to the module-level `stopwords`.

    Returns
    -------
    str
        The surviving words in their original order and casing. An empty
        string is returned when `sent` is not a string (e.g. None or NaN);
        the previous bare ``except`` tried to print a variable that was never
        assigned in that situation and crashed with UnboundLocalError.

    Notes
    -----
    Because tokens are produced by splitting on non-word characters, a
    contraction such as "don't" becomes the tokens "don" and "t"; stop-list
    entries that contain an apostrophe can therefore never match a token.
    """
    if not isinstance(sent, str):
        return ''

    if stopWords is None:
        stopWords = stopwords
    # Hash lookup instead of scanning the whole stop list for every word.
    if not isinstance(stopWords, (set, frozenset)):
        stopWords = frozenset(stopWords)

    # re.split leaves empty strings at the edges when the text starts or ends
    # with punctuation; `if word` filters those out.
    kept = [
        word
        for word in re.split(r"\W+", sent)
        if word and word.lower() not in stopWords
    ]
    return ' '.join(kept)

def splitAndStrip(sent, removePunctuation=True, abbreviations=None):
    """Split a synopsis into sentences using a simple period-based heuristic.

    The text is split on single spaces; any word that contains a period and is
    not a known abbreviation ends the current sentence. Words ending in "!" or
    "?" do not end a sentence, and any other dotted token (e.g. a decimal
    number) does.

    Parameters
    ----------
    sent : str or null
        The text to split.
    removePunctuation : bool, default True
        When true, apostrophes are deleted and commas are replaced by a space
        in the returned sentences. This never affects where sentences are cut,
        so sentence indices are the same for both settings.
    abbreviations : iterable of str, optional
        Dotted words that must not end a sentence. Defaults to the
        module-level `honorifics`.

    Returns
    -------
    list of str, or ""
        The sentences, each stripped of surrounding whitespace. For a null
        input (None/NaN) the empty string is returned, as before, which is
        still an empty iterable for callers that loop over the result.
    """
    if pd.isnull(sent):
        return ""

    if abbreviations is None:
        abbreviations = honorifics
    # Multi-word titles such as 'Flt. Lt.' can never equal a single
    # space-separated token, so their dotted parts are matched individually.
    abbrevTokens = set(abbreviations)
    for title in abbreviations:
        abbrevTokens.update(part for part in title.split() if '.' in part)

    pieces = []
    for rawWord in sent.split(' '):
        word = rawWord
        if removePunctuation:
            word = word.replace("'", '').replace(',', ' ')
        # Decide the boundary on the raw token with wrapping quotes, commas
        # and brackets removed, so "'Mr." or "Mr.," is still recognised and
        # the decision does not depend on `removePunctuation`.
        core = rawWord.strip('\'",;:()[]')
        if '.' in core and core not in abbrevTokens:
            word += '\n'
        pieces.append(word)

    text = ' '.join(pieces).rstrip()
    # Joining with ' ' leaves a space after every inserted newline; strip it
    # so sentences after the first do not start with whitespace.
    return [sentence.strip() for sentence in text.split('\n')]

def processData(df):
    """Apply `splitAndStrip` (default arguments) to every value of `df.Synopsis`.

    Returns the Series produced by mapping over that column, so each element
    is whatever `splitAndStrip` returns for the corresponding synopsis.
    """
    def toSentences(synopsis):
        return splitAndStrip(synopsis)

    return df.Synopsis.map(toSentences)
def countWords(para):
    """Tally case-insensitive word frequencies over a collection of synopses.

    `para` is walked three levels deep: each item is an iterable of sentence
    strings, and each sentence is split on single spaces. Empty fragments
    (from consecutive spaces) are ignored and every word is lower-cased.

    Returns a dict mapping each lower-cased word to its number of occurrences,
    in first-seen order.
    """
    counts = {}
    for sentences in para:
        for sentence in sentences:
            for token in filter(None, sentence.split(' ')):
                word = token.lower()
                counts[word] = counts.get(word, 0) + 1
    return counts

In [15]:
# Sentence-split every synopsis, drop the stopwords from each sentence, then
# count how often each remaining word occurs across all synopses.
x = processData(animeDf).map(
    lambda sentences: [removeStopWords(sentence) for sentence in sentences]
)
t = pd.Series(countWords(x))
t.head()

year         1286
2071            3
humanity      320
colonized      16
several       248
dtype: int64

In [16]:
# Discard words of three characters or fewer, then express every remaining
# count as a fraction of the largest count.
t = t.where([len(word) > 3 for word in t.index]).dropna()
maxFreq = t.max()
weights = t / maxFreq
def sumWeights(para):
    """Score each sentence of one synopsis from the global `weights` table.

    For every space-separated word of a sentence that has an entry in
    `weights` (looked up lower-cased), the word's weight divided by the
    sentence's character length (`len(sent)`) is added to that sentence's
    score. Scores are rounded to 4 decimal places.

    Returns a dict mapping the sentence's position in `para` (0-based) to its
    score.
    """
    weightmap = {}
    for position, sent in enumerate(para):
        lookups = (weights.get(token.lower()) for token in sent.split(' '))
        contributions = [float(w) / len(sent) for w in lookups if w is not None]
        weightmap[position] = round(sum(contributions), 4)
    return weightmap

In [25]:
from operator import itemgetter

def makeSummary(weightMap: dict, df: pd.Series, numSentences: int = 2):
    """Build an extractive summary from the highest-weighted sentences of one row.

    Parameters
    ----------
    weightMap : dict
        Maps a sentence index to its weight (the shape `sumWeights` returns).
    df : pd.Series
        A single row exposing a `Synopsis` attribute. Despite the parameter
        name, callers pass one row (`animeDf.iloc[i]`), not a whole frame.
    numSentences : int, default 2
        How many of the top-weighted sentences to keep.

    Returns
    -------
    str or None
        The selected sentences, in their original order, joined by a space.
        None when the row's synopsis is not a string (e.g. NaN).
    """
    synopsis = df.Synopsis
    # Check first: there is nothing to split or rank for a missing synopsis.
    if not isinstance(synopsis, str):
        return None

    ranked = sorted(weightMap.items(), key=itemgetter(1), reverse=True)
    topIndicies = sorted(index for index, _ in ranked[:numSentences])

    # Punctuation is kept here so the summary reads like the original text.
    synop = splitAndStrip(synopsis, removePunctuation=False)
    # The weights were computed on a separately split copy of the text; skip
    # any index that does not exist in this split instead of raising IndexError.
    return ' '.join(synop[index] for index in topIndicies if 0 <= index < len(synop))

In [26]:
# Score the sentences of every synopsis, then build one short summary per row
# and store the results in a new column.
y = [sumWeights(sentences) for sentences in x]
s = [makeSummary(y[rowPos], animeDf.iloc[rowPos]) for rowPos in range(len(y))]
animeDf['Short Summary'] = s

In [28]:
# Spot-check one row: print the full synopsis followed by its short summary.
i = 47
p = animeDf['Short Summary'].iloc[i]
print(animeDf['Synopsis'].iloc[i], '\n\n', p)

Chiyo Mihama begins her high school career as one of the strangest students in her freshman class—a tiny, 10-year-old academic prodigy with a fondness for plush dolls and homemade cooking. But her homeroom teacher, Yukari Tanizaki, is the kind of person who would hijack a student's bike to avoid being late, so "strange" is a relative word. There certainly isn't a shortage of peculiar girls in Yukari-sensei's homeroom class. Accompanying Chiyo are students like Tomo Takino, an energetic tomboy with more enthusiasm than brains; Koyomi Mizuhara, Tomo's best friend whose temper has a fuse shorter than Chiyo; and Sakaki, a tall, athletic beauty whose intimidating looks hide a gentle personality and a painful obsession with cats. In addition, transfer student Ayumu Kasuga, a girl with her head stuck in the clouds, fits right in with the rest of the girls—and she has a few interesting theories about Chiyo's pigtails! Together, this lovable group of girls experience the ups and downs of school