In [14]:
import pandas as pd 
import numpy as np
VOCAB = pd.read_csv("OUTPUTS/VOCAB_AllSongs.csv")
TOKENS = pd.read_csv("OUTPUTS/TOKENS_AllSongs.csv")

In [12]:
# Add some more metadata to VOCAB; make term_str index, plus add a aterm_str column
VOCAB = VOCAB.set_index('term_str')
VOCAB['term'] = VOCAB.index

In [13]:
VOCAB.head()

Unnamed: 0_level_0,n,p,char_len,term
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
you,282,0.035294,3,you
the,267,0.033417,3,the
I,180,0.022528,1,I
me,155,0.019399,2,me
to,146,0.018273,2,to


In [18]:
TOKENS.iloc[87:100]

Unnamed: 0,Line_id,Song_id,Token_num,token_str
87,17,Song 0,4,dukedom
88,19,Song 0,0,'So
89,19,Song 0,1,hey
90,19,Song 0,2,yeah
91,19,Song 0,3,yeah
92,19,Song 0,4,yeah'
93,20,Song 0,0,Yes-a
94,20,Song 0,1,I
95,20,Song 0,2,oh
96,20,Song 0,3,I'm


In [None]:
def token_to_padded(token, grouper=['Line_id'], term_str='token_str'):
    ohco = token.index.names # We preserve these since they get lost in the shuffle
    padded = token.groupby(grouper)\
        .apply(lambda x: '<s> ' + ' '.join(x[term_str]) + ' </s>')\
        .apply(lambda x: pd.Series(x.split()))\
        .stack().to_frame('term_str')
    #padded.index.names = ohco
    return padded

In [20]:
PADDED = token_to_padded(TOKENS, grouper='Line_id', term_str='token_str')

  .apply(lambda x: '<s> ' + ' '.join(x[term_str]) + ' </s>')\


In [22]:
PADDED

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str
Line_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,<s>
1,1,'Duke
1,2,Duke
1,3,Duke
1,4,Duke
...,...,...
898,3,that
898,4,look
898,5,at
898,6,that)


In [27]:
# NGRAM from Padded
ngram = 3
widx = [f"w{i}" for i in range(ngram)]

def padded_to_ngrams(padded, grouper=['Line_id'], n=2):
    
    ohco = padded.index.names
    ngrams = padded.groupby(grouper, group_keys=False)\
        .apply(lambda x: pd.concat([x.shift(0-i) for i in range(n)], axis=1))\
        .reset_index(drop=True)
    ngrams.index = padded.index
    ngrams.columns = widx
    
    return ngrams


In [28]:
NGRAMS = padded_to_ngrams(PADDED, 'Line_id', ngram)

In [29]:
NGRAMS

Unnamed: 0_level_0,Unnamed: 1_level_0,w0,w1,w2
Line_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,<s>,'Duke,Duke
1,1,'Duke,Duke,Duke
1,2,Duke,Duke,Duke
1,3,Duke,Duke,of
1,4,Duke,of,Earl'
...,...,...,...,...
898,3,that,look,at
898,4,look,at,that)
898,5,at,that),</s>
898,6,that),</s>,


In [30]:
# Genrate Models

def ngrams_to_models(ngrams):
    global widx
    n = len(ngrams.columns)
    model = [None for i in range(n)]
    for i in range(n):
        if i == 0:
            model[i] = ngrams.value_counts('w0').to_frame('n')
            model[i]['p'] = model[i].n / model[i].n.sum()
            model[i]['i'] = np.log2(1/model[i].p)
        else:
            model[i] = ngrams.value_counts(widx[:i+1]).to_frame('n')    
            model[i]['cp'] = model[i].n / model[i-1].n
            model[i]['i'] = np.log2(1/model[i].cp)
        model[i] = model[i].sort_index()
    return model

In [31]:
M = ngrams_to_models(NGRAMS)

In [36]:
M[2][100:125]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,cp,i
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
'Doing,what,you,1,1.0,0.0
'Dont,ease,the,1,1.0,0.0
'Du,du,du,2,1.0,0.0
'Duke,Duke,Duke,8,1.0,0.0
'Ethel,you,shameless',1,1.0,0.0
'Even,white,boys,2,1.0,0.0
'Everywhere,you,go,2,1.0,0.0
'Flashed,her,right,1,1.0,0.0
'Freddie,Jackson,everything,1,1.0,0.0
'From,the,Chi,1,0.5,1.0


In [33]:
# Generate Sentences Using Our Bigram Model based on the probabilities of words in sequence

def generate_text(M, n=250):
    
    if len(M) < 3:
        raise ValueError("Must have trigram model generated.")
    
    # Start list of words
    first_word = M[1].loc['<s>'].sample(weights='cp').index[0]
    
    words = ['<s>', first_word]
    
    for i in range(n):
        
        bg = tuple(words[-2:])

        # Try trigram model
        try:
            next_word = M[2].loc[bg].sample(weights='cp').index[0]

        # If not found in model, back off ...
        except KeyError as e1:
            try:
                # Get the last word in the bigram
                ug = bg[1]
                next_word = M[1].loc[ug].sample(weights='cp').index[0]
            
            except KeyError as e2:
                next_word = M[0].sample(weights='p').index[0]
                
        words.append(next_word)
    
    
    text = ' '.join(words[2:])
    print('\n\n'.join([str(i+1) + ' ' + line.replace('<s>','')\
        .strip().upper() for i, line in enumerate(text.split('</s>'))]))

In [40]:
generate_text(M, n=500)

1 STOP

2 A MOHAIR SUIT

3 'YEAH I DID.'

4 'I LOVE THE WAY LOVE'S SUPPOSED TO BE

5 SUPPOSED TO BE IN LOVE IN EAST L.A'

6 MAMAS COME ON'

7 BY CARLOS SANTANA

8 AND I'M THINKIN' BOUT STICKIN' 'TO THE BEANPOLE DAMES IN THE WORLD LIKE I'M THE ONLY ONE WHO KNOWS YOUR HEART 'ONLY GIRL IN THE WORLD

9 WOULD YOU?'

10 MAYBE THEY'RE BLINDED

11 SEXY BODY GO (BAD BOY)'

12 

13 RINGS)

14 'UH DANCE FOR ME

15 THROW DOWN ' 'DIAL 1-900-MIXALOT' 'AND KICK THEM NASTY THOUGHTS'

16 FAKE FAKE

17 

18 C'MON

19 GOT IT GOIN' ON A LOT OF SIMPS WON'T LIKE THIS SONG 'CAUSE THEM PUNKS LIKE TO HIT 'EM AND I PULL UP TOUGH' 'CAUSE YOU AIN'T THAT AVERAGE GROUPIE I'VE SEEN THEM DANCIN' TO HELL WITH ROMANCIN' SHE'S SWEAT WET GOT IT GOIN' LIKE A MAN' 'ONLY GIRL IN THE WORLD

20 TOO LATE.'

21 TO BE IN MAGAZINES'

22 KICK THEM NASTY THOUGHTS'

23 DEFINITELY SET THIS PARTY OFF RIGHT

24 

25 'I PUMP THAT'

26 

27 HIGH HIGH

28 PLAYBOY 'CAUSE SILICONE PARTS ARE MADE FOR TOYS I WANT YOU KNOW WHAT YOU'RE WORTH

2