# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
from nltk.corpus import stopwords
from future.utils import iteritems

# Reading Data
Data Source: [55000+ Song Lyrics](https://www.kaggle.com/mousehead/songlyrics)

In [2]:
# Load the song-lyrics dataset; the relative path means the CSV must sit in
# the notebook's working directory.
df = pd.read_csv("songdata.csv")
# Display (rows, columns) as the cell output.
df.shape

(57650, 4)

In [3]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


#### Number of Artists

In [4]:
df["artist"].nunique()

643

#### Utility Function to Remove Punctuation

In [5]:
def remove_punctuation(s):
    """Return ``s`` with every character from ``string.punctuation`` removed.

    Characters that are not listed in ``string.punctuation`` are left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return s.translate(drop_table)

In [6]:
def add2dict(d, k, v):
    """Append ``v`` to the list stored at ``d[k]``, creating the list on first use."""
    d.setdefault(k, []).append(v)

### Building a Corpus

In [7]:
# Glue every song's lyrics into one string (space-separated), lower-case it,
# then cut it into individual lines on the newline character.
corpus = " ".join(df["text"].values.tolist()).lower().split("\n")

In [8]:
len(corpus)

2370067

In [9]:
corpus[0:5]

["look at her face, it's a wonderful face  ",
 'and it means something special to me  ',
 'look at the way that she smiles when she sees me  ',
 'how lucky can one fellow be?  ',
 '  ']

In [10]:
def states_transition_dicts(corpus):
    """Collect the raw tables of a second-order Markov model from lyric lines.

    Each line is right-stripped, lower-cased, stripped of punctuation and split
    on whitespace before being counted.

    Parameters
    ----------
    corpus : iterable of str
        One lyric line per element.

    Returns
    -------
    initial : dict
        ``{first_word: count}`` with float counts of words that open a line.
    second_word : dict
        ``{first_word: [second_word, ...]}``, one list entry per occurrence.
    transitions : dict
        ``{(word_i_minus_2, word_i_minus_1): [word_i, ...]}``. The sentinel
        ``'END'`` is appended under the pair formed by a line's last two words.

    Notes
    -----
    Lines with fewer than two tokens are skipped. A one-word line cannot be
    represented in these tables (there is no second word and no pair to attach
    ``'END'`` to); counting it in ``initial`` anyway would create a start word
    that may have no ``second_word`` entry, which makes sampling fail with a
    ``KeyError``.
    """
    initial = {}
    second_word = {}
    transitions = {}
    for line in corpus:
        tokens = remove_punctuation(line.rstrip().lower()).split()
        if len(tokens) < 2:
            # Blank and one-word lines carry no usable transition information.
            continue

        last = len(tokens) - 1
        for i, t in enumerate(tokens):
            if i == 0:
                initial[t] = initial.get(t, 0.) + 1
                continue

            t_1 = tokens[i - 1]
            if i == last:
                # Mark that the line may stop after the pair (t_1, t).
                add2dict(transitions, (t_1, t), 'END')
            if i == 1:
                add2dict(second_word, t_1, t)
            else:
                add2dict(transitions, (tokens[i - 2], t_1), t)

    return initial, second_word, transitions

In [11]:
# Collect first-word counts plus the second-word and transition successor
# lists from every line of the full corpus.
initial, second_word, transitions = states_transition_dicts(corpus)

### Normalizing the Distributions

In [12]:
def normalize_distribution(initial):
    """Rescale the values of ``initial`` in place so that they sum to 1.

    Parameters
    ----------
    initial : dict
        ``{key: count}`` mapping; it is modified in place.

    Returns
    -------
    dict
        The same ``initial`` object, now holding ``count / total`` per key.
        An empty dict is returned unchanged.
    """
    initial_total = sum(initial.values())
    # Re-assigning existing keys does not resize the dict, so updating it while
    # iterating over its keys is safe.
    for t in initial:
        initial[t] /= initial_total
    return initial

In [13]:
# Turn the first-word counts into proportions of the total count.
initial = normalize_distribution(initial)

#### Building a Dictionary of Probabilities

In [14]:
def list2pdict(ts):
    """Convert a list of tokens into a ``{token: probability}`` dictionary.

    Parameters
    ----------
    ts : list
        Observed tokens; repeated tokens raise that token's probability.

    Returns
    -------
    dict
        Relative frequency of every distinct token (values sum to 1). An empty
        list yields an empty dict.
    """
    n = len(ts)
    d = {}
    for t in ts:
        d[t] = d.get(t, 0.) + 1
    # Divide the counts by the number of observations; without this step the
    # dict holds raw counts (>= 1), and cumulative sampling against a draw in
    # [0, 1) would always return the first key.
    for t in d:
        d[t] /= n
    return d

In [15]:
def build_probability_distributions(second_word, transitions):
    """Replace each successor list by a probability dict, in place.

    Parameters
    ----------
    second_word : dict
        ``{first_word: [second_word, ...]}``.
    transitions : dict
        ``{(word_a, word_b): [next_word, ...]}``.

    Returns
    -------
    tuple
        ``(second_word, transitions)`` -- the same two objects, whose values
        are now the dicts produced by ``list2pdict``.

    Notes
    -----
    Values that are already dicts are left alone. This keeps the function
    idempotent, so re-running the notebook cell that calls it does not feed an
    existing distribution back into ``list2pdict``.
    """
    for table in (second_word, transitions):
        # Only existing keys are re-assigned, so the dict is not resized while
        # it is being iterated.
        for key, successors in table.items():
            if not isinstance(successors, dict):
                table[key] = list2pdict(successors)

    return second_word, transitions

In [16]:
# Convert the successor lists of both tables into per-key dictionaries via
# list2pdict.
second_word, transitions = build_probability_distributions(second_word, transitions)

In [17]:
def sample_word(d):
    """Draw one key from ``d`` with probability proportional to its value.

    Parameters
    ----------
    d : dict
        ``{token: probability}``; the values are expected to sum to 1.

    Returns
    -------
    The sampled key.

    Raises
    ------
    ValueError
        If ``d`` is empty.
    """
    if not d:
        raise ValueError("cannot sample from an empty distribution")

    p0 = np.random.random()
    cumulative = 0
    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t
    # Floating-point rounding can leave the running total marginally below 1,
    # so a draw very close to 1 may fall through the loop. Return the last key
    # instead of failing.
    return t

In [18]:
def generate(initial, second_word, transitions, n_lines=4):
    """Print ``n_lines`` lines sampled from the second-order Markov model.

    Parameters
    ----------
    initial : dict
        ``{first_word: probability}``.
    second_word : dict
        ``{first_word: {second_word: probability}}``.
    transitions : dict
        ``{(word_a, word_b): {next_word: probability}}``; the token ``'END'``
        terminates a line.
    n_lines : int, optional
        Number of lines to print (default 4).

    Returns
    -------
    None
        The lines are written with ``print``.

    Notes
    -----
    If the model has no entry for the current state (for example, a first word
    without any recorded second word), the line simply ends there instead of
    raising ``KeyError``.
    """
    for _ in range(n_lines):
        w0 = sample_word(initial)
        sentence = [w0]

        followers = second_word.get(w0)
        if followers:
            w1 = sample_word(followers)
            sentence.append(w1)
            while True:
                successors = transitions.get((w0, w1))
                if not successors:
                    break
                w2 = sample_word(successors)
                if w2 == 'END':
                    break
                sentence.append(w2)
                # Slide the two-word context window forward by one word.
                w0, w1 = w1, w2

        print(" ".join(sentence))

In [19]:
# Print sampled lines from the model built on the full corpus.
generate(initial, second_word, transitions)

why i had to go
change that frown back to me
one among the others nothing much to say goodbye
arrows wouldnt do if i can see that youre oh so sad so quiet


# Generating Eminem Style Rap

In [20]:
# Keep only the rows whose artist column equals "Eminem".
eminem = df.loc[df["artist"].eq("Eminem")]
eminem.shape

(70, 4)

In [21]:
eminem.head()

Unnamed: 0,artist,song,link,text
5062,Eminem,25 to Life,/e/eminem/25+to+life_20883525.html,Too late for the other side \nCaught in a cha...
5063,Eminem,3 A.M.,/e/eminem/3+am_20789506.html,Oh oh \nOh(yea) oh(yea) oh(yea) \nOh oh \nO...
5064,Eminem,3 Verses,/e/eminem/3+verses_20049939.html,I'm the illest rapper to hold a cordless \nPa...
5065,Eminem,Above The Law,/e/eminem/above+the+law_20914210.html,"[Intro:] \nThe poor stay poor, the rich get r..."
5066,Eminem,Buffalo Bill,/e/eminem/buffalo+bill_20866830.html,It fits perfect \n \nBetter watch out sucka ...


In [22]:
# Same preparation as for the full corpus: join the lyrics with spaces,
# lower-case them, and split the result into lines on the newline character.
eminem_corpus = " ".join(eminem["text"].values.tolist()).lower().split("\n")

In [23]:
len(eminem_corpus)

5160

In [24]:
eminem_corpus[:5]

['too late for the other side  ',
 'caught in a chase  ',
 'twenty five to life  ',
 'too late for the other side  ',
 'caught in a chase  ']

In [25]:
def generate_eminem_rap(eminem_corpus):
    """Build the three Markov-model tables from ``eminem_corpus`` and return them.

    Despite its name, this function produces no text itself: it runs
    ``states_transition_dicts``, ``normalize_distribution`` and
    ``build_probability_distributions`` in that order and returns the resulting
    ``(initial, second_word, transitions)`` tuple, which can then be passed to
    ``generate``.
    """
    first_counts, followers, pair_successors = states_transition_dicts(eminem_corpus)
    first_probs = normalize_distribution(first_counts)
    follower_probs, pair_probs = build_probability_distributions(followers, pair_successors)
    return first_probs, follower_probs, pair_probs

In [26]:
# Build a separate set of model tables from the Eminem-only lines, stored
# under *_em names so the full-corpus tables are not overwritten.
initial_em, second_word_em, transitions_em = generate_eminem_rap(eminem_corpus)

In [27]:
# Print sampled lines from the Eminem-only model.
generate(initial_em, second_word_em, transitions_em)

boys like how vanilla ice gonna diss you
tell em left or right head or gut
hip hop in its
we blowin up like spontaneous human combustion
