# Bob Dylan songs 1961 - 2020


**This dataset contains songs written by Bob Dylan and released between 1961 and 2020.**


**There are 4 columns:**

* release_year - year when the song was first released,
* album - name of the album on which the track appears,
* title - title of the song,
* lyrics - lyrics of the track

**Acknowledgements**


This dataset contains only songs that Bob Dylan himself has written and published.
There are many songs that Bob Dylan only covered, so I didn't include them because he's not the original author.
For instance, the album World Gone Wrong contains only old folk songs.


**Inspiration**


I'm a great Bob Dylan fan. I have listened to his songs almost every day for many years.
I also play them and sing them, so now I decided to make a dataset and play with them on Kaggle as well.

I also want to mention - of course these are not all of Bob Dylan's songs - there might be more that I don't know about.

**For instance:**


I didn't include Wanted Man - written by Bob Dylan but played mostly by Johnny Cash. There's one video of Bob Dylan playing this song with Johnny Cash:


[video](https://www.youtube.com/watch?v=iiRMfb3Z9hg)

amazing version by the way... 

anyway:

Love Is Just a Four-Letter Word - a song written by Bob Dylan that never appeared on his albums or in his live performances, so I didn't include it


And I'm sure there are more songs that Dylan has written but that we don't know about because Bob didn't want to publish them.


## 1. Wordclouds

In [None]:
import pandas as pd

from collections import Counter, OrderedDict
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

from wordcloud import WordCloud, STOPWORDS
from PIL import Image

import re

import numpy as np

import requests
from io import BytesIO

In [1]:
# Load the song dataset; the cells below read its release_year and lyrics columns.
df = pd.read_csv('clear.csv')

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

This dataset is free of duplicates.

In [None]:
# Select the rows that pandas flags as duplicates of an earlier row.
df[df.duplicated()]

In [None]:
def string_to_lower(series_name: str, data_frame=None):
    """Lowercase one column of a data frame in place.

    Every value is converted with ``str()``, lowercased, split on
    whitespace and re-joined with single spaces, so runs of whitespace
    collapse to one space and leading/trailing whitespace is removed.

    arguments:
    series_name: str - name of the column to normalise
    data_frame: pandas DataFrame - frame to modify; when omitted, the
        module-level ``df`` is used

    returns: None (the column is overwritten inside ``data_frame``)
    """
    if data_frame is None:
        # Resolved at call time so that omitting the argument keeps
        # targeting the notebook's global frame.
        data_frame = df

    # Write to the frame that was passed in, not unconditionally to the
    # global one. str(value) also turns non-string cells into text.
    data_frame[series_name] = data_frame[series_name].apply(
        lambda value: ' '.join(str(value).lower().split()))




In [None]:
# Collect the names of all non-numeric columns of df ...
string_columns = df.select_dtypes(exclude=[np.number]).columns

# ... and lowercase each of them in place.
for column in string_columns:
    string_to_lower(column)

In [None]:
# Image that word_cloud_from_shape downloads by default and uses as the word cloud mask.
url = "https://raw.githubusercontent.com/Cloudy17g35/bob_dylan_songs/main/dylan_photo.jpeg"

In [None]:

def word_cloud_from_shape(column_name: str, data_frame=df, url=url):
    """Draw a word cloud of one text column, shaped by a downloaded image.

    The words of every value in the column are counted (stopwords and empty
    tokens are skipped), the image at ``url`` is downloaded and used as the
    WordCloud mask, and the result is displayed with matplotlib.

    arguments:
    column_name: str - name of the text column to visualise
    data_frame: pandas DataFrame - frame holding the column (by default df)
    url: str - address of the image used as mask (by default the module url)

    returns: None (the figure is shown, nothing is returned)

    raises: requests.HTTPError when the image cannot be downloaded
    """
    clear = []
    for lyric in data_frame[column_name]:
        # split() already removes all whitespace, so only the typographic
        # apostrophe and commas still need to be normalised.
        for raw_word in lyric.split():
            word = raw_word.replace('’', "'").replace(',', '')
            # A token made only of commas becomes empty; do not count it.
            if word and word not in STOPWORDS:
                clear.append(word)

    freq_table = Counter(clear)

    # Fail early with a clear HTTP error rather than letting PIL choke on an
    # error page, and never wait forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))

    dylan_mask = np.array(img)

    wc = WordCloud(background_color='black', max_words=1000, mask=dylan_mask)
    wc.generate_from_frequencies(freq_table)

    plt.figure(figsize=(20, 20))
    plt.xticks([])
    plt.yticks([])
    plt.imshow(wc, interpolation='nearest')
    plt.show()

In [None]:
# Print each non-numeric column's name, then draw its image-shaped word cloud.
for column in string_columns:
    print(column)
    word_cloud_from_shape(column)

In [None]:
def frequency_for_year(year: int):
    """Count the non-stopword words of all songs released in one year.

    arguments:
    year: int - release year to look up in df['release_year']

    returns: Counter mapping word -> number of occurrences, or a plain
    message string when no song in df has that release year
    """
    known_years = set(df['release_year'].values)
    if year not in known_years:
        return f"there's no songs from year {year}"

    songs_of_year = df[df['release_year'] == year]

    def normalise(token):
        # Unify the typographic apostrophe and drop spaces and commas.
        return token.replace('’', "'").strip().replace(' ', '').replace(',', '')

    # Walk every whitespace-separated token of every lyric, in order.
    cleaned = (
        normalise(token)
        for lyric in songs_of_year['lyrics']
        for token in lyric.split()
    )
    return Counter(word for word in cleaned if word not in STOPWORDS)

In [None]:
def make_wordcloud_for_year(frequency: dict):
    """Draw a word cloud from a word-frequency table.

    arguments:
    frequency: dict - mapping word -> count, e.g. the table produced by
        frequency_for_year

    returns: None (the figure is shown with matplotlib)
    """
    # Build the cloud strictly from the argument so the function does not
    # depend on a global `year` being defined by the calling cell.
    wc = WordCloud(max_font_size=40, width=500, height=200)
    wc = wc.generate_from_frequencies(frequency)

    plt.figure(figsize=(20, 20))
    plt.imshow(wc, interpolation='bilinear')
    plt.xticks([])
    plt.yticks([])
    plt.show()

In [None]:
# Show one word cloud for every year from 1961 to 2020.
for year in range(1961, 2021):
    print(year)
    frequency = frequency_for_year(year)

    # frequency_for_year returns a message string instead of a frequency
    # table when the dataset has no song for that year. Checking for it
    # explicitly avoids hiding unrelated AttributeErrors behind an except.
    if isinstance(frequency, str):
        print("theres no song from this year!")
        continue

    make_wordcloud_for_year(frequency)

In [None]:
def frequency_for_years(start: int, end: int):
    """Count the non-stopword words of all songs released in a span of years.

    arguments:
    start: int - first year of the span
    end: int - last year of the span

    The rows are selected with df['release_year'].between(start, end).

    returns: Counter mapping word -> number of occurrences
    """
    in_span = df['release_year'].between(start, end)
    songs_in_span = df[in_span]

    counts = Counter()
    for lyric in songs_in_span['lyrics']:
        for token in lyric.split():
            # Unify the typographic apostrophe and drop spaces and commas.
            cleaned = token.replace('’', "'").strip().replace(' ', '').replace(',', '')
            if cleaned in STOPWORDS:
                continue
            counts[cleaned] += 1

    return counts
    

In [None]:
def make_wordcloud_for_decade(frequency):
    """Draw a word cloud from the word frequencies of one decade.

    arguments:
    frequency: dict - dictionary with frequencies of words

    The cloud is displayed with matplotlib; nothing is returned.
    """
    cloud_settings = dict(max_font_size=40, width=500, height=200)
    cloud = WordCloud(**cloud_settings).generate_from_frequencies(frequency)

    # Large square canvas with the axis ticks hidden.
    plt.figure(figsize=(20, 20))
    plt.imshow(cloud, interpolation='bilinear')
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    plt.show()

In [None]:
# frequency_for_years selects rows with Series.between, which includes both
# end points. The spans are therefore non-overlapping (1960-1969, 1970-1979,
# ...) so that no release year is counted in two decades.
first_year: int = 1960
last_year: int = 2020

for start in range(first_year, last_year + 1, 10):
    # The final span is cut off at the last year covered by the dataset.
    end = min(start + 9, last_year)
    print(f"{start} - {end}")

    frequency = frequency_for_years(start, end)
    # Skip spans without any counted word instead of drawing an empty cloud.
    if not frequency:
        print("theres no song from these years!")
        continue

    make_wordcloud_for_decade(frequency)



## 2. SONG DISTRIBUTION BY YEAR

Let's see the distribution of songs for each year:

In [None]:
# Number of songs per release year, ordered by year (the index) rather than by count.
distribution = df['release_year'].value_counts().sort_index()

In [None]:
# Bar chart of the number of songs per release year.
fig = px.bar(x=distribution.index, y=distribution.values)

# White text on a black canvas, centred title, grid lines hidden.
fig.update_layout(
    font=dict(family='Lato', size=18, color='white'),
    # The bold tag must be closed with </b>; the title previously opened a
    # second <b> instead of closing the first one.
    title=dict(text='<b>Bob Dylan songs in years 1961- 2020</b>',
               font=dict(size=30), x=.5),
    paper_bgcolor='black', plot_bgcolor='black',
    xaxis=dict(title='Year of release', showgrid=False),
    yaxis=dict(title='number_of_songs', showgrid=False))