In [1]:
import json 
import pandas as pd

In [2]:
def _lookup(record, *keys):
    """Follow ``keys`` into nested JSON data; return None if any step is missing.

    A missing key raises KeyError, and subscripting a non-dict JSON value
    (None, string, number, list) with a string key raises TypeError. Both are
    treated as "field not available". Any other exception propagates instead
    of being silently swallowed.
    """
    try:
        for key in keys:
            record = record[key]
    except (KeyError, TypeError):
        return None
    return record


# Read the whole file first so the handle is released before any processing.
# The encoding is explicit so decoding does not depend on the platform default.
with open("Lyrics_JohnMayer.json", encoding="utf-8") as file:
    data = json.load(file)

# One flat record per song; fields that are absent in the JSON become None.
master_list = [
    {
        "title": _lookup(row, "title"),
        "release_date": _lookup(row, "release_date"),
        "album": _lookup(row, "album", "name"),
        "lyrics": _lookup(row, "lyrics"),
    }
    for row in data['songs']
]

In [3]:
# Tabulate the per-song records: one row per song, one column per extracted field.
df = pd.DataFrame(data=master_list)

In [4]:
# Remove every row that has a missing value in any column; df is modified in place.
df.dropna(inplace=True)

In [5]:
# Parse the ISO-style date strings (YYYY-MM-DD) into datetime values.
df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d')

In [6]:
# Release year formatted with strftime, so the column holds text rather than integers.
# The grid-plot cell later concatenates this value with other strings for the titles.
df["year"] = df["release_date"].dt.strftime('%Y')

In [7]:
# Lift the row limit so displayed frames are shown in full.
pd.options.display.max_rows = None

In [9]:
# df

In [11]:
# Write the current df (parsed dates and year column included) to a pickle file.
df.to_pickle("Mayer.pkl")

In [3]:
# df.lyrics[0]

In [4]:
# df.lyrics[1]

In [10]:
import re
from nltk.corpus import stopwords
# Extra filler words to treat as stop words in addition to the NLTK English list.
custom_stop_words = ['oh', 'yeah', 'ah', 'ay', 'ooh']
stop_words = set(stopwords.words('english')).union(custom_stop_words)

def clean_lyrics(lyrics, stopword_set=None):
    """Normalise raw lyrics into a space-separated string of lowercase words.

    Steps, in order:
      1. drop text inside ``[...]`` and ``(...)`` (matched within a single line);
      2. drop digits;
      3. lowercase, and turn typographic apostrophes into plain ones;
      4. per token: strip punctuation, cut everything from the first "embed"
         onwards (presumably the footer of the lyrics export - verify against
         the raw data), then discard empty tokens and stop words.

    The stop-word check runs AFTER the "embed" cut, so a token such as
    "youembed" is reduced to "you" and then filtered. It is also applied to the
    token both with and without apostrophes, so that list entries such as
    "don't" actually match the text.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        Cleaned words joined by single spaces (empty string if nothing remains).
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Remove text inside brackets and parentheses
    text = re.sub(r'\[.*?\]|\(.*?\)', '', lyrics)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Lowercase; unify the apostrophe so contractions look like the stop-word entries
    text = text.lower().replace('\u2019', "'")

    kept = []
    for raw in text.split():
        # Strip punctuation but keep apostrophes for the contraction check
        token = re.sub(r"[^\w\s']", '', raw).split('embed')[0]
        word = token.replace("'", '')
        if not word or word in stopword_set or token in stopword_set:
            continue
        kept.append(word)

    return ' '.join(kept)

# Store a cleaned version of every song's lyrics in a new column
df['clean_lyrics'] = df['lyrics'].map(clean_lyrics)


In [5]:
# df.clean_lyrics[0]

In [6]:
# df.clean_lyrics[3]

In [13]:
# Write df, which now also carries the clean_lyrics column, to a separate pickle file.
df.to_pickle("Mayer_cleaned.pkl")

In [14]:
# Number of songs per album
songs_by_album = (
    df.groupby('album')
      .size()
      .reset_index(name='count')
)
# Display the albums from most to fewest songs (the stored frame is left unsorted)
(
    songs_by_album
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,album,count
0,Born and Raised,13
1,Continuum,13
2,Room For Squares [Japanese Edition],13
3,Battle Studies,12
4,The Search for Everything,12
5,Paradise Valley,11
6,Sob Rock,10
7,Heavier Things,9
8,Inside Wants Out,5
9,Where The Light Is: John Mayer Live in Los Ang...,5


In [15]:
# Album names as a plain Python list
album_list = list(songs_by_album['album'])

In [16]:
# album_list

['Any Given Thursday',
 'Battle Studies',
 'Born and Raised',
 'Continuum',
 'Heavier Things',
 'Inside Wants Out',
 'Paradise Valley',
 'Room For Squares [Japanese Edition]',
 'Sob Rock',
 'The Search for Everything',
 'Waiting on the World to Change [Limited Edition EP]',
 'Where The Light Is: John Mayer Live in Los Angeles',
 'Yes We Can: Voices of a Grassroots Movement']

In [17]:
# Albums to exclude from the analysis
album_drop = [
    'Waiting on the World to Change [Limited Edition EP]',
    'Yes We Can: Voices of a Grassroots Movement',
]

In [18]:
# Keep only the songs whose album is not on the exclusion list
df = df[~df['album'].isin(album_drop)]

In [19]:
# Recount the songs per album on the current df
songs_by_album = (
    df.groupby('album')
      .size()
      .reset_index(name='count')
)
# Display the albums from most to fewest songs (the stored frame is left unsorted)
(
    songs_by_album
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,album,count
0,Born and Raised,13
1,Continuum,13
2,Room For Squares [Japanese Edition],13
3,Battle Studies,12
4,The Search for Everything,12
5,Paradise Valley,11
6,Sob Rock,10
7,Heavier Things,9
8,Inside Wants Out,5
9,Where The Light Is: John Mayer Live in Los Ang...,5


In [21]:
# Concatenate the cleaned lyrics of all songs on an album into one string per album
grouped_lyrics = (
    df.groupby('album')['clean_lyrics']
      .apply(' '.join)
      .reset_index()
)


In [7]:
# grouped_lyrics

In [23]:
# Latest year value found among each album's songs
recent_year = (
    df.groupby('album')['year']
      .agg('max')
      .reset_index()
)

In [24]:
# Attach each album's year to its combined lyrics (songs_by_album is rebound here)
songs_by_album = pd.merge(grouped_lyrics, recent_year, on='album')

In [28]:
# Newest albums first, renumbered from 0 so positional labels follow the sort order
songs_by_album = (
    songs_by_album
    .sort_values(by='year', ascending=False)
    .reset_index(drop=True)
)

In [8]:
# songs_by_album

In [30]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import numpy as np
import matplotlib.pyplot as plt
import PIL.Image



In [31]:
# Load the guitar picture as a pixel array; it serves as the word-cloud mask
with PIL.Image.open("guitar_6.png") as guitar_image:
    mask = np.array(guitar_image)

# Colour function that takes word colours from the same picture
colormap = ImageColorGenerator(mask)

In [None]:
# Word cloud for the first row of songs_by_album, drawn inside the mask shape.
wordcloud = WordCloud(stopwords = STOPWORDS,
                      width=800, height=800,
                      mask = mask,
                      background_color = 'white',
                      contour_color = 'black',
                      contour_width = 2,
                      min_font_size = 3,
                      max_words = 400).generate(songs_by_album.loc[0, 'clean_lyrics'])

# Recolour the words using the colours of the mask image
wordcloud.recolor(color_func = colormap)
plt.imshow(wordcloud)
plt.axis("off")
# Take the title from the same row as the lyrics so the label cannot drift
# from the data (it was previously a hard-coded album name).
plt.title(songs_by_album.loc[0, 'album'], fontsize=15)
plt.show()

In [None]:
# Lay out one word cloud per album on a grid that is four panels wide, with as
# many rows as the number of albums requires (11 albums -> 3 x 4 grid).
n_albums = len(songs_by_album)
n_cols = 4
n_rows = max(1, (n_albums + n_cols - 1) // n_cols)

# squeeze=False keeps axs two-dimensional even when there is a single row
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(30, 25), squeeze=False)

# Remove the trailing panels that will not receive a word cloud
for unused_ax in axs.flat[n_albums:]:
    fig.delaxes(unused_ax)

# Built once: the wordcloud defaults plus two extra filler words
grid_stopwords = set(list(STOPWORDS)+["im", "got"])

for i in range(n_albums):

    wordcloud = WordCloud(stopwords = grid_stopwords,
                      width=1000, height=800,
                      mask = mask,
                      background_color = 'white',
                      contour_color = 'black',
                      contour_width = 2,
                      min_font_size = 3,
                      max_words = 400).generate(songs_by_album.loc[i, 'clean_lyrics'])

    wordcloud.recolor(color_func = colormap)

    # Panels are filled row by row
    ax = axs[i // n_cols, i % n_cols]

    # Add a title to the word cloud with the corresponding album name
    # (the year column holds text, so plain string concatenation works)
    album_title = songs_by_album.loc[i, 'album'] + '\n-Release Year '+ songs_by_album.loc[i, 'year']
    ax.set_title(album_title, fontsize=20)
    ax.imshow(wordcloud)
    ax.axis('off')


# Save the figure as a PNG file
plt.savefig('wordclouds.png', dpi=300, bbox_inches='tight')