In [None]:
! pip install nltk
import matplotlib.pyplot as plt
from os.path import isfile, join
from os import listdir
import zipfile
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 
from collections import Counter
import operator

In [None]:
# Unpack the lyrics archive so the later cells can read the plain-text files.
# NOTE(review): both paths are absolute and machine-specific; adjust them before running elsewhere.
path_to_zip_file = "/home/ana/mytemp/DIKW/CDSP_Python/programma/datasets/data_for_windows.zip"
directory_to_extract_to = "/home/ana/temp/tmp1"
with zipfile.ZipFile(path_to_zip_file, mode='r') as archive:
    archive.extractall(path=directory_to_extract_to)

In [None]:
# Collect every lyrics file from the extracted archive and wrap them in an NLTK corpus reader.
corpus_root = directory_to_extract_to + "/data/lyrics/"
file_ext = "txt"
# Match on the real extension (".txt"): a bare endswith("txt") would also accept a
# name such as "notestxt". Sorting makes the file order reproducible, because
# listdir() returns entries in arbitrary order.
fileids = sorted(
    f for f in listdir(corpus_root)
    if isfile(join(corpus_root, f)) and f.lower().endswith("." + file_ext)
)
corpus = PlaintextCorpusReader(corpus_root, fileids)

In [None]:
# Gather the raw lyrics of every file in one dataframe, indexed by song_id (the file name).
df_corps = pd.DataFrame(
    {'Text': [corpus.raw(fileid) for fileid in fileids]},
    index=pd.Index(list(fileids), name='song_id'),
)
df_corps

In [None]:
# Remove songs having 10 lines or less; a song's line count is its number of newline characters.
df_corps['line_count'] = df_corps['Text'].str.count('\n')
# .copy() detaches the filtered frame from the original, so columns added to it in later
# cells do not trigger pandas' SettingWithCopyWarning.
df_corps = df_corps[df_corps['line_count'] > 10].copy()
# Show the remaining songs ordered by length (shortest first) to check the filter.
df_corps.sort_values("line_count")

In [None]:
# Cleanup the text
def preprocess(sentence):
    """Normalise a piece of text into a space-separated string of content words.

    Steps, in order: convert to ``str`` and lower-case, drop the literal
    ``{html}`` marker, strip ``<...>`` tags, strip tokens starting with
    ``http``, remove digits, tokenize on word characters (``\\w+``), and keep
    only tokens longer than two characters that are not in NLTK's English
    stop-word list.

    Parameters
    ----------
    sentence : object
        The text to clean; it is passed through ``str()`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, in their original order.

    Notes
    -----
    Calls ``stopwords.words('english')``, so the NLTK stop-word corpus must be
    available.
    """
    text = str(sentence).lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)    # HTML-like tags
    text = re.sub(r'http\S+', '', text)  # URLs
    text = re.sub('[0-9]+', '', text)    # digits
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Load the stop words once per call and keep them in a set; the list was previously
    # re-read for every single token and then searched linearly.
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return " ".join(filtered_words)


# Store the cleaned version of every song's lyrics next to the raw text.
df_corps['cleanText'] = df_corps['Text'].map(preprocess)
df_corps

In [None]:
# Create the term frequencies for a couple of songs

song_list = ["Pearl_Jam_Black.txt", "James_Brown_Sex_Machine.txt",
               "The_Blues_Brothers_Everybody_Needs_Somebody_To_Love.txt","Justin_Timberlake_Cry_Me_A_River.txt"]

# Keep at most this many of the most frequent words per song.
# (Named max_terms so the builtin max() is not shadowed for the rest of the notebook.)
max_terms = 100

# Collect one record per (song, word) and build the frame once at the end. Concatenating
# one-row frames inside the loop is quadratic and leaves every row with index label 0.
term_rows = []
for song_id in song_list:
    list_with_words = df_corps.loc[song_id, 'cleanText'].lower().split()
    total = len(list_with_words)  # number of (cleaned) words in the song
    freq = nltk.FreqDist(list_with_words)
    # most_common() sorts all counts, so call it once per song rather than twice per word.
    for word, n in freq.most_common()[:max_terms]:
        term_rows.append({'song_id': song_id, 'word': word, 'n': n, 'total': total})

df_term_freq = pd.DataFrame(term_rows, columns=['song_id', 'word', 'n', 'total'])

# Term frequency = occurrences of the word / number of words in the song.
df_term_freq["term_frequency"] = df_term_freq["n"] / df_term_freq["total"]

df_term_freq.sort_values("n", ascending=False)

## Zipf’s law

Zipf’s law states that the frequency with which a word appears is inversely proportional to its rank. Let's have a look at a single song and see whether the frequency times the rank is roughly constant. Because the rank here increments by one for every word, two words with the same frequency get different frequency * rank values.

In [None]:
# Count how often each word occurs in the cleaned lyrics of a single song.
song="Michael_Jackson_Wanna_Be_Startin__Something.txt"
words_doc = df_corps.loc[song, 'cleanText'].split()
frequency = Counter(words_doc)

# (word, count) pairs ordered by descending count; ties keep first-seen order.
collection = frequency.most_common()

# Build the whole table in one go instead of enlarging the frame one row at a time.
# Rank simply counts up from 1, so words with equal frequency still get distinct ranks.
column_header = ['Rank', 'Frequency', 'Frequency * Rank']
df_rank = pd.DataFrame(
    [[rank, freq, rank * freq] for rank, (word, freq) in enumerate(collection, start=1)],
    index=[word for word, freq in collection],
    columns=column_header,
)
df_rank.loc[df_rank["Frequency"]>10]


Let's do the same with more songs; this time, words within a song that have the same frequency are given the same rank.

In [None]:
# method="min" gives all the words in a song with the same number of occurrences the same rank
counts_per_song = df_term_freq.groupby("song_id")["n"]
df_term_freq["rank"] = counts_per_song.rank(method="min", ascending=False)
df_term_freq.sort_values(by="rank", ascending=True)

And plot the ranks of each song against the term frequencies:

In [None]:
# Draw one rank / term-frequency curve per song on log-log axes.
df_term_freq = df_term_freq.sort_values("rank", ascending=False)
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
for song_id in song_list:
    song_terms = df_term_freq.loc[df_term_freq['song_id'] == song_id]
    ax.plot(song_terms['rank'], song_terms['term_frequency'])
ax.set_xscale('log')
ax.set_yscale('log')
plt.show()

## Fit exponent of power law

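On log–log axes a power law shows up as a straight line, so we can fit that line with a linear regression of log10(term frequency) on log10(rank) and estimate its parameters; the slope is the exponent of the power law.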

In [None]:
from sklearn.linear_model import LinearRegression

# Restrict the fit to the selected songs and to ranks 1 through 100.
in_song_list = df_term_freq['song_id'].isin(song_list)
in_rank_window = df_term_freq['rank'].between(1, 100)
df_term_freq_selected = df_term_freq[in_song_list & in_rank_window]

# Regress log10(term frequency) on log10(rank); both go in as single-column 2-D arrays.
x_log = np.log10(df_term_freq_selected['rank'].values).reshape(-1, 1)
y_log = np.log10(df_term_freq_selected['term_frequency'].values).reshape(-1, 1)

# NOTE: despite its name, y_pred_log holds the fitted estimator, not predictions.
y_pred_log = LinearRegression()
y_pred_log.fit(x_log, y_log)
print(f"intercept: {y_pred_log.intercept_}")
print(f"slope: {y_pred_log.coef_}")

# Fitted log10(term frequency) at every observed log10(rank).
l_model= y_pred_log.predict(x_log)

# Undo the log10 transform to draw the fitted curve in rank / frequency units.
plt.plot(10 ** x_log, 10 ** l_model, color='red', linewidth=2)
plt.show()

Classic versions of Zipf’s law have:

$\text{frequency} \propto \frac{1}{\text{rank}}$

and the fitted slope (the exponent of the power law) has come out quite close to -1 here!

Finally we draw the fitted line with the actual data:

In [None]:
# Plot each song's rank / term-frequency curve together with the fitted power law.
df_term_freq = df_term_freq.sort_values("rank", ascending=False)
fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
for song_id in song_list:
    df_term_freq_selected = df_term_freq[df_term_freq['song_id'] == song_id]
    x = df_term_freq_selected['rank']
    y = df_term_freq_selected['term_frequency']
    ax.plot(x, y)

# Switch to log axes *before* reading the x limits: the linear-scale limits include the
# plot margins and can be <= 0, which has no logarithm.
ax.set_xscale('log')
ax.set_yscale('log')

# The regression was fitted in log10 space: log10(y) = intercept + slope * log10(x).
# Evaluate it there and transform back; applying it to the raw x values (as a straight
# line y = intercept + slope * x) does not draw the fitted power law.
x_vals = np.array(ax.get_xlim())
# np.ravel copes with scikit-learn returning these as 1-D / 2-D arrays (or scalars).
intercept = np.ravel(y_pred_log.intercept_)[0]
slope = np.ravel(y_pred_log.coef_)[0]
y_vals = 10 ** (intercept + slope * np.log10(x_vals))
ax.plot(x_vals, y_vals, '--')  # fit extended over the visible rank range

# Fitted values at the ranks that were actually used for the regression.
ax.plot(10 ** x_log, 10 ** l_model, color='red', linewidth=2)

ax.set_xlabel("rank")
ax.set_ylabel("term frequency")
plt.show()