In [1]:
import pandas as pd
import os
from string import punctuation
from nltk.corpus import stopwords
import re
import csv

# Cleaning Data for Later Analysis

In [2]:
list_of_files = []
for file in os.listdir("/Users/appau/Desktop/Projekt/Genres"):
    if file.endswith('txt'):
        list_of_files.append(file)

In [3]:
list_of_files

['alternative rock.txt',
 'country.txt',
 'folk.txt',
 'hard rock.txt',
 'heavy metal.txt',
 'hip hop.txt',
 'jazz.txt',
 'pop.txt',
 'progressive rock.txt',
 'rock.txt',
 'soul.txt']

In [4]:
with open('country.txt', 'r') as file:
    file = file.read()

In [5]:
file_replace = file.replace('\n\n', ' ')

In [6]:
file_replace_split = file_replace.split('"\n"')

In [7]:
file_replace_split[33]

'Before I fly and wave goodbye   I say to you ""days with you are the best years of my life""   But if I don\'t see you anymore   Keep my words safely stored   And i\'ll be back I promise once more      Chorus:   Goodbye, goodbye   Till I see you again   Goodbye, goodbye   I\'ll love and i\'ll miss you till then      Remember me \'cause I care   Where you are and how you fare   Get in touch if the wind blows in your face   I guess it\'s been too much fun   We\'ve shared and we\'ve won   Yes the best is yet to come      Repeat chorus      That one word hurts so bad   You leave the best you\'ve had   But you keep the faith and pray to return      Goodbye, goodbye   I\'ll love and i\'ll miss you till then   Goodbye, goodbye   Goodbye, goodbye      (c) 1990 maypop music (a division of wildcountry, inc.) (bmi). All rights reserved.  '

In [8]:
def replace(string, substitutions):

    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

In [9]:
substitutions = {'Chorus': '', 'Chorus:': '', 'Repeat Chorus': '', 'Repeat chorus': '', 'repeat chorus': '', 'repeat Chorus': '', 'CHORUS': '', 'repeat CHORUS': '', 'chorus': '', 'Repeat and fade': '', 'Repeat': '', 'Repeat 1st': '', 'REPEAT': '', 'Verse:': '', 'Bridge:': '', 'Bridge': '', 'BRIDGE:': '', 'SOLO': '', 'Guitar solo': '', 'Guitar Solo': '', 'Refrain:': '', 'Refrain': '', 'Verse': ''}

In [10]:
output = replace(file_replace_split[83], substitutions)

In [11]:
output

"I was walkin'home from school   On a cold winter day,   Took a short cut through the woods   And I lost my way.   It was gettin' late, and I was scared and alone.   Then a kind old man took my hand, and led me home.   Mama couldn't see him,   But he was standing there,   And I knew in my heart   He was the answer to my prayer.      []   Oh, I believe there are Angels Among Us,   Sent down to us from somewhere up above.   They come to you and me in our darkest hours   To show us how to live   To teach us how to give   To guide us with a light of love.      When life held troubled times   And had me down on my knees   There's always been someone   To come along and comfort me   A kind word from a stranger   To lend a helping hand   A phone call from a friend   Just to say I understand   Now ain't it kind of funny   At the dark end of the road   Someone light the way with just a single ray of hope.      []      They wear so many faces,   Show up in the strangest places   And grace us wit

In [12]:
list_of_genre_files = [file for file in list_of_files if ".txt" in file]

In [13]:
list_of_genre_files

['alternative rock.txt',
 'country.txt',
 'folk.txt',
 'hard rock.txt',
 'heavy metal.txt',
 'hip hop.txt',
 'jazz.txt',
 'pop.txt',
 'progressive rock.txt',
 'rock.txt',
 'soul.txt']

In [14]:
def clean_text(text):
    
    tokens = text.split()
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    tokens = [word for word in tokens if word.isalpha()]
    
    stop_words = set(stopwords.words('english'))
    
    tokens = [word for word in tokens if not word in stop_words]
    
    tokens = [word for word in tokens if len(word) > 2]
    
    return tokens

In [15]:
def clean_files():
    char_list = ['Chorus', 'Repeat Chorus', 'Repeat chorus', 'repeat chorus', 'repeat Chorus', 'CHORUS', 'repeat CHORUS', 'chorus', 'Repeat and fade', 'Repeat', 'Repeat 1st', 'REPEAT', 'Verse:', 'Bridge:', 'Bridge', 'BRIDGE:', 'SOLO', 'Guitar solo', 'Guitar Solo', 'Refrain:', 'Refrain', 'Verse']
    for file in list_of_genre_files:
        genre = file.partition('.')[0]
        with open(file, 'r') as file:
            file = file.read()
        file_replace = file.replace('\n\n', ' ')
        file_replace_split = file_replace.split('"\n"')
        song_list_final = []
        for song in file_replace_split:
            replaced = replace(song, substitutions)
            final = re.sub("[\(\[].*?[\)\]]", "", replaced)
            tokens = ' '.join(clean_text(final))
            song_list_final.append((tokens, genre))
        with open(genre + '_analysis.csv','w') as out:
            csv_out = csv.writer(out)
            csv_out.writerow(['song','genre'])
            for row in song_list_final:
                csv_out.writerow(row)

In [16]:
clean_files()

In [17]:
test_file = pd.read_csv('country_analysis.csv')

In [18]:
test_file

Unnamed: 0,song,genre
0,Calling calling angels calling calling angels ...,country
1,thought forever thought would last Gotta try m...,country
2,Somewhere mountains northern Alabama The Carol...,country
3,New York City theres snow ground And Californi...,country
4,All friends asking plan spend holidays People ...,country
5,Its time year whole world heart tot heart You ...,country
6,almost Christmas time There stood another line...,country
7,Come church wildwood come church vale spot dea...,country
8,Just outside Austin turned radio Turned satell...,country
9,Clear water blues thinkin Sittin fishin catchi...,country


In [19]:
test_file.song[0]

'Calling calling angels calling calling angels The night still moon bright saw distant light started walkin till found church outside town And stumbled closed door said sure Its baby lookin cant see knees Ive calling calling angels bring baby back must know shes tell know shes one knees tell please love Tell preacher man man Ive done wrong know really hurt Its lonely world livin ocean tears cryin greatest hour need done fool like calling calling angels bring baby back must know shes yes Ive fool And know shes one tell please And shes lonely Ill make understand That arms carry back heaven Yes Ive fool know shes one tell please Calling calling angels Hear calling calling angels calling calling angels Hear calling calling angels calling calling angels Hear calling calling angels calling calling angels'