In [None]:
import math
import re
from urllib.parse import quote, urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.options.mode.chained_assignment = None
# All patterns are raw strings: sequences such as "\d" or "\/" in a normal
# string literal are invalid escapes and warn on recent Python versions.

# A non-"/" character, a year 19xx/20xx, whitespace, "/" and the rest of the line.
GENRE_PATTERN = re.compile(r"[^\/](19|20)\d{2}\s\/.*")
# "<release type> review" and everything after it (case-insensitive).
PATTERN_ALBUM = re.compile(r"(EP|ALBUM|MIXTAPE|TRACK|COMPILATION) review.*", flags=re.I)
# The hyphenated spelling of "self-titled" (case-insensitive).
SELF_TITLED = re.compile(r"self-titled", flags=re.I)
# A standalone "ft" token and everything after it (case-insensitive). The word
# boundaries keep words that merely contain "ft" (e.g. "Aftermath") intact.
FEATURE_PATTERN = re.compile(r"\bft\b.*", flags=re.I)
# Spelled-out numbers mapped to their integer value.
NUMBER_WORDS = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
                "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10}

In [None]:
# Convert the number words to numbers
def num2words(num, words=None):
    """Normalize a score to a digit string, translating number words.

    Despite its name, this converts *words to numbers* ("seven" -> "7").

    Args:
        num: The raw score: a number, a numeric string, or a number word.
        words: Optional mapping of lowercase number words to their numeric
            value. Defaults to the module-level ``NUMBER_WORDS``.

    Returns:
        The integer value as a string when ``num`` is numeric or a known
        number word; otherwise ``num`` unchanged.
    """
    # Numeric input first: int() accepts ints, floats and digit strings.
    try:
        return str(int(num))
    except (TypeError, ValueError, OverflowError):
        # Not numeric (a word, NaN, inf, None, ...): fall through to the lookup.
        pass

    lookup = NUMBER_WORDS if words is None else words
    key = str(num).strip().lower()
    if key in lookup:
        # Return a string so the result has the same type as the numeric path.
        return str(lookup[key])
    return num

In [None]:
# Split on the 2nd occurrence of "-"
def split2nd(string, sep="-"):
    """Split a title into ``(artist, album)`` around a separator.

    "self-titled" is rewritten to "self titled" first, so its hyphen is not
    counted as a separator. With two or more separators the split happens at
    the second one (everything before it is the artist); with exactly one, at
    that one.

    Args:
        string: The title; coerced with ``str()``.
        sep: The separator to split on.

    Returns:
        An ``(artist, album)`` tuple. Surrounding whitespace is preserved.
        When ``sep`` does not occur, the whole text is the artist and the
        album is an empty string.
    """
    # Round-trip through UTF-8, dropping anything that cannot be encoded.
    text = str(string).encode("utf-8", "ignore").decode("utf-8")
    text = SELF_TITLED.sub("self titled", text)

    # At most three pieces: [before 1st sep, between 1st and 2nd, the rest].
    parts = text.split(sep, 2)
    if len(parts) == 3:
        return sep.join(parts[:2]), parts[2]
    if len(parts) == 2:
        return parts[0], parts[1]
    # No separator: return a pair anyway so callers can always unpack.
    return text, ""

In [None]:
df = pd.read_csv("./reviews_videos.csv", index_col=0)

# Grab the title and artist
# Strip the "... review" boilerplate from the title. `regex=` is always passed
# explicitly: the default differs between pandas versions, and a compiled
# pattern is rejected when regex is False.
df["Title"] = (
    df["Title"]
    .str.replace("ep album", "", regex=False)
    .str.replace(PATTERN_ALBUM, "", regex=True)
    .str.replace("EP/", "", regex=False)
)
df["Artist"], df["Album"] = zip(*df["Title"].apply(split2nd))

df["Album"] = df["Album"].str.replace(PATTERN_ALBUM, "", regex=True)
df["Album"] = df["Album"].str.replace(FEATURE_PATTERN, "", regex=True)
del df["Title"]

# Replace self-titled with artist name. The result is assigned back instead of
# using an in-place mask on a column selection, which may not update `df`.
is_self_titled = df["Album"].str.contains("self titled", case=False, na=False)
df["Album"] = df["Album"].mask(is_self_titled, df["Artist"])

# Extract the "<score>/10" rating: either a word at the very start of the
# description ("classic/10") or a number 0-10. The number must not be preceded
# by another digit, otherwise "10/10" would be read as "0/10".
df["Score"] = df["Description"].str.extract(
    r"(^[A-Za-z]+|(?<!\d)(?:10|[0-9]))/10(?!\d)", expand=False
)
df = df[df["Score"].notna()].copy()  # Remove rows with no score
df["Score"] = df["Score"].apply(num2words)  # Convert number words to numbers

# Filter the sub-genre to the parent-genre.
# Rules are applied in order, so earlier rules win for genres that match
# several patterns (e.g. "pop punk" becomes "Rock", not "Pop").
genre_rules = (
    ("r&b|soul|funk", "R&B"),
    ("hip hop|rap|trap|drill", "Rap"),
    ("rock|punk|emo|screamo|grunge|metal|hardcore", "Rock"),
    ("jazz|bop", "Jazz"),
    ("electro|dub|synth|house|dance|disco|edm|glitch|idm", "Electronic"),
    ("ambient", "Ambient"),
    ("pop", "Pop"),
    ("folk", "Folk"),
    ("country", "Country"),
)
for sub_genre_pattern, parent_genre in genre_rules:
    is_sub_genre = df["Genre"].str.contains(sub_genre_pattern, case=False, na=False)
    df["Genre"] = df["Genre"].mask(is_sub_genre, parent_genre)


# Fix the genre when it isn't found

In [None]:
# If genre was not found, try to find it on Pitchfork
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

known_genres = ("R&B", "Rap", "Rock", "Jazz", "Electronic", "Ambient", "Pop", "Folk", "Country")

# Iterate over a snapshot because the loop writes back into df["Genre"].
for index, value in list(df["Genre"].items()):
    if value in known_genres:
        continue

    try:
        # Strip here: the parsed artist/album may carry surrounding spaces
        # that would otherwise break the title comparison below.
        artist = df.loc[index, "Artist"].strip()
        album = df.loc[index, "Album"].strip()

        # Percent-encode the whole query so characters such as "&" or "#"
        # cannot corrupt the URL (spaces still become "%20").
        query = " ".join(f"{artist} {album}".split())
        search_url = "https://pitchfork.com/search/?query=" + quote(query, safe="")
        print(search_url)

        # A timeout keeps one stalled request from hanging the whole loop.
        search_page = requests.get(search_url, headers=headers, timeout=10)
        search_soup = BeautifulSoup(search_page.content, 'html.parser')

        # Find the review whose album title matches ours (case-insensitive).
        wanted_title = album.lower()
        for anchor in search_soup.find_all("a", class_="review__link", href=True):
            title_tag = anchor.find(class_="review__title-album")
            if title_tag is None or title_tag.text.strip().lower() != wanted_title:
                continue

            print("Found it!")
            # urljoin handles both relative ("/reviews/...") and absolute hrefs.
            review_url = urljoin("https://pitchfork.com/", anchor['href'])
            review_page = requests.get(review_url, headers=headers, timeout=10)
            review_soup = BeautifulSoup(review_page.content, 'html.parser')

            genre_tag = review_soup.find(class_="genre-list__link")
            df.loc[index, "Genre"] = (
                genre_tag.get_text(strip=True) if genre_tag is not None else "Unknown"
            )
            break
    except Exception as error:
        # Best effort: any network or parsing failure marks the genre unknown.
        print(f"Genre lookup failed for row {index!r}: {error}")
        df.loc[index, "Genre"] = "Unknown"


# Get highest value from redux version

In [None]:
# If redux_df album is in df, then add the score to the original album
# Separate the "redux" rows from the regular ones.
is_redux = df["Album"].str.contains("redux", case=False, na=False)
redux_df = df[is_redux].copy()
df = df[~is_redux].copy()

# Remove the "redux" marker with the same case-insensitivity used to detect
# it, along with any brackets wrapped directly around it ("Album (Redux)").
redux_df["Album"] = redux_df["Album"].str.replace(
    r"[\(\[]?\s*redux\s*[\)\]]?", "", case=False, regex=True
)

# Normalized album title -> redux score. If one album has several redux rows,
# the last one wins.
redux_scores = dict(zip(redux_df["Album"].str.lower().str.strip(), redux_df["Score"]))

# Overwrite the score of every album that has a redux row, using one
# vectorized lookup instead of comparing every pair of rows.
album_keys = df["Album"].str.lower().str.strip()
has_redux = album_keys.isin(list(redux_scores))
df["Score"] = df["Score"].mask(has_redux, album_keys.map(redux_scores))


In [None]:
# Reorder the columns
# The new order is given as positional (start, stop) slices of the current
# columns. NOTE(review): presumably this moves Artist/Album to the front;
# confirm against the columns of the input CSV.
column_spans = ((5, 7), (4, 5), (1, 4), (7, 8), (0, 1))
reordered = [
    name
    for start, stop in column_spans
    for name in df.columns[start:stop]
]
df = df.loc[:, reordered]

# Remove rows without a score, then write the result.
has_score = df["Score"].notna()
df = df[has_score]
df.to_csv("./fantano_reviews.csv")