# Merge and clean datasets

In [1]:
import pandas as pd

#### Load the dataframes

In [2]:
def load_data():
    df1 = pd.read_csv('discorgs_collection.csv')
    df2 = pd.read_excel('all_discs.xlsx')
    return df1, df2

In [3]:
df_discorgs, df_home = load_data()

> df_discorgs refers to the dataset exporting by Discorgs. THe largest online music database.
df_home is my father's sheet for keaping track. 
Some discs are not available at discorgs so they'

## Clean data

#### Drop unused columns and replace NAs

In [45]:
df_discorgs.drop(['Catalog#', 'Rating', 'release_id', 'CollectionFolder', 'Collection Media Condition', 'Collection Sleeve Condition','Collection Notes'],axis=1,inplace=True)

In [5]:
# df_discorgs['Released'].replace(0,"NS",inplace=True)

In [6]:
df_discorgs.shape,df_home.shape

((3424, 6), (3388, 26))

In [7]:
df_home.drop('CAT#', axis=1 , inplace=True)
df_home.drop(list(df_home.filter(regex = 'Unnamed')), axis = 1, inplace = True)
#df_home.fillna('NS', inplace=True)

In [8]:
def trim_all_columns(df):
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)
df_discorgs = trim_all_columns(df_discorgs)
df_home = trim_all_columns(df_home)

In [9]:
df_discorgs.head()

Unnamed: 0,Artist,Title,Label,Format,Released,Date Added
0,Baxter Dury,The Night Chancers,"[PIAS] Le Label, [pias]","LP, Album, Ltd, Cry",2020,2021-12-24 08:03:27
1,Gary Moore,Victims Of The Future,10 Records,"LP, Album",1984,2021-12-28 08:06:58
2,Maxi Priest,Bonafide,10 Records,"LP, Album",1990,2020-06-12 19:31:31
3,Inner City,Ain't Nobody Better,"10 Records, Virgin","12"", Single",1989,2020-06-08 12:29:11
4,Melodie Mc,I Wanna Dance,12INC,"12"", Maxi",1993,2020-06-12 20:03:57


In [10]:
df_discorgs.shape

(3424, 6)

In [11]:
df_home.head()

Unnamed: 0,Title,Artist,Released,Genre,Country,Label
0,02:00:00,"Bazaar (6), Istanbul Express",1984,Folk,Greece,Virgin
1,1,Θέμης Αδαμαντίδης,1988,Laïkó,Greece,DPI Athenaeum
2,2,"Black Heart Procession,The",2014,Indie Rock,USA,Touch And Go
3,1234,Propaganda,1990,Pop Rock,Greece,Virgin
4,90125,Yes,1983,Rock,Greece,ATCO Records


In [12]:
df_home.shape

(3388, 6)

## Merge dataframes

In [13]:
df_merged = df_discorgs.merge(df_home,on='Title', how='inner',suffixes=('', '_y'))

In [14]:
df_merged

Unnamed: 0,Artist,Title,Label,Format,Released,Date Added,Artist_y,Released_y,Genre,Country,Label_y
0,Baxter Dury,The Night Chancers,"[PIAS] Le Label, [pias]","LP, Album, Ltd, Cry",2020,2021-12-24 08:03:27,Baxter Dury,2020,Alternative Rock,Europe,[PIAS] Le Label
1,Gary Moore,Victims Of The Future,10 Records,"LP, Album",1984,2021-12-28 08:06:58,Gary Moore,1984,Rock,Greece,10 Records
2,Maxi Priest,Bonafide,10 Records,"LP, Album",1990,2020-06-12 19:31:31,Maxi Priest ‎,1990,Pop Rap,Greece,10 Records
3,Inner City,Ain't Nobody Better,"10 Records, Virgin","12"", Single",1989,2020-06-08 12:29:11,Inner City,1989,House,Greece,Virgin
4,Melodie Mc,I Wanna Dance,12INC,"12"", Maxi",1993,2020-06-12 20:03:57,Melodie MC,1993,Euro House,Sweden,12INC
...,...,...,...,...,...,...,...,...,...,...,...
3588,Manos Hadjidakis,Sweet Movie,"Πολύτροπον, Lyra","LP, Album",1974,2020-05-31 08:31:18,Μάνος Χατζιδάκις,1974,Éntekhno,Greece,Πολύτροπον
3589,Георги Димитров,Τελικός Λόγος Στη Δίκη Της Λειψίας-1933,Балкантон,"LP, Mono",0,2020-05-30 14:01:20,Γκεόργκι Ντιμιτρόφ,,Non-Music,Bulgaria,Балкантон
3590,Various,Soviet Melodies,Мелодия,"LP, Comp, Exp",1972,2021-08-15 14:55:21,Various,1972,Classical,USSR,Мелодия
3591,Нани Брегвадзе,Nani Bregvadze,Мелодия,LP,0,2020-06-13 17:03:30,Nani Bregvadze,1971,Ballad,USSR,Мелодия


In [15]:
df_merged.shape

(3593, 11)

#### Drop duplicated columns and rearrange columns

In [16]:
df_merged.drop(df_merged.filter(regex='_y$').columns.tolist(),axis=1, inplace=True)
cols = [ 'Title','Artist', 'Genre', 'Released', 'Country',
 'Label',
 'Format',
 'Date Added',
]
df_merged = df_merged.reindex(columns=cols)

#### Check for duplicated rows and drop

In [17]:
df_merged.shape

(3593, 8)

In [18]:
df_merged.duplicated(subset=['Title','Artist', 'Genre', 'Released', 'Country',
 'Label',
 'Format']).sum()

66

> 66 discs were inserted twice by accident. *whoosh*

In [19]:
dups = df_merged.loc[df_merged.duplicated(subset=['Title','Artist', 'Genre',
                                           'Released', 'Country',
                                           'Label','Format'],keep=False), :]

> keep duplicates to certify errors

In [20]:
df_merged.drop_duplicates(subset=['Title','Artist', 'Genre',
                                  'Released', 'Country',
                                  'Label','Format'], keep='first', inplace=True)
df_merged.reset_index(drop=True,inplace=True)

In [21]:
df_merged.shape

(3527, 8)

In [22]:
df_merged

Unnamed: 0,Title,Artist,Genre,Released,Country,Label,Format,Date Added
0,The Night Chancers,Baxter Dury,Alternative Rock,2020,Europe,"[PIAS] Le Label, [pias]","LP, Album, Ltd, Cry",2021-12-24 08:03:27
1,Victims Of The Future,Gary Moore,Rock,1984,Greece,10 Records,"LP, Album",2021-12-28 08:06:58
2,Bonafide,Maxi Priest,Pop Rap,1990,Greece,10 Records,"LP, Album",2020-06-12 19:31:31
3,Ain't Nobody Better,Inner City,House,1989,Greece,"10 Records, Virgin","12"", Single",2020-06-08 12:29:11
4,I Wanna Dance,Melodie Mc,Euro House,1993,Sweden,12INC,"12"", Maxi",2020-06-12 20:03:57
...,...,...,...,...,...,...,...,...
3522,Sweet Movie,Manos Hadjidakis,Éntekhno,1974,Greece,"Πολύτροπον, Lyra","LP, Album",2020-05-31 08:31:18
3523,Τελικός Λόγος Στη Δίκη Της Λειψίας-1933,Георги Димитров,Non-Music,0,Bulgaria,Балкантон,"LP, Mono",2020-05-30 14:01:20
3524,Soviet Melodies,Various,Classical,1972,USSR,Мелодия,"LP, Comp, Exp",2021-08-15 14:55:21
3525,Nani Bregvadze,Нани Брегвадзе,Ballad,0,USSR,Мелодия,LP,2020-06-13 17:03:30


## Export to excel file

In [23]:
df_merged.to_excel('gopera_collection.xlsx', index=False)