# Merge and clean datasets

In [138]:
import pandas as pd

#### Load the dataframes

In [139]:
def load_data(discogs_csv='gopera-collection-20220126-0742.csv',
              home_xlsx='all_discs20220126.xlsx'):
    """Read the two raw collection tables.

    Parameters
    ----------
    discogs_csv : str or path-like, optional
        CSV file handed to ``pd.read_csv``. Defaults to the file name that
        was previously hard-coded here.
    home_xlsx : str or path-like, optional
        Excel workbook handed to ``pd.read_excel``. Defaults to the file name
        that was previously hard-coded here.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The CSV contents first, the Excel contents second.
    """
    df1 = pd.read_csv(discogs_csv)
    df2 = pd.read_excel(home_xlsx)
    return df1, df2

In [140]:
# load_data() returns (CSV frame, Excel frame), in that order.
df_discorgs, df_home = load_data()

> `df_discorgs` is the dataset exported from Discogs, the largest online music database.
`df_home` is my father's spreadsheet for keeping track of the collection.
Some discs are not available on Discogs, so they appear only in `df_home`.

## Clean data

#### Drop unused columns and replace NAs

In [141]:
# Columns of the CSV export that are dropped before merging.
unused_discogs_cols = [
    'Catalog#',
    'Rating',
    'release_id',
    'CollectionFolder',
    'Collection Media Condition',
    'Collection Sleeve Condition',
    'Collection Notes',
]
df_discorgs = df_discorgs.drop(columns=unused_discogs_cols)

In [142]:
# Disabled: would replace the 0 values in 'Released' with the marker "NS".
# NOTE(review): 0 looks like a placeholder for an unknown year — confirm before re-enabling.
# df_discorgs['Released'].replace(0,"NS",inplace=True)

In [143]:
# (rows, columns) of each frame, shown side by side.
(df_discorgs.shape, df_home.shape)

((3556, 6), (3579, 26))

In [144]:
# Remove the 'CAT#' column, then every column whose label contains 'Unnamed'.
df_home = df_home.drop(columns='CAT#')
unnamed_cols = df_home.filter(regex='Unnamed').columns
df_home = df_home.drop(columns=unnamed_cols)
# Filling missing values with 'NS' is left disabled:
#df_home.fillna('NS', inplace=True)

In [145]:
def trim_all_columns(df):
    """Return a new frame with surrounding whitespace stripped from string cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``df``. Every ``str`` value has had
        ``str.strip()`` applied; values of any other type pass through
        unchanged. Column labels and the index are not touched.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; older versions only provide applymap.
    elementwise = df.map if hasattr(df, 'map') else df.applymap
    return elementwise(_strip_if_str)
# Strip surrounding whitespace from the string cells of both frames.
df_discorgs, df_home = (
    trim_all_columns(frame) for frame in (df_discorgs, df_home)
)

In [146]:
# Preview the first five rows of the cleaned CSV frame.
df_discorgs.head(5)

Unnamed: 0,Artist,Title,Label,Format,Released,Date Added
0,Baxter Dury,The Night Chancers,"[PIAS] Le Label, [pias]","LP, Album, Ltd, Cry",2020,2021-12-24 08:03:27
1,Gary Moore,Victims Of The Future,10 Records,"LP, Album",1984,2021-12-28 08:06:58
2,Maxi Priest,Bonafide,10 Records,"LP, Album",1990,2020-06-12 19:31:31
3,Inner City,Ain't Nobody Better,"10 Records, Virgin","12"", Single",1989,2020-06-08 12:29:11
4,Melodie Mc,I Wanna Dance,12INC,"12"", Maxi",1993,2020-06-12 20:03:57


In [147]:
# (rows, columns) of the CSV frame after the column drop and trimming.
df_discorgs.shape

(3556, 6)

In [148]:
# Preview the first five rows of the cleaned Excel frame.
df_home.head(5)

Unnamed: 0,Artist,Title,Released,Genre,Country,Label,Type
0,? (Question Mark) & The Mysterians,96 Tears,1966.0,Rock,Germany,Ariola,Vinyl 45
1,"""D"" Train",You're The One For Me,1988.0,Disco,Canada,Prelude Records,Vinyl 33
2,101 Strings,Play The Blues,1958.0,Blues,USA,Pye Golden Guinea Records,Vinyl 33
3,101 Strings,The Soul Of Spain,1959.0,Classical,UK,Pye Golden Guinea Records,Vinyl 33
4,10cc,10 CC's Greatest Hits 1972-1978,1979.0,Art Rock,Greece,Mercury,Vinyl 33


In [149]:
# (rows, columns) of the Excel frame after the column drops and trimming.
df_home.shape

(3579, 7)

## Merge dataframes

In [150]:
# Inner join on 'Title' alone: only titles present in both frames survive.
# Columns that exist on both sides keep the left-hand name; the right-hand
# copy receives the '_y' suffix.
# NOTE(review): 'Title' is not a unique key — the result has more rows (3786)
# than either input (3556 / 3579), so some titles match more than once.
# Consider a second key or `validate=` once artist spellings are reconciled.
df_merged = pd.merge(
    df_discorgs,
    df_home,
    how='inner',
    on='Title',
    suffixes=('', '_y'),
)

In [151]:
# Display the merged frame (pandas elides the middle rows in the rendered output).
df_merged

Unnamed: 0,Artist,Title,Label,Format,Released,Date Added,Artist_y,Released_y,Genre,Country,Label_y,Type
0,Baxter Dury,The Night Chancers,"[PIAS] Le Label, [pias]","LP, Album, Ltd, Cry",2020,2021-12-24 08:03:27,Baxter Dury,2020.0,Alternative Rock,Europe,[PIAS] Le Label,Vinyl 33
1,Gary Moore,Victims Of The Future,10 Records,"LP, Album",1984,2021-12-28 08:06:58,Gary Moore,1984.0,Rock,Greece,10 Records,Vinyl 33
2,Maxi Priest,Bonafide,10 Records,"LP, Album",1990,2020-06-12 19:31:31,Maxi Priest ‎,1990.0,Pop Rap,Greece,10 Records,Vinyl 33
3,Inner City,Ain't Nobody Better,"10 Records, Virgin","12"", Single",1989,2020-06-08 12:29:11,Inner City,1989.0,House,Greece,Virgin,Vinyl 33
4,Melodie Mc,I Wanna Dance,12INC,"12"", Maxi",1993,2020-06-12 20:03:57,Melodie MC,1993.0,Euro House,Sweden,12INC,Vinyl 33
...,...,...,...,...,...,...,...,...,...,...,...,...
3781,Георги Димитров,Τελικός Λόγος Στη Δίκη Της Λειψίας-1933,Балкантон,"LP, Mono",0,2020-05-30 14:01:20,Γκεόργκι Ντιμιτρόφ,,Non-Music,Bulgaria,Балкантон,Vinyl 33
3782,Лили Иванова,Камино,Балкантон,"LP, Album, Red",1969,2022-01-16 06:45:52,Лили Иванова,1969.0,Schlager,Bulgaria,Балкантон,Vinyl 33
3783,Various,Soviet Melodies,Мелодия,"LP, Comp, Exp",1972,2021-08-15 14:55:21,Various,1972.0,Classical,USSR,Мелодия,Vinyl 33
3784,Нани Брегвадзе,Nani Bregvadze,Мелодия,LP,0,2020-06-13 17:03:30,Nani Bregvadze,1971.0,Ballad,USSR,Мелодия,Vinyl 33


In [152]:
# (rows, columns) right after the merge.
df_merged.shape

(3786, 12)

#### Drop duplicated columns and rearrange columns

In [153]:
# Discard the right-hand ('_y') copies of columns that exist in both frames.
# NOTE(review): 'Released_y' sometimes carries a year where 'Released' is 0
# (e.g. 0 vs 1971.0 in the preview above) — confirm that losing it is intended.
suffixed_cols = df_merged.filter(regex='_y$').columns
df_merged = df_merged.drop(columns=suffixed_cols)

# Column order used for the rest of the notebook and the export.
cols = [
    'Title', 'Artist', 'Genre', 'Released', 'Country',
    'Label', 'Format', 'Date Added', 'Type',
]
df_merged = df_merged.reindex(columns=cols)

#### Check for duplicated rows and drop

In [154]:
# (rows, columns) before duplicate removal.
df_merged.shape

(3786, 9)

In [155]:
# Count rows that repeat an earlier row on every column except 'Date Added'.
dedup_keys = [
    'Title', 'Artist', 'Genre', 'Released',
    'Country', 'Label', 'Format', 'Type',
]
df_merged.duplicated(subset=dedup_keys).sum()

68

> 68 discs were inserted twice by accident. *whoosh*

In [156]:
# keep=False flags every member of a duplicate group (first occurrence
# included), so `dups` holds all copies for inspection.
dedup_keys = [
    'Title', 'Artist', 'Genre', 'Released',
    'Country', 'Label', 'Format', 'Type',
]
is_duplicated = df_merged.duplicated(subset=dedup_keys, keep=False)
dups = df_merged[is_duplicated]

> Keep every duplicated row in `dups` so the errors can be checked later.

In [157]:
# Keep the first row of each duplicate group (compared on every column except
# 'Date Added') and renumber the index from 0.
dedup_keys = [
    'Title', 'Artist', 'Genre', 'Released',
    'Country', 'Label', 'Format', 'Type',
]
df_merged = (
    df_merged
    .drop_duplicates(subset=dedup_keys, keep='first')
    .reset_index(drop=True)
)

In [158]:
# (rows, columns) after duplicate removal.
df_merged.shape

(3718, 9)

In [159]:
# Display the de-duplicated frame (pandas elides the middle rows in the rendered output).
df_merged

Unnamed: 0,Title,Artist,Genre,Released,Country,Label,Format,Date Added,Type
0,The Night Chancers,Baxter Dury,Alternative Rock,2020,Europe,"[PIAS] Le Label, [pias]","LP, Album, Ltd, Cry",2021-12-24 08:03:27,Vinyl 33
1,Victims Of The Future,Gary Moore,Rock,1984,Greece,10 Records,"LP, Album",2021-12-28 08:06:58,Vinyl 33
2,Bonafide,Maxi Priest,Pop Rap,1990,Greece,10 Records,"LP, Album",2020-06-12 19:31:31,Vinyl 33
3,Ain't Nobody Better,Inner City,House,1989,Greece,"10 Records, Virgin","12"", Single",2020-06-08 12:29:11,Vinyl 33
4,I Wanna Dance,Melodie Mc,Euro House,1993,Sweden,12INC,"12"", Maxi",2020-06-12 20:03:57,Vinyl 33
...,...,...,...,...,...,...,...,...,...
3713,Τελικός Λόγος Στη Δίκη Της Λειψίας-1933,Георги Димитров,Non-Music,0,Bulgaria,Балкантон,"LP, Mono",2020-05-30 14:01:20,Vinyl 33
3714,Камино,Лили Иванова,Schlager,1969,Bulgaria,Балкантон,"LP, Album, Red",2022-01-16 06:45:52,Vinyl 33
3715,Soviet Melodies,Various,Classical,1972,USSR,Мелодия,"LP, Comp, Exp",2021-08-15 14:55:21,Vinyl 33
3716,Nani Bregvadze,Нани Брегвадзе,Ballad,0,USSR,Мелодия,LP,2020-06-13 17:03:30,Vinyl 33


## Export to excel file

In [160]:
# Write the cleaned table to an Excel workbook, omitting the positional index.
output_path = 'gopera_collection_test.xlsx'
df_merged.to_excel(output_path, index=False)