# Merge and clean datasets

In [99]:
import pandas as pd

#### Load the dataframes

In [100]:
def load_data(csv_path='discorgs_collection.csv', excel_path='all_discs.xlsx'):
    """Read the two source tables that this notebook merges.

    Parameters
    ----------
    csv_path : str or path-like, default 'discorgs_collection.csv'
        CSV file, read with ``pd.read_csv``.
    excel_path : str or path-like, default 'all_discs.xlsx'
        Excel workbook, read with ``pd.read_excel`` using its defaults.

    Returns
    -------
    tuple
        ``(csv_frame, excel_frame)`` in that order.

    Notes
    -----
    Calling ``load_data()`` with no arguments reads the same two files as
    before; the parameters only make the locations overridable.
    """
    collection = pd.read_csv(csv_path)
    catalogue = pd.read_excel(excel_path)
    return collection, catalogue

In [101]:
# Load both sources: the collection CSV goes to df_discorgs, the Excel disc list to df_home.
df_discorgs, df_home = load_data()

## Clean data

#### Drop unused columns and replace NAs

In [102]:
# Strip the columns that are not carried into the merged table.
# Mutates df_discorgs in place, so re-running this cell without reloading
# raises KeyError (the columns are already gone).
df_discorgs.drop(
    columns=[
        'Catalog#',
        'Rating',
        'release_id',
        'CollectionFolder',
        'Collection Media Condition',
        'Collection Sleeve Condition',
        'Collection Notes',
    ],
    inplace=True,
)

In [103]:
# Remove the 'CAT#' column together with every column whose name contains
# 'Unnamed', all in a single in-place drop.
df_home.drop(columns=['CAT#', *df_home.filter(regex = 'Unnamed')], inplace=True)

# Replace every remaining missing value with the text 'No info'. This applies to
# all columns, so a column such as 'Released' ends up holding both years and text.
df_home.fillna(value='No info', inplace=True)

In [170]:
# Preview the first rows of df_discorgs after the unused columns were dropped.
df_discorgs.head()

Unnamed: 0,Artist,Title,Label,Format,Released,Date Added
0,Maxi Priest,Bonafide,10 Records,"LP, Album",1990,2020-06-12 19:31:31
1,Inner City,Ain't Nobody Better,"10 Records, Virgin","12"", Single",1989,2020-06-08 12:29:11
2,Melodie Mc,I Wanna Dance,12INC,"12"", Maxi",1993,2020-06-12 20:03:57
3,The Alan Parsons Project,Tales Of Mystery And Imagination,"20th Century Records, Philips","LP, Album, Gat",1977,2020-06-01 20:13:23
4,John Themis,Sirens,4AD,"LP, Album",1983,2020-06-10 13:19:35


In [171]:
# (rows, columns) of the trimmed df_discorgs; the row count is the baseline for the merge.
df_discorgs.shape

(3121, 6)

In [105]:
# Preview df_home after cleaning; missing values now read 'No info'.
df_home.head()

Unnamed: 0,Artist,Title,Released,Genre,Country,Label
0,? (Question Mark) & The Mysterians,96 Tears,1966,Rock,Germany,Ariola
1,2 Men A Drum Machine And A Trumpet,Tired Of Getting Pushed Around,1987,Electronic,UK,London Records
2,Α. Ρεπάνης / Χρηστάκης,Απόψε Σε Θυμήθηκα / Το Ταβερνάκι,1967,Λαικό,Greece,Sonata
3,Άκη Λυμούρη,Φεγγαράκι Μου Λαμπρό / Τσιριτρό,No info,Children's,Greece,RCA Victor
4,Αλέκος Κιτσάκης,Καταραμένο Φρούριο / Ο Κωσταντής Ο Μερακλής,1970,Λαικό,Greece,Minos


In [172]:
# (rows, columns) of the cleaned df_home.
df_home.shape

(3390, 6)

## Merge dataframes

In [174]:
# Inner join on 'Title' alone: only titles present in both tables survive.
# Columns that exist on both sides keep their left-hand name; the right-hand
# copy gets the '_y' suffix.
# NOTE(review): 'Title' is not guaranteed to be unique in either table, so a
# single title can match several rows and multiply the result (and pair a record
# with another release's Genre/Country) — confirm this is acceptable.
df_merged = pd.merge(
    df_discorgs,
    df_home,
    how='inner',
    on='Title',
    suffixes=('', '_y'),
)
df_merged

Unnamed: 0,Artist,Title,Label,Format,Released,Date Added,Artist_y,Released_y,Genre,Country,Label_y
0,Maxi Priest,Bonafide,10 Records,"LP, Album",1990,2020-06-12 19:31:31,Maxi Priest ‎,1990,Pop Rap,Greece,10 Records
1,Inner City,Ain't Nobody Better,"10 Records, Virgin","12"", Single",1989,2020-06-08 12:29:11,Inner City,1989,House,Greece,Virgin
2,Melodie Mc,I Wanna Dance,12INC,"12"", Maxi",1993,2020-06-12 20:03:57,Melodie MC,1993,Euro House,Sweden,12INC
3,John Themis,Sirens,4AD,"LP, Album",1983,2020-06-10 13:19:35,John Themis,1983,Contemporary Jazz,Greece,4AD
4,The Breeders,Cannonball,4AD,"12"", Single",1993,2020-06-03 06:30:12,"Breeders,The ‎",1993,Altemative Rock,UK,4AD
...,...,...,...,...,...,...,...,...,...,...,...
3294,Manos Hadjidakis,Sweet Movie,"Πολύτροπον, Lyra","LP, Album",1974,2020-05-31 08:31:18,Μάνος Χατζιδάκις,1974,Éntekhno,Greece,Πολύτροπον
3295,Георги Димитров,Τελικός Λόγος Στη Δίκη Της Λειψίας-1933,Балкантон,"LP, Mono",0,2020-05-30 14:01:20,Γκεόργκι Ντιμιτρόφ,No info,Non-Music,Bulgaria,Балкантон
3296,Various,Soviet Melodies,Мелодия,"LP, Comp, Exp",1972,2021-08-15 14:55:21,Various,1972,Classical,USSR,Мелодия
3297,Нани Брегвадзе,Nani Bregvadze,Мелодия,LP,0,2020-06-13 17:03:30,Nani Bregvadze,1971,Ballad,USSR,Мелодия


In [175]:
# (rows, columns) after the merge. A row count above that of df_discorgs means
# some rows matched more than one df_home row on 'Title'.
df_merged.shape

(3299, 11)

#### Drop duplicated columns and rearrange columns

In [159]:
# Discard the right-hand copies created by the merge (names ending in '_y');
# the un-suffixed left-hand columns are the ones that remain.
df_merged.drop(columns=df_merged.filter(regex='_y$').columns.tolist(), inplace=True)

# Column order for the final table. reindex() does not fail on a name that is
# absent from the frame — it adds that column filled with NaN.
cols = ['Title', 'Artist', 'Genre', 'Released', 'Country', 'Label', 'Format', 'Date Added']
df_merged = df_merged.reindex(columns=cols)

In [160]:
# Confirm the suffixed columns are gone and the new column order is in place.
df_merged.head()

Unnamed: 0,Title,Artist,Genre,Released,Country,Label,Format,Date Added
0,Bonafide,Maxi Priest,Pop Rap,1990,Greece,10 Records,"LP, Album",2020-06-12 19:31:31
1,Ain't Nobody Better,Inner City,House,1989,Greece,"10 Records, Virgin","12"", Single",2020-06-08 12:29:11
2,I Wanna Dance,Melodie Mc,Euro House,1993,Sweden,12INC,"12"", Maxi",2020-06-12 20:03:57
3,Sirens,John Themis,Contemporary Jazz,1983,Greece,4AD,"LP, Album",2020-06-10 13:19:35
4,Cannonball,The Breeders,Altemative Rock,1993,UK,4AD,"12"", Single",2020-06-03 06:30:12


#### Check for duplicated rows and drop them

In [161]:
# Row count before de-duplication, for comparison with the count afterwards.
df_merged.shape

(3299, 8)

In [162]:
# Number of rows that repeat an earlier row exactly (all columns equal).
df_merged.duplicated().sum()

63

In [163]:
# Keep the first occurrence of each fully identical row and renumber the
# index 0..n-1 in the same in-place call.
df_merged.drop_duplicates(keep='first', inplace=True, ignore_index=True)

In [164]:
# Row count after de-duplication.
df_merged.shape

(3236, 8)

## Export to an Excel file

In [165]:
# Write the final table to 'gopera_collection.xlsx'; index=False leaves the row index out of the sheet.
df_merged.to_excel('gopera_collection.xlsx', index=False)