### Merge and clean datasets

In [1]:
import pandas as pd

#### Load the dataframe

In [2]:
def load_data(path='data/data.xlsx'):
    """Read the records spreadsheet into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'data/data.xlsx'
        Location of the Excel workbook. The default is the location that
        used to be hard-coded, so existing ``load_data()`` calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_excel`` yields for ``path`` with its default
        options (first sheet, first row used as the column header).
    """
    return pd.read_excel(path)

In [4]:
# Load the workbook once; every later cell works on (and rebinds) `df`.
df = load_data()

In [5]:
# Display the frame exactly as returned by load_data().
df

Unnamed: 0,? (Question Mark) & The Mysterians,96 Tears,Vinyl 45,1966,Rock,Germany,19 252 AT,Ariola,Unnamed: 8,https://www.discogs.com/release/2271800--Question-Mark-The-Mysterians-96-Tears,Unnamed: 10,Unnamed: 11
0,"""Boyan"" Russian Folk Orchestra*",Barynya,Vinyl 33,1980,Folk,USSR,C20---12949-50,Мелодия,,https://www.discogs.com/release/4777841-Boyan-...,,
1,"""D"" Train",You're The One For Me,Vinyl 33,1988,Disco,Canada,SPEC 1237,Prelude Records,,https://www.discogs.com/release/1814321-D-Trai...,,
2,"""Salad Days"" Original Cast",Selection Of The Songs From Salad Days,Vinyl 33,1954,Soundtrack,UK,MG 20004,Oriole,,https://www.discogs.com/release/3757702-Salad-...,,
3,(pH)2,Jeronymo,Vinyl 33,1989,Pop,Greece,10350,Music Box International,,https://www.discogs.com/release/5132510-pH2-Je...,,
4,10 c.c.*,Silly Love,Vinyl 45,1974,Pop Rock,UK,UK 77,UK Records,,https://www.discogs.com/release/7486955-10-cc-...,,
...,...,...,...,...,...,...,...,...,...,...,...,...
6228,П. Чайковский*,Симфония № 6 / Symphony No. 6,Vinyl 33,1988,Classical,USSR,C10-16225-6,Мелодия,,https://www.discogs.com/release/14425596-%D0%9...,,
6229,"П. Чайковский* = P. Tchaikovsky*, Государствен...",Литургия,Vinyl 33,1990,Classical,USSR,А10 00637 005,Мелодия,,https://www.discogs.com/release/5337445-%D0%9F...,,
6230,Русский Сувенир,Russian Souvenir,Vinyl 45,1982,Folk,USSR,С92 — 17619-20,Мелодия,,https://www.discogs.com/release/5019118-%D0%A0...,https://www.discogs.com/release/5019118-%D0%A0...,
6231,"Сергей Прокофьев* - Ицхак Перльман*, BBC Symph...",Concertos For Violin And Orchestra No. 1 And 2,Vinyl 33,1983,Classical,USSR,A10 00021 002,Мелодия,,https://www.discogs.com/release/3948458-%D0%A1...,https://www.discogs.com/release/3948458-%D0%A1...,


## Clean data

#### Drop unused columns and replace NAs

In [4]:
# Remove the catalogue-number column together with every column whose label
# matches 'Unnamed'.
# A single non-inplace drop keeps the cell re-runnable: errors='ignore' means a
# second execution (when 'CAT#' is already gone) is a no-op instead of a KeyError.
unused_cols = ['CAT#', *df.filter(regex='Unnamed').columns]
df = df.drop(columns=unused_cols, errors='ignore')

In [5]:
def trim_all_columns(df):
    """Strip leading/trailing whitespace from every string cell of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape in which each ``str`` value has been
        passed through ``str.strip()``; non-string values are left as they are.
    """
    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 (FutureWarning) in
    # favour of DataFrame.map. Check the class rather than the instance so a
    # column that happens to be called "map" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(_strip_if_str)
    return df.applymap(_strip_if_str)
# Columns kept for the cleaned dataset, in the order they should appear.
cols = ['Title', 'Artist', 'Type', 'Genre', 'Released', 'Country', 'Label']
# Trim whitespace in every string cell, then project onto `cols`.
# reindex does not raise for a label that is absent from the frame; it adds
# that column filled with NaN.
df = trim_all_columns(df).reindex(cols, axis='columns')


In [6]:
# Missing release years are replaced by 0 so the column can be cast to int.
df = df.assign(Released=df['Released'].fillna(0).astype(int))

In [7]:
# Display the frame after trimming, column selection and the Released cast.
df

Unnamed: 0,Title,Artist,Type,Genre,Released,Country,Label
0,96 Tears,? (Question Mark) & The Mysterians,Vinyl 45,Rock,1966,Germany,Ariola
1,You're The One For Me,"""D"" Train",Vinyl 33,Disco,1988,Canada,Prelude Records
2,Play The Blues,101 Strings,Vinyl 33,Blues,1958,USA,Pye Golden Guinea Records
3,The Soul Of Spain,101 Strings,Vinyl 33,Classical,1959,UK,Pye Golden Guinea Records
4,10 CC's Greatest Hits 1972-1978,10cc,Vinyl 33,Art Rock,1979,Greece,Mercury
...,...,...,...,...,...,...,...
3629,Romance,Zamfir,Vinyl 33,Easy Listening,1982,Greece,Philips
3630,Classics By Candlelight,Zamfir / van Hoof Orchestra,Vinyl 33,Easy Listening,1980,UK,Philips
3631,Alte Märsche,Zentrales Orchester Des Ministeriums Des Innern,Vinyl 33,Marches,1966,GDR,ETERNA
3632,The Best Of ZZ Top,ZZ Top,Vinyl 33,Blues Rock,1977,Greece,Warner Bros


In [8]:
# First five rows of the cleaned frame.
df.head()

Unnamed: 0,Title,Artist,Type,Genre,Released,Country,Label
0,96 Tears,? (Question Mark) & The Mysterians,Vinyl 45,Rock,1966,Germany,Ariola
1,You're The One For Me,"""D"" Train",Vinyl 33,Disco,1988,Canada,Prelude Records
2,Play The Blues,101 Strings,Vinyl 33,Blues,1958,USA,Pye Golden Guinea Records
3,The Soul Of Spain,101 Strings,Vinyl 33,Classical,1959,UK,Pye Golden Guinea Records
4,10 CC's Greatest Hits 1972-1978,10cc,Vinyl 33,Art Rock,1979,Greece,Mercury


In [9]:
# (rows, columns) before duplicates are removed.
df.shape

(3634, 7)

#### Check for duplicated rows and drop them

In [10]:
# Number of rows that repeat an earlier row on all of the listed columns.
df.duplicated(
    subset=['Title', 'Artist', 'Genre', 'Released', 'Country', 'Label', 'Type'],
).sum()

3

In [11]:
# keep=False flags every member of a duplicate group (not only the repeats),
# so `dups` holds all copies side by side for inspection.
dups = df[
    df.duplicated(
        subset=['Title', 'Artist', 'Genre', 'Released', 'Country', 'Label', 'Type'],
        keep=False,
    )
]

In [12]:
# Show every row that belongs to a duplicate group.
dups

Unnamed: 0,Title,Artist,Type,Genre,Released,Country,Label
1445,"Mad, Bad And Dangerous To Know",Dead Or Alive,Vinyl 33,Synth-pop,1986,Greece,Epic
1446,"Mad, Bad And Dangerous To Know",Dead Or Alive,Vinyl 33,Synth-pop,1986,Greece,Epic
1447,Out Of The Blue,Debbie Gibson,Vinyl 33,Synth-pop,1987,Greece,Atlantic
1448,Out Of The Blue,Debbie Gibson,Vinyl 33,Synth-pop,1987,Greece,Atlantic
3282,Η Ελλάδα Τραγουδάει..,Various,Vinyl 33,Folk,1979,Greece,Columbia
3283,Η Ελλάδα Τραγουδάει..,Various,Vinyl 33,Folk,1979,Greece,Columbia


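> All copies of each duplicated row are shown (`keep=False`) so they can be inspected and confirmed as data-entry errors before being dropped.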

In [13]:
# Keep the first occurrence of each duplicated record, then renumber the rows
# 0..n-1 so the index has no gaps where the repeats were removed.
df = (
    df
    .drop_duplicates(
        subset=['Title', 'Artist', 'Genre', 'Released', 'Country', 'Label', 'Type'],
        keep='first',
    )
    .reset_index(drop=True)
)

In [14]:
# (rows, columns) after duplicates are removed.
df.shape

(3631, 7)

In [15]:
# Display the de-duplicated, re-indexed frame.
df

Unnamed: 0,Title,Artist,Type,Genre,Released,Country,Label
0,96 Tears,? (Question Mark) & The Mysterians,Vinyl 45,Rock,1966,Germany,Ariola
1,You're The One For Me,"""D"" Train",Vinyl 33,Disco,1988,Canada,Prelude Records
2,Play The Blues,101 Strings,Vinyl 33,Blues,1958,USA,Pye Golden Guinea Records
3,The Soul Of Spain,101 Strings,Vinyl 33,Classical,1959,UK,Pye Golden Guinea Records
4,10 CC's Greatest Hits 1972-1978,10cc,Vinyl 33,Art Rock,1979,Greece,Mercury
...,...,...,...,...,...,...,...
3626,Romance,Zamfir,Vinyl 33,Easy Listening,1982,Greece,Philips
3627,Classics By Candlelight,Zamfir / van Hoof Orchestra,Vinyl 33,Easy Listening,1980,UK,Philips
3628,Alte Märsche,Zentrales Orchester Des Ministeriums Des Innern,Vinyl 33,Marches,1966,GDR,ETERNA
3629,The Best Of ZZ Top,ZZ Top,Vinyl 33,Blues Rock,1977,Greece,Warner Bros


## Export to Excel file

In [16]:
# Write the cleaned frame to a workbook in the working directory; index=False
# leaves the row index out of the sheet.
df.to_excel(excel_writer='clean_data.xlsx', index=False)