# 3. Refine the Data
 
> "Data is messy"

We will be performing the following operation on our Onion price to refine it
- **Remove** e.g. remove redundant data from the data frame
- **Derive** e.g. State and City from the market field
- **Parse** e.g. extract date from year and month column

Other stuff you may need to do to refine are...
- **Missing** e.g. Check for missing or incomplete data
- **Quality** e.g. Check for duplicates, accuracy, unusual data

In [1]:
# Import the two library we need, which is Pandas and Numpy
import pandas as pd
import numpy as np

In [2]:
# Read the csv file of Month Wise Market Arrival data that has been scraped.
df = pd.read_csv('MonthWiseMarketArrivals.csv')

In [3]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
0,ABOHAR(PB),January,2005,2350,404,493,446
1,ABOHAR(PB),January,2006,900,487,638,563
2,ABOHAR(PB),January,2010,790,1283,1592,1460
3,ABOHAR(PB),January,2011,245,3067,3750,3433
4,ABOHAR(PB),January,2012,1035,523,686,605


In [4]:
df.tail()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
11678,YEOLA(MS),December,2013,215883,472,1427,1177
11679,YEOLA(MS),December,2014,201077,446,1654,1456
11680,YEOLA(MS),December,2015,223315,609,1446,1126
11681,YEOLA(MS),December,2016,214937,256,753,634
11682,,,Total,862952226,636(Avg),1187(Avg),962(Avg)


## REMOVE the redundant data

In [5]:
df.dtypes

market      object
month       object
year        object
quantity     int64
priceMin    object
priceMax    object
priceMod    object
dtype: object

In [6]:
# Delete the last row from the dataframe
df.tail(1)

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
11682,,,Total,862952226,636(Avg),1187(Avg),962(Avg)


In [7]:
# Delete a row from the dataframe
df.drop(df.tail(1).index, inplace = True)

In [8]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
0,ABOHAR(PB),January,2005,2350,404,493,446
1,ABOHAR(PB),January,2006,900,487,638,563
2,ABOHAR(PB),January,2010,790,1283,1592,1460
3,ABOHAR(PB),January,2011,245,3067,3750,3433
4,ABOHAR(PB),January,2012,1035,523,686,605


In [9]:
df.tail()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
11677,YEOLA(MS),December,2012,207066,485,1327,1136
11678,YEOLA(MS),December,2013,215883,472,1427,1177
11679,YEOLA(MS),December,2014,201077,446,1654,1456
11680,YEOLA(MS),December,2015,223315,609,1446,1126
11681,YEOLA(MS),December,2016,214937,256,753,634


In [10]:
df.dtypes

market      object
month       object
year        object
quantity     int64
priceMin    object
priceMax    object
priceMod    object
dtype: object

In [11]:
df.iloc[:,4:7].head()

Unnamed: 0,priceMin,priceMax,priceMod
0,404,493,446
1,487,638,563
2,1283,1592,1460
3,3067,3750,3433
4,523,686,605


In [12]:
df.iloc[:,2:7] = df.iloc[:,2:7].astype(int)

In [13]:
df.dtypes

market      object
month       object
year         int64
quantity     int64
priceMin     int64
priceMax     int64
priceMod     int64
dtype: object

In [14]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
0,ABOHAR(PB),January,2005,2350,404,493,446
1,ABOHAR(PB),January,2006,900,487,638,563
2,ABOHAR(PB),January,2010,790,1283,1592,1460
3,ABOHAR(PB),January,2011,245,3067,3750,3433
4,ABOHAR(PB),January,2012,1035,523,686,605


In [15]:
df.describe()

Unnamed: 0,year,quantity,priceMin,priceMax,priceMod
count,11682.0,11682.0,11682.0,11682.0,11682.0
mean,2009.900531,73870.25,636.061804,1186.928009,962.210923
std,4.708927,124106.4,643.968747,936.2749,780.775832
min,1996.0,8.0,16.0,130.0,80.0
25%,2006.0,7936.75,215.0,593.25,470.0
50%,2010.0,25973.5,445.0,928.0,745.0
75%,2014.0,83300.0,825.0,1450.0,1191.0
max,2017.0,1974018.0,6000.0,12000.0,7800.0


## Check for MISSING VALUES

In [17]:
df.isnull().head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False


In [20]:
df.isnull().describe()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod
count,11682,11682,11682,11682,11682,11682,11682
unique,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False
freq,11682,11682,11682,11682,11682,11682,11682


## DERIVE the states from market names

In [22]:
df.market.value_counts().head()

LASALGAON(MS)     253
PIMPALGAON(MS)    235
MANMAD(MS)        229
LONAND(MS)        222
MAHUVA(GUJ)       221
Name: market, dtype: int64

In [23]:
df['state'] = df.market.str.split('(').str[-1]

In [24]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state
0,ABOHAR(PB),January,2005,2350,404,493,446,PB)
1,ABOHAR(PB),January,2006,900,487,638,563,PB)
2,ABOHAR(PB),January,2010,790,1283,1592,1460,PB)
3,ABOHAR(PB),January,2011,245,3067,3750,3433,PB)
4,ABOHAR(PB),January,2012,1035,523,686,605,PB)


In [25]:
df['city'] = df.market.str.split('(').str[0]

In [26]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
0,ABOHAR(PB),January,2005,2350,404,493,446,PB),ABOHAR
1,ABOHAR(PB),January,2006,900,487,638,563,PB),ABOHAR
2,ABOHAR(PB),January,2010,790,1283,1592,1460,PB),ABOHAR
3,ABOHAR(PB),January,2011,245,3067,3750,3433,PB),ABOHAR
4,ABOHAR(PB),January,2012,1035,523,686,605,PB),ABOHAR


In [27]:
df.state.unique()

array(['PB)', 'UP)', 'GUJ)', 'MS)', 'OR)', 'RAJ)', 'WB)', 'BANGALORE',
       'KNT)', 'BHOPAL', 'BHR)', 'Telangana)', 'BULANDSHAHR', 'KER)',
       'CHANDIGARH', 'CHENNAI', 'bellary)', 'podisu)', 'UTT)', 'DELHI',
       'Others)', 'MP)', 'TN)', 'Podis', 'HR)', 'TELANGANA)', 'GUWAHATI',
       'AS)', 'HYDERABAD', 'IMPHAL', 'JAIPUR', 'WHITE)', 'JAMMU',
       'KOLKATA', 'HP)', 'AP)', 'LUCKNOW', 'MUMBAI', 'NAGPUR', 'PATNA',
       'M.P.)', 'RJ)', 'CHATT)', 'CHGARH)', 'JH)', 'SHAHJAHANPUR',
       'SHIMLA', 'SRINAGAR', 'TRIVENDRUM'], dtype=object)

In [28]:
df['state'] = df.state.str.split(')').str[0]

In [33]:
df.state.unique()

array(['PB', 'UP', 'GUJ', 'MS', 'OR', 'RAJ', 'WB', 'BANGALORE', 'KNT',
       'BHOPAL', 'BHR', 'Telangana', 'BULANDSHAHR', 'KER', 'CHANDIGARH',
       'CHENNAI', 'bellary', 'podisu', 'UTT', 'DELHI', 'Others', 'MP',
       'TN', 'Podis', 'HR', 'TELANGANA', 'GUWAHATI', 'AS', 'HYDERABAD',
       'IMPHAL', 'JAIPUR', 'WHITE', 'JAMMU', 'KOLKATA', 'HP', 'AP',
       'LUCKNOW', 'MUMBAI', 'NAGPUR', 'PATNA', 'M.P.', 'RJ', 'CHATT',
       'CHGARH', 'JH', 'SHAHJAHANPUR', 'SHIMLA', 'SRINAGAR', 'TRIVENDRUM'], dtype=object)

In [30]:
dfState = df.groupby(['state', 'market'], as_index=False).count()

In [31]:
dfState.market.unique()

array(['KURNOOL(AP)', 'MADANAPALLE (AP)', 'MULAKALACHERUVU (AP)',
       'MYDUKUR(AP)', 'PALAMANER (AP)', 'RAJAHMUNDRY(AP)',
       'SADASIVPET(AP)', 'HOWLY (AS)', 'BANGALORE', 'BHOPAL',
       'BIHARSHARIF(BHR)', 'BULANDSHAHR', 'CHANDIGARH', 'RAIGARH (CHATT)',
       'TIPHRA (CHATT)', 'CHENNAI', 'RAIPUR(CHGARH)', 'DELHI',
       'AHMEDABAD(GUJ)', 'BHAVNAGAR(GUJ)', 'DEESA(GUJ)', 'GONDAL(GUJ)',
       'JAMNAGAR(GUJ)', 'MAHUVA(GUJ)', 'RAJKOT(GUJ)', 'SURAT(GUJ)',
       'VADODARA(GUJ)', 'GUWAHATI', 'KULLU(HP)', 'SOLAN (HP)',
       'FARIDABAD (HR)', 'GURGAON(HR)', 'KARNAL(HR)', 'HYDERABAD',
       'IMPHAL', 'JAIPUR', 'JAMMU', 'RANCHI(JH)', 'CHALA (KER)',
       'PALAYAM(KER)', 'BELGAUM(KNT)', 'BIJAPUR(KNT)', 'BINNY MILL (KNT)',
       'CHALLAKERE(KNT)', 'CHICKBALLAPUR(KNT)', 'DHAVANGERE(KNT)',
       'GUNDLUPET (KNT)', 'HASSAN(KNT)', 'HUBLI(KNT)', 'KOLAR(KNT)',
       'MYSORE -BANDIPALYA (KNT)', 'RAICHUR(KNT)', 'SRINIVASAPUR (KNT)',
       'SRINIVASPUR (KNT)', 'KOLKATA', 'LUCKNOW', 'PIPLY

In [32]:
df.state.unique()

array(['PB', 'UP', 'GUJ', 'MS', 'OR', 'RAJ', 'WB', 'BANGALORE', 'KNT',
       'BHOPAL', 'BHR', 'Telangana', 'BULANDSHAHR', 'KER', 'CHANDIGARH',
       'CHENNAI', 'bellary', 'podisu', 'UTT', 'DELHI', 'Others', 'MP',
       'TN', 'Podis', 'HR', 'TELANGANA', 'GUWAHATI', 'AS', 'HYDERABAD',
       'IMPHAL', 'JAIPUR', 'WHITE', 'JAMMU', 'KOLKATA', 'HP', 'AP',
       'LUCKNOW', 'MUMBAI', 'NAGPUR', 'PATNA', 'M.P.', 'RJ', 'CHATT',
       'CHGARH', 'JH', 'SHAHJAHANPUR', 'SHIMLA', 'SRINAGAR', 'TRIVENDRUM'], dtype=object)

In [34]:
state_now = ['PB', 'UP', 'GUJ', 'MS', 'OR', 'RAJ', 'WB', 'BANGALORE', 'KNT',
       'BHOPAL', 'BHR', 'Telangana', 'BULANDSHAHR', 'KER', 'CHANDIGARH',
       'CHENNAI', 'bellary', 'podisu', 'UTT', 'DELHI', 'Others', 'MP',
       'TN', 'Podis', 'HR', 'TELANGANA', 'GUWAHATI', 'AS', 'HYDERABAD',
       'IMPHAL', 'JAIPUR', 'WHITE', 'JAMMU', 'KOLKATA', 'HP', 'AP',
       'LUCKNOW', 'MUMBAI', 'NAGPUR', 'PATNA', 'M.P.', 'RJ', 'CHATT',
       'CHGARH', 'JH', 'SHAHJAHANPUR', 'SHIMLA', 'SRINAGAR', 'TRIVENDRUM']

In [35]:
state_new =['PB', 'UP', 'GJ', 'MH', 'OR', 'RJ', 'WB', 'KT', 'KT',
       'MP', 'BH', 'TL', 'UP', 'KR', 'CG',
       'TN', 'TN', 'TN', 'UP', 'DL', 'DL', 'MP',
       'TN', 'TN', 'HR', 'TEL', 'ASM', 'AS', 'AP',
       'MG', 'RJ', 'MH', 'JK', 'WB', 'HP', 'AP',
       'UP', 'MH', 'MH', 'BH', 'MP', 'RJ', 'CH',
       'CG', 'JH', 'UP', 'HP', 'JK', 'KR']

In [36]:
df.state = df.state.replace(state_now, state_new)

In [37]:
df.state.unique()

array(['PB', 'UP', 'GJ', 'MH', 'OR', 'RJ', 'WB', 'KT', 'MP', 'BH', 'TL',
       'KR', 'CG', 'TN', 'DL', 'HR', 'TEL', 'ASM', 'AS', 'AP', 'MG', 'JK',
       'HP', 'CH', 'JH'], dtype=object)

## PARSE TO get the Dates

In [38]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
0,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR
1,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR
2,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR
3,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR
4,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR


In [39]:
df.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            11672, 11673, 11674, 11675, 11676, 11677, 11678, 11679, 11680,
            11681],
           dtype='int64', length=11682)

In [40]:
pd.to_datetime('January 2012')

Timestamp('2012-01-01 00:00:00')

In [41]:
df['date'] = df['month'] + '-' + df['year'].map(str)

In [82]:
??map

In [42]:
df.head()

Unnamed: 0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city,date
0,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR,January-2005
1,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR,January-2006
2,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR,January-2010
3,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR,January-2011
4,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR,January-2012


In [43]:
index = pd.to_datetime(df.date)

In [44]:
df.index = pd.PeriodIndex(df.date, freq='M')

In [45]:
df.columns

Index(['market', 'month', 'year', 'quantity', 'priceMin', 'priceMax',
       'priceMod', 'state', 'city', 'date'],
      dtype='object')

In [46]:
df.index

PeriodIndex(['2005-01', '2006-01', '2010-01', '2011-01', '2012-01', '2013-01',
             '2014-01', '2015-01', '2017-01', '2005-02',
             ...
             '2007-12', '2008-12', '2009-12', '2010-12', '2011-12', '2012-12',
             '2013-12', '2014-12', '2015-12', '2016-12'],
            dtype='period[M]', name='date', length=11682, freq='M')

In [47]:
df.head()

Unnamed: 0_level_0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01,ABOHAR(PB),January,2005,2350,404,493,446,PB,ABOHAR,January-2005
2006-01,ABOHAR(PB),January,2006,900,487,638,563,PB,ABOHAR,January-2006
2010-01,ABOHAR(PB),January,2010,790,1283,1592,1460,PB,ABOHAR,January-2010
2011-01,ABOHAR(PB),January,2011,245,3067,3750,3433,PB,ABOHAR,January-2011
2012-01,ABOHAR(PB),January,2012,1035,523,686,605,PB,ABOHAR,January-2012


In [48]:
df.to_csv('MonthWiseMarketArrivals_Clean.csv', index = False)