In [1]:
# Importing Dependencies 
%matplotlib inline
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import auc
from matplotlib import style
from matplotlib import style
from sklearn.svm import SVC 

### Read the data
df = pd.read_csv("../Resources/Household-living-cost-price-indexes-September-2022-quarter-time-series-indexes.csv")
df.head()

Unnamed: 0,hlpi_name,series_ref,quarter,hlpi,nzhec,nzhec_name,nzhec_short,level,index,change.q,change.a
0,All households,HPIQ.SALL01,2008Q2,allhh,1,Food,Food,group,870,,
1,All households,HPIQ.SALL01,2008Q3,allhh,1,Food,Food,group,903,3.8,
2,All households,HPIQ.SALL01,2008Q4,allhh,1,Food,Food,group,918,1.7,
3,All households,HPIQ.SALL01,2009Q1,allhh,1,Food,Food,group,927,1.0,
4,All households,HPIQ.SALL01,2009Q2,allhh,1,Food,Food,group,935,0.9,7.5


In [2]:
# Checks Data Info
df.info

<bound method DataFrame.info of             hlpi_name   series_ref quarter   hlpi nzhec  nzhec_name  \
0      All households  HPIQ.SALL01  2008Q2  allhh     1        Food   
1      All households  HPIQ.SALL01  2008Q3  allhh     1        Food   
2      All households  HPIQ.SALL01  2008Q4  allhh     1        Food   
3      All households  HPIQ.SALL01  2009Q1  allhh     1        Food   
4      All households  HPIQ.SALL01  2009Q2  allhh     1        Food   
...               ...          ...     ...    ...   ...         ...   
49527  Superannuitant   HPIQ.SSUPA  2021Q3  super     A  All groups   
49528  Superannuitant   HPIQ.SSUPA  2021Q4  super     A  All groups   
49529  Superannuitant   HPIQ.SSUPA  2022Q1  super     A  All groups   
49530  Superannuitant   HPIQ.SSUPA  2022Q2  super     A  All groups   
49531  Superannuitant   HPIQ.SSUPA  2022Q3  super     A  All groups   

      nzhec_short       level  index  change.q  change.a  
0            Food       group    870       NaN       NaN

In [3]:
# Looking for Unique values in the quarter column
df["quarter"].unique()

array(['2008Q2', '2008Q3', '2008Q4', '2009Q1', '2009Q2', '2009Q3',
       '2009Q4', '2010Q1', '2010Q2', '2010Q3', '2010Q4', '2011Q1',
       '2011Q2', '2011Q3', '2011Q4', '2012Q1', '2012Q2', '2012Q3',
       '2012Q4', '2013Q1', '2013Q2', '2013Q3', '2013Q4', '2014Q1',
       '2014Q2', '2014Q3', '2014Q4', '2015Q1', '2015Q2', '2015Q3',
       '2015Q4', '2016Q1', '2016Q2', '2016Q3', '2016Q4', '2017Q1',
       '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2', '2018Q3',
       '2018Q4', '2019Q1', '2019Q2', '2019Q3', '2019Q4', '2020Q1',
       '2020Q2', '2020Q3', '2020Q4', '2021Q1', '2021Q2', '2021Q3',
       '2021Q4', '2022Q1', '2022Q2', '2022Q3'], dtype=object)

In [4]:
# Split the quarter column into quarter and year
df[["year", "quarter"]] = df["quarter"].str.split('Q', n=1, expand=True)
df.head()

Unnamed: 0,hlpi_name,series_ref,quarter,hlpi,nzhec,nzhec_name,nzhec_short,level,index,change.q,change.a,year
0,All households,HPIQ.SALL01,2,allhh,1,Food,Food,group,870,,,2008
1,All households,HPIQ.SALL01,3,allhh,1,Food,Food,group,903,3.8,,2008
2,All households,HPIQ.SALL01,4,allhh,1,Food,Food,group,918,1.7,,2008
3,All households,HPIQ.SALL01,1,allhh,1,Food,Food,group,927,1.0,,2009
4,All households,HPIQ.SALL01,2,allhh,1,Food,Food,group,935,0.9,7.5,2009


In [5]:
# Count of rows
df.count()

hlpi_name      49532
series_ref     49532
quarter        49532
hlpi           49532
nzhec          49532
nzhec_name     49532
nzhec_short    49532
level          49532
index          49532
change.q       48678
change.a       46116
year           49532
dtype: int64

In [6]:
# Count of null values
df.isna().sum()

hlpi_name         0
series_ref        0
quarter           0
hlpi              0
nzhec             0
nzhec_name        0
nzhec_short       0
level             0
index             0
change.q        854
change.a       3416
year              0
dtype: int64

In [7]:
# Drop Nulls then check for null values again
df = df.dropna()
df.isna().sum()

hlpi_name      0
series_ref     0
quarter        0
hlpi           0
nzhec          0
nzhec_name     0
nzhec_short    0
level          0
index          0
change.q       0
change.a       0
year           0
dtype: int64

In [8]:
# Display Data
df.head()

Unnamed: 0,hlpi_name,series_ref,quarter,hlpi,nzhec,nzhec_name,nzhec_short,level,index,change.q,change.a,year
4,All households,HPIQ.SALL01,2,allhh,1,Food,Food,group,935,0.9,7.5,2009
5,All households,HPIQ.SALL01,3,allhh,1,Food,Food,group,949,1.5,5.1,2009
6,All households,HPIQ.SALL01,4,allhh,1,Food,Food,group,929,-2.1,1.2,2009
7,All households,HPIQ.SALL01,1,allhh,1,Food,Food,group,938,1.0,1.2,2010
8,All households,HPIQ.SALL01,2,allhh,1,Food,Food,group,929,-1.0,-0.6,2010


In [11]:
# Renaming Columns 
df=df.rename(columns={"hlpi_name":"protocol id","nzhec_name":"expenditures","level":"groups","change.q":"quarterly","change.a":"annually"})
df.head()

Unnamed: 0,protocol id,series_ref,quarter,hlpi,nzhec,expenditures,nzhec_short,groups,index,quarterly,annually,year
4,All households,HPIQ.SALL01,2,allhh,1,Food,Food,group,935,0.9,7.5,2009
5,All households,HPIQ.SALL01,3,allhh,1,Food,Food,group,949,1.5,5.1,2009
6,All households,HPIQ.SALL01,4,allhh,1,Food,Food,group,929,-2.1,1.2,2009
7,All households,HPIQ.SALL01,1,allhh,1,Food,Food,group,938,1.0,1.2,2010
8,All households,HPIQ.SALL01,2,allhh,1,Food,Food,group,929,-1.0,-0.6,2010


In [12]:
# Drop extra columns
df_clean=df.drop(['hlpi', 'nzhec_short', 'nzhec'], axis=1)
df_clean.head()

Unnamed: 0,protocol id,series_ref,quarter,expenditures,groups,index,quarterly,annually,year
4,All households,HPIQ.SALL01,2,Food,group,935,0.9,7.5,2009
5,All households,HPIQ.SALL01,3,Food,group,949,1.5,5.1,2009
6,All households,HPIQ.SALL01,4,Food,group,929,-2.1,1.2,2009
7,All households,HPIQ.SALL01,1,Food,group,938,1.0,1.2,2010
8,All households,HPIQ.SALL01,2,Food,group,929,-1.0,-0.6,2010


In [15]:
# Recounting rows
rowcount = df.count()
rowcount

protocol id     46116
series_ref      46116
quarter         46116
hlpi            46116
nzhec           46116
expenditures    46116
nzhec_short     46116
groups          46116
index           46116
quarterly       46116
annually        46116
year            46116
dtype: int64

In [22]:
# Adding a unique ID column to DataFrame
df_clean['ID'] = np.arange(len(df_clean))
df_clean.head(20)

Unnamed: 0,protocol id,series_ref,quarter,expenditures,groups,index,quarterly,annually,year,ID
4,All households,HPIQ.SALL01,2,Food,group,935,0.9,7.5,2009,0
5,All households,HPIQ.SALL01,3,Food,group,949,1.5,5.1,2009,1
6,All households,HPIQ.SALL01,4,Food,group,929,-2.1,1.2,2009,2
7,All households,HPIQ.SALL01,1,Food,group,938,1.0,1.2,2010,3
8,All households,HPIQ.SALL01,2,Food,group,929,-1.0,-0.6,2010,4
9,All households,HPIQ.SALL01,3,Food,group,952,2.5,0.3,2010,5
10,All households,HPIQ.SALL01,4,Food,group,974,2.3,4.8,2010,6
11,All households,HPIQ.SALL01,1,Food,group,985,1.1,5.0,2011,7
12,All households,HPIQ.SALL01,2,Food,group,996,1.1,7.2,2011,8
13,All households,HPIQ.SALL01,3,Food,group,1013,1.7,6.4,2011,9


In [23]:
# Exporting the Clean Data
df_clean.to_csv('../Resources/Clean_Household_Data.csv', index=False)