# Load Data & Wrangle

## Libraries

In [None]:
import pandas as pd

## Data Wrangling

### Load Data

In [None]:
# Load the Harvey correlation-matrix CSV; the file is decoded as cp1252.
df_001 = pd.read_csv(
    "Harvey_CorrelationMatrix_v002.csv",
    encoding="cp1252",
)

### Structure

In [None]:
# Report the dimensions of the loaded frame (rows first, then columns).
print(f"The training set has {df_001.shape[0]} rows and {df_001.shape[1]} columns")

In [None]:
# Preview the first rows of the frame (head() defaults to 5) to sanity-check the load.
print(df_001.head())

## Missing Values

### How many features/attributes have missing values? Which ones?

In [None]:
# Columns containing at least one missing value; computed once and reused
# for both the count and the listing.
cols_with_missing = df_001.columns[df_001.isnull().any()]

print("A total of ", len(cols_with_missing), "features have missing values")
print("They are:", cols_with_missing)

### What is the percentage of missing values?

In [None]:
# Fraction of missing entries per feature: missing count / total row count.
miss = df_001.isnull().sum() / len(df_001)

# Keep only the features that actually have missing entries, smallest fraction first.
miss = miss[miss > 0].sort_values()

miss  # display

### Visualising missing values

In [None]:
# Turn the `miss` Series into a one-column frame called 'count', label the
# index 'Name', and mirror the index into a regular 'Name' column so both
# can be referenced by name when plotting.
miss = miss.to_frame(name='count')
miss.index.name = 'Name'
miss['Name'] = miss.index

# Bar chart: one bar per feature, feature names rotated to stay readable.
sns.set(style="whitegrid", color_codes=True)
sns.barplot(x='Name', y='count', data=miss)
plt.xticks(rotation=90)
plt.show()

### Replace Missing Values (Tutorial_HousingPrices)

In [None]:
# Group the training-set LotFrontage values by neighbourhood. This is a
# SeriesGroupBy object (not a plain Series): one group of values per
# neighbourhood.
lot_frontage_by_neighborhood = train['LotFrontage'].groupby(train['Neighborhood'])

# Median LotFrontage of each neighbourhood, indexed by neighbourhood name.
neighborhood_medians = lot_frontage_by_neighborhood.median()

# Fill every missing LotFrontage in `alldata` with the median of the row's
# neighbourhood in one vectorised pass. Rows whose neighbourhood has no
# median in `train` map to NaN and are therefore left missing, exactly as
# the per-neighbourhood loop would have left them.
alldata['LotFrontage'] = alldata['LotFrontage'].fillna(
    alldata['Neighborhood'].map(neighborhood_medians)
)

### Data Distribution

In [None]:
# Histogram/density of the target variable.
# NOTE(review): `distplot` is reported as deprecated by recent seaborn
# releases - confirm against the installed version before relying on it.
sns.distplot(train['SalePrice'])

print(f"The skewness of SalePrice is {train['SalePrice'].skew()}")

# This message is fixed text; it is not derived from the skewness printed above.
print("The distribution has a right skew. Needs to be normalized.")

**Perform Log Transform**

In [None]:
# Natural-log transform of SalePrice, then re-check skewness and distribution.
target = np.log(train['SalePrice'])
print(f"Skewness is {target.skew()}")
sns.distplot(target)

### Separate Numeric and Categorical Variables

In [None]:
# Split the training frame by dtype: numeric columns vs. everything else.
numeric_data = train.select_dtypes(include=[np.number])
cat_data = train.select_dtypes(exclude=[np.number])

print(
    f"There are {numeric_data.shape[1]} numeric and {cat_data.shape[1]} "
    "categorical columns in train data"
)

### Correlogram

In [None]:
# Pairwise correlation matrix of the numeric columns.
corr = numeric_data.corr()
# Heatmap with a diverging palette centred on 0, so positive and negative
# correlations fall on opposite sides of the colour scale.
sns.heatmap(corr, cmap="PiYG", center=0)

In [None]:
# Correlation of each numeric feature with SalePrice, highest first.
# Sorted once and sliced twice instead of re-sorting for each print.
saleprice_corr = corr['SalePrice'].sort_values(ascending=False)

print(saleprice_corr[:15], '\n')  # top 15 values
print('----------------------')
print(saleprice_corr[-5:])  # last 5 values

### Inspect Specific Features

Numeric Variables

In [None]:
# Distinct values present in the OverallQual column.
train['OverallQual'].unique()

In [None]:
# Median GarageArea for each GarageCars value.
# The string alias 'median' replaces np.median: pandas resolves both to the
# same aggregation, but newer releases warn when the NumPy callable is passed.
pivot = train.pivot_table(index='GarageCars', values='GarageArea', aggfunc='median')
pivot

In [None]:
# Bar chart of the `pivot` table (one bar per index value).
pivot.plot(kind='bar', color='goldenrod')

In [None]:
# Joint plot of GrLivArea (x axis) against SalePrice (y axis).
sns.jointplot(x=train['GrLivArea'], y=train['SalePrice'])

Categorical Variables

In [None]:
# Summary statistics for the non-numeric columns held in `cat_data`.
cat_data.describe()

In [None]:
# Median SalePrice for each SaleCondition value.
# The string alias 'median' replaces np.median: pandas resolves both to the
# same aggregation, but newer releases warn when the NumPy callable is passed.
sp_pivot = train.pivot_table(index='SaleCondition', values='SalePrice', aggfunc='median')
sp_pivot

In [None]:
# Bar chart of the `sp_pivot` table (one bar per SaleCondition value).
sp_pivot.plot(kind='bar',color='teal')

ANOVA: Categorical Variables

In [None]:
# Names of the object-dtype columns of `train`, in column order.
cat = [name for name, dtype in train.dtypes.items() if dtype == 'object']
def anova(frame, features=None, target='SalePrice'):
    """Rank categorical features by one-way ANOVA p-value against a target.

    For each feature the rows of ``frame`` are split by category and
    ``scipy.stats.f_oneway`` is run on the per-category ``target`` values.

    Args:
        frame: DataFrame holding the categorical columns and ``target``.
        features: Column names to test. Defaults to every object-dtype
            column of ``frame`` other than ``target``.
        target: Name of the column whose values are compared across
            categories.

    Returns:
        DataFrame with columns ``features`` and ``pval``, sorted by
        ascending ``pval``. A feature with fewer than two non-empty
        categories gets a NaN p-value instead of raising.
    """
    # Imported locally so the function does not rely on a notebook-level
    # `stats` name being defined before this cell runs.
    from scipy import stats

    if features is None:
        features = [
            name for name, dtype in frame.dtypes.items()
            if name != target and dtype == object
        ]
    else:
        features = list(features)

    pvals = []
    for col in features:
        # groupby leaves out rows whose category is missing, so a NaN level
        # never becomes an empty sample (an empty sample makes f_oneway
        # return NaN for the whole feature).
        samples = [
            values.values
            for _, values in frame.groupby(col, sort=False)[target]
            if len(values)
        ]
        if len(samples) < 2:
            # f_oneway needs at least two groups to compare.
            pvals.append(float('nan'))
        else:
            pvals.append(stats.f_oneway(*samples)[1])

    result = pd.DataFrame({'features': features, 'pval': pvals})
    return result.sort_values('pval')

# Attach the target column so anova() can split SalePrice by category.
cat_data['SalePrice'] = train.SalePrice.values
k = anova(cat_data)
# log(1 / p-value): the smaller the p-value, the larger the disparity score.
k['disparity'] = np.log(1./k['pval'].values)
sns.barplot(data=k, x='features', y='disparity')
plt.xticks(rotation=90)
# Render the figure explicitly; a bare `plt` expression only echoes the module.
plt.show()

### Drop NAs

In [None]:
# Discard every row that has at least one missing value (row-wise is the
# dropna default).
df_002 = df_001.dropna()

print(df_002.shape)
print(df_002.head())

### Drop Columns

In [None]:
# Remove the listed columns from the NA-free frame.
df_003 = df_002.drop(
    columns=['Codes', 'Lat_dm', 'Long_dm', 'Lat_dd', 'Long_dd', 'Wind_Speed'],
)

print(df_003.shape)
print(df_003.head())