# Worksheet for Correlation Analysis

In [None]:
import os.path
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm

### Download data and load into a dataframe

In [None]:
#### LOAD THE HELPER FUNCTIONS BELOW####
#### IMPORTANT: DO NOT change these functions, or your final submission will not evaluate correctly ####

## This downloads your data file. Do not change this function.
def downloadFile(dataSetId):
    """Download ``<dataSetId>.csv`` into the current working directory.

    The file is fetched from the fixed S3 URL prefix below and saved under
    the same file name. The URL and a progress message are printed.

    Args:
        dataSetId: Base name of the data set (without the ``.csv`` suffix).

    Returns:
        True once the file has been written (HTTP status 200); False when
        the server answers with any other status.

    Errors raised by ``urlopen`` itself are not caught here.
    """
    # Local import keeps this helper self-contained.
    from contextlib import closing

    fileName = '%s.csv' % (dataSetId)
    url = 'https://s3.us-east-2.amazonaws.com/qq10-data/' + fileName
    print(url)

    # closing() releases the connection on every exit path and does not
    # require the response object to be a context manager itself.
    with closing(urlopen(url)) as response:
        status = response.getcode()
        if status != 200:
            # NOTE(review): logError is not defined in the visible part of
            # this file; confirm it is provided elsewhere.
            logError('File not found. Please ensure you are working with correct data set Id')
            return False

        print('Downloading the dataset %s' % (fileName))
        # Write the payload byte-for-byte: no decode/re-encode round trip and
        # no newline translation, so the saved CSV does not depend on the
        # platform's default encoding.
        with open(fileName, 'wb') as f:
            f.write(response.read())
    return True


In [None]:
# Fetch the feature table only when no local copy exists, then load it.
filename = 'feature_data'
csv_path = '%s.csv' % filename
if not os.path.isfile(csv_path):
    downloadFile('%s' % filename)
df_train = pd.read_csv(csv_path)
# Last expression of the cell: displays the feature column labels.
df_train.columns

In [None]:
# Same pattern for the target variables: download on first use, then read.
filename = 'target_variable_data'
csv_path = '%s.csv' % filename
if not os.path.isfile(csv_path):
    downloadFile('%s' % filename)
y_train = pd.read_csv(csv_path)
# Last expression of the cell: displays the target column labels.
y_train.columns

#### View descriptive statistics of each dataset

In [None]:
# Summary statistics of df_train, transposed so each row is one column of the frame.
df_train.describe().transpose()

In [None]:
# Summary statistics of y_train, transposed so each row is one column of the frame.
y_train.describe().transpose()

#### Explore a single target variable

In [None]:
# Histogram of the A1 target column.
y_train['A1'].plot.hist()

In [None]:
# Distribution of A1 with a fitted normal curve overlaid for comparison.
# NOTE(review): distplot is marked deprecated in recent seaborn releases - confirm the target version.
sns.distplot(a=y_train['A1'], fit=norm)

In [None]:
# (skewness, kurtosis) of A1, displayed as a tuple.
(y_train['A1'].skew(), y_train['A1'].kurtosis())

#### Try transformations on the target variable

In [None]:
# log(1 + x) transform of A1, followed by the same distribution-vs-normal plot
# on the transformed values.
log_a1 = np.log1p(y_train['A1'])
sns.distplot(a=log_a1, fit=norm)

#### Other descriptive plots

In [None]:
# One 50-bin histogram per column of y_train; the trailing semicolon
# suppresses the cell's text output.
y_train.hist(
    figsize=(30, 20),
    bins=50,
);

#### Explore relationships with explanatory variables

In [None]:
# Scatter plot of A1 against a single explanatory variable.
var = 'Alpha_A1_1'
data = pd.concat([y_train['A1'], df_train[var]], axis='columns')
data.plot(kind='scatter', x=var, y='A1')

In [None]:
# Box plot of A1 grouped by the values of one explanatory variable.
var = 'Beta_A_1'
data = pd.concat([y_train['A1'], df_train[var]], axis='columns')
f, ax = plt.subplots(figsize=(16, 8))
# Draw on the axes created just above (which is also the current axes).
fig = sns.boxplot(data=data, x=var, y='A1', ax=ax)
# Rotate the x tick labels; the semicolon suppresses the cell's text output.
plt.xticks(rotation=90);

In [None]:
# Pairwise scatter plots of every feature column whose name matches '_A1'.
sns.set()
# NOTE(review): `cols` is not passed to the pairplot below, which plots all
# columns matching '_A1' - confirm whether the subset was intended.
cols = ['Alpha_A1_1', 'Alpha_A1_2', 'Alpha_A1_3', 'Alpha_A1_4', 'Alpha_A1_5', 'Alpha_A1_6', 'Alpha_A1_7', 'Alpha_A1_8', 'Alpha_A1_9', 'Alpha_A1_10']
# The per-facet size is left at seaborn's default of 2.5, the value that was
# previously passed explicitly. The keyword for it is `height` in current
# seaborn and `size` in older releases, so omitting it avoids the deprecated
# argument while staying compatible with both.
sns.pairplot(df_train.filter(regex='_A1'))
plt.show();

#### Explore intercorrelations

In [None]:
# Spearman rank correlations between every pair of '_A1' feature columns.
corrmat = (
    df_train
    .filter(regex='_A1')
    .corr(method='spearman')
)

In [None]:
# Heatmap of the k features ranked highest in corrmat's `var` column.
# The ranking comes from corrmat, while the plotted coefficients are
# recomputed with np.corrcoef on the raw feature values.
var = 'Alpha_A1_1'
k = 25 #number of variables for heatmap
f, ax = plt.subplots(figsize=(12, 10))
cols = corrmat.nlargest(n=k, columns=var).index
# rowvar=False: each column of the value matrix is one variable.
cm = np.corrcoef(df_train[cols].values, rowvar=False)
tick_labels = cols.values
sns.heatmap(
    cm,
    ax=ax,
    cmap="YlGnBu",
    linewidths=0.1,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)

In [None]:
# Clustered version of the correlation heatmap built from `cm`.
cg = sns.clustermap(cm, linewidths=0.1, cmap="YlGnBu")
# Keep the heatmap's y tick labels horizontal.
row_labels = cg.ax_heatmap.yaxis.get_majorticklabels()
plt.setp(row_labels, rotation=0)
# Last expression of the cell: the clustermap object returned above.
cg

#### Explore correlations with target variable

In [None]:
# Correlations between the '_A1' features and the target variable `tv`.
tv = 'A1'
k = 50 #number of variables to explore

# Features plus the target column in one frame, ranked by Spearman correlation.
df_tv = df_train.filter(regex='_A1').join(y_train[tv])
corrmat = df_tv.corr(method='spearman')
cols = corrmat.nlargest(n=k, columns=tv).index

# Coefficients shown in the heatmap are recomputed with np.corrcoef on the
# raw values (rowvar=False: each column is one variable).
cm = np.corrcoef(df_tv[cols].values, rowvar=False)
tick_labels = cols.values

f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    ax=ax,
    cmap="YlGnBu",
    linewidths=0.1,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)

In [None]:
# For each selected column: print its Pearson correlation table with the
# target and show a scatter plot of target vs. feature.
# Iterating over `cols` directly (instead of range(k)) avoids an IndexError
# when fewer than k columns were available to rank.
for feature in cols:
    if feature == tv:
        # `cols` is ranked from a correlation matrix that includes the target
        # itself; plotting the target against itself carries no information
        # and would produce duplicate column labels.
        continue
    temp_df = df_tv[[feature, tv]]
    print(temp_df.corr(method='pearson'))
    plt.plot(temp_df[feature], temp_df[tv], '.b')
    plt.show()