# Correlation analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The correlation coefficient measures the strength of the **linear** relationship between two variables. 

It is bounded between -1 and +1. Values close to -1 or +1 indicate a strong linear relationship, while a correlation coefficient close to zero indicates little or no *linear* relationship between the two variables.

In [None]:
# Sample data: six evenly spaced x values (5, 15, ..., 55) and one
# observed y value for each of them.
y = [38, 22, 32, 14, 20, 5]
x = np.arange(5, 60, step=10)

In [None]:
# Scatter plot of the sample points so the relationship can be inspected visually.
fig, ax = plt.subplots()

ax.set_ylabel('y', fontsize=16)
ax.set_xlabel('x', fontsize=16)
ax.scatter(x, y)

plt.show()

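The function `corrcoef` from numpy calculates the correlation coefficient between two sequences of values. It returns a 2×2 correlation matrix: the diagonal entries are 1 (each variable with itself) and the two off-diagonal entries hold the correlation between the two sequences.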

(Notice that the correlation between X and Y is the same as the correlation between Y and X).

In [None]:
# Correlation matrix of x and y; the off-diagonal entries are the coefficient.
xy_corr = np.corrcoef(x, y)
print(xy_corr)

The `corr` method of a pandas DataFrame calculates the correlation coefficient between every pair of numeric columns in the DataFrame.

In [None]:
# Same sample data as before, now held as named columns of a DataFrame.
values = dict(
    Price=np.arange(5, 60, 10),
    Demand=[38, 22, 32, 14, 20, 5],
)
df = pd.DataFrame(data=values)

# Display the resulting table.
df

In [None]:
# Scatter plot of Demand against Price, read from the DataFrame columns.
fig, ax = plt.subplots()

ax.set_ylabel('Demand', fontsize=16)
ax.set_xlabel('Price', fontsize=16)
ax.scatter(df['Price'], df['Demand'])

plt.show()

In [None]:
# Pairwise Pearson correlation matrix of the DataFrame's columns.
df.corr(method='pearson')

In the last class exercise, we saw that there seemed to be a relationship between fuel economy (mpg) and car attributes such as horsepower and weight.

In [None]:
# Load the fuel-economy data from the CSV file next to the notebook.
mpg = pd.read_csv(filepath_or_buffer='mpg.csv')

# Preview the first five rows.
mpg.head(n=5)

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(13, 3))

# One panel per car attribute, each plotted against fuel economy (mpg).
for panel, attribute in zip(ax, ['horsepower', 'weight', 'acceleration']):
    panel.scatter(mpg[attribute], mpg['mpg'])
    panel.set_xlabel(attribute)

# Every panel has mpg on its y-axis; only the leftmost one is labelled.
ax[0].set_ylabel('mpg')

plt.show()

In [None]:
# Pairwise correlation matrix of the numeric columns only.
# Since pandas 2.0, DataFrame.corr() defaults to numeric_only=False and raises
# if the frame contains any non-numeric column, so the restriction is made
# explicit here.
# NOTE(review): mpg.csv is not visible from this notebook; it presumably has
# text columns (e.g. a car name) — confirm. With all-numeric data the result
# is unchanged.
mpg.corr(numeric_only=True)

We can get specific correlations by using the `.loc[]` indexer (row label first, then column label):

In [None]:
# Compute the correlation matrix once and reuse it, rather than recomputing
# the whole matrix for every lookup. numeric_only=True keeps the call working
# on pandas >= 2.0 when the frame contains non-numeric columns.
mpg_corr = mpg.corr(numeric_only=True)

# Correlation of fuel economy (row 'mpg') with each attribute (column).
corr_horsepower = mpg_corr.loc['mpg', 'horsepower']
corr_weight = mpg_corr.loc['mpg', 'weight']
corr_acceleration = mpg_corr.loc['mpg', 'acceleration']

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(13, 3))

# Each panel pairs a car attribute with its correlation against mpg.
panels = [
    ('horsepower', corr_horsepower),
    ('weight', corr_weight),
    ('acceleration', corr_acceleration),
]

for axis, (attribute, r) in zip(ax, panels):
    axis.scatter(mpg[attribute], mpg['mpg'])
    axis.set_xlabel(attribute)
    # The title shows the coefficient rounded to two decimals.
    axis.set_title(f'r = {round(r, 2)}')

# Every panel has mpg on its y-axis; only the leftmost one is labelled.
ax[0].set_ylabel('mpg')

plt.show()