# Visual & Statistical Analytics In-class Exercise

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import ttest_ind

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

The file n90pol.csv contains information on 90 university students who participated in a psychological experiment designed to look for relationships between the size of different regions of the brain and political views. The variables amygdala and acc indicate the volume of two particular brain regions known to be involved in emotions and decision-making, the amygdala and the anterior cingulate cortex; more exactly, these are residuals from the predicted volume, after adjusting for height, sex, and similar body-type variables. The variable orientation gives the students' political orientation on a five-point scale from 1 (very conservative) to 5 (very liberal).

In [None]:
# Run this before any other code cell
# Fetches the csv data files into the directory this notebook is saved in

import urllib.request
from pathlib import Path
import os
path = Path()

# Map each local file name to the URL it is downloaded from
files = {'n90pol.csv':'https://storage.googleapis.com/aipi_datasets/n90pol.csv'}

# Walk the mapping and fetch only the files that are missing locally
for key in files:
    url = files[key]
    filename = path/key
    # Already on disk: nothing to do for this entry
    if os.path.exists(filename):
        continue
    urllib.request.urlretrieve(url,filename)

In [None]:
# Read the downloaded csv into a DataFrame and preview its first five rows
student_data = pd.read_csv(filepath_or_buffer='n90pol.csv')
student_data.head(n=5)

### Explore features (amygdala and acc)

In [None]:
# Histograms of amygdala and acc: one row of panels per bin count (5, then 20),
# one column per feature, so the effect of bin width can be compared directly
fig,ax = plt.subplots(2,2,figsize=(15,10))
for row,n_bins in enumerate([5,20]):
    for column,feature in enumerate(['amygdala','acc']):
        panel = ax[row,column]
        panel.hist(student_data[feature],bins=n_bins)
        panel.set_title('{}, bins={}'.format(feature,n_bins))
plt.show()

In [None]:
# View KDE plot of amygdala and acc, one panel per feature.
# The bandwidth is left to seaborn's automatic choice; the deprecated `bw`
# keyword is intentionally not passed (use `bw_method` / `bw_adjust` to tune it).
fig,ax = plt.subplots(1,2,figsize=(15,6))
for i,col in enumerate(['amygdala','acc']):
    sns.kdeplot(student_data[col],ax=ax[i])
    ax[i].set_title(col)
plt.show()

In [None]:
# Joint view of the two features with seaborn: a scatterplot of acc against
# amygdala, with each variable's histogram drawn on the margins
joint_kwargs = dict(data=student_data, x='amygdala', y='acc', kind='scatter')
sns.jointplot(**joint_kwargs)
plt.show()

### Explore response (orientation)

In [None]:
# Count the students at each orientation value, order the counts by
# orientation, and draw them as a bar chart
orientation_counts = student_data['orientation'].value_counts()
orientation_counts = orientation_counts.sort_index()
orientation_counts.plot(kind='bar')
plt.show()

### Explore relationship between features and response

In [None]:
# Plot the conditional distribution of each feature (amygdala and acc)
# conditioned on orientation, one panel per feature
# Use the auto-set bandwidth
fig,ax = plt.subplots(1,2,figsize=(15,6))
# Take the orientation levels from the data rather than hard-coding 1-5, so a
# level with no students is simply not drawn
orientation_levels = sorted(student_data['orientation'].dropna().unique())
for i,col in enumerate(['amygdala','acc']):
    for level in orientation_levels:
        # Attach the label to the curve itself; a positional list of legend
        # labels silently mislabels curves when the number of drawn curves
        # differs from the number of labels
        sns.kdeplot(student_data.loc[student_data.orientation==level,col],
                    label='Orientation {}'.format(level),ax=ax[i])
    ax[i].legend()
    ax[i].set_title('pdf of {}'.format(col))
plt.show()

## Part 1
Create boxplots of amygdala and acc, grouped by orientation.  Observe the difference in distributions of amygdala and acc for each orientation.

In [None]:
### BEGIN SOLUTION ###


### END SOLUTION ###

## Part 2
For each of amygdala and acc, perform a one-way ANOVA to test whether the mean differs between any of the orientation groups, using a significance level of alpha = 0.05.

In [None]:
### BEGIN SOLUTION ###

### END SOLUTION ###

## Part 3
Let's now determine whether there is a statistically significant difference in the mean amygdala and acc values between orientations 2 and 5.  Perform a two-sample t-test for each variable to determine whether the difference in means is significant at alpha = 0.05.

In [None]:
### BEGIN SOLUTION ###

### END SOLUTION ###