# Exploration

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

import env
import wrangle

## Acquire and Prepare Data

We will use the function we created during our acquire/prep lesson:

In [None]:
def wrangle_grades(file=None):
    '''
    Read the student_grades csv file into a pandas DataFrame,
    replace whitespace-only values with NaN, drop any rows with
    null values, convert all columns to int64, and return the
    cleaned student grades DataFrame.

    The student_id column is NOT dropped here; it is returned along
    with the exam and final grade columns.

    Parameters
    ----------
    file : str, path, or file-like object, optional
        Any source pd.read_csv accepts. When omitted, the hosted
        student_grades.csv gist is read.

    Returns
    -------
    pandas.DataFrame
        Only the complete rows, with every column cast to int64.
    '''
    # Acquire data from csv file (the hosted gist unless a source is given).
    if file is None:
        file = "https://gist.githubusercontent.com/ryanorsinger/14c8f919920e111f53c6d2c3a3af7e70/raw/07f6e8004fa171638d6d599cfbf0513f6f60b9e8/student_grades.csv"

    grades = pd.read_csv(file)

    # Replace empty / white space only values with NaN values.
    grades = grades.replace(r'^\s*$', np.nan, regex=True)

    # Drop all rows with NaN values.
    df = grades.dropna()

    # Convert all columns to int64 data types. The width is spelled out
    # because plain 'int' can map to a platform-dependent integer size.
    df = df.astype('int64')

    return df

In [None]:
# Acquire and clean the student grades data.
df = wrangle_grades()

In [None]:
# Preview the first rows of the cleaned data.
df.head()

Is `student_id` unique?

In [None]:
# Number of distinct student_id values.
df.student_id.nunique()

In [None]:
# Shape of the student_id column: a one-element tuple holding the row count.
df.student_id.shape

In [None]:
# Does each value uniquely identify a row?
# True when the number of distinct ids equals the number of rows.
df.student_id.nunique() == df.student_id.shape[0]

In [None]:
# Drop the identifier column before splitting and exploring.
df = df.drop(columns='student_id')

In [None]:
# Split into 80% train / 20% test; random_state=123 makes the split reproducible.
train, test = train_test_split(df, random_state=123, train_size=.8)

In [None]:
# Row and column counts of each split.
train.shape, test.shape

In [None]:
# Plot one histogram per column of train, side by side, to see each distribution.
n_plots = len(train.columns)

plt.figure(figsize=(16, 3))

# Start counting at 1 because subplot positions are 1-indexed.
for plot_number, col in enumerate(train.columns, start=1):

    # Create subplot.
    # plt.subplot(number of rows, number of columns, position)
    # The column count follows the data rather than a hard-coded 4, so the
    # loop keeps working if the number of columns in train changes.
    plt.subplot(1, n_plots, plot_number)

    # Title with column name.
    plt.title(col)

    # Display histogram for column.
    train[col].hist(bins=5, edgecolor='black')

    # Hide gridlines.
    plt.grid(False)

## Why Explore?

- What is the purpose of this pipeline stage?

## Main Stages in Exploration
- Hypothesize
- Visualize
    - Plot out the distributions of each feature
        - Why?
    - Plot out the interaction of two or more variables?
        - Why?
    - Plot out how subgroups compare to each-other and to the overall population?
        - Why?
    - Document takeaways
        - Why?
    - Identify features that correlate with each other
        - Why?
- Test Hypotheses

---
## Goal

Let's keep our goal from our student grades scenario in mind here.
> I'm a university professor hoping I can build a prediction model that will be able to use these exams to predict the final grade within 5 points average per student.

## Initial Questions (i.e. Hypothesize Step)
- What is the relationship between individual exam scores and final grade? Exam scores to other exam scores?
- Is there a cutoff in grade that makes sense to investigate? Passing/failing/letter grades?
---

### **Q1:**  What is the relationship between individual exam scores and final grade? Exam scores to other exam scores?

### `sns.heatmap()`

Let's look at a heatmap of the correlation coefficients for a dataset. [Here](https://towardsdatascience.com/all-about-heatmaps-bb7d97f099d7) is an article with lots of heatmap customization options.

- First, I need to calculate the correlation coefficient for each pair of variables.
- Pandas `.corr()` method allows me to quickly create a correlation matrix by computing pairwise correlation of columns. By default, `method='pearson'`.
- I can change the `.corr()` argument to `method='spearman'` if my variables are not normally distributed. Want to know more about the difference between pearson's r and spearman's rank? [This article](https://towardsdatascience.com/clearly-explained-pearson-v-s-spearman-correlation-coefficient-ada2f473b8) is short, sweet, and to the point.

In [None]:
# Create the correlation matrix for all exams.
# Spearman (rank) correlation is computed between every pair of columns in train.

exam_corr = train.corr(method='spearman')
# Display the matrix.
exam_corr

Next, I pass my correlation matrix to Seaborn's `heatmap()` along with any customization I want to perform.

In [None]:
# Pearson correlation matrix (the .corr() default). It is computed on train
# only, like the rest of the exploration, so the held-out test rows are not
# examined before modeling.
correlation_table = train.corr()

# Tip: pass vmin=0, vmax=1 to pin the color scale to the full 0-1 range.
sns.heatmap(correlation_table, cmap='Blues', annot=True)

In [None]:
# np.triu keeps the upper triangle (diagonal included) and zeroes the rest;
# used as a mask, its non-zero cells are hidden so each pair is shown once.
upper_triangle = np.triu(exam_corr)

plt.figure(figsize=(8, 6))
sns.heatmap(
    exam_corr,
    mask=upper_triangle,
    annot=True,
    cmap='Purples',
    linewidth=0.5,
)
# Explicitly set the y-axis limits of the current axes.
plt.ylim(0, 4)

plt.show()

In [None]:
# Extra styling options, unpacked into sns.heatmap as keyword arguments.
kwargs = dict(
    alpha=.9,
    linewidth=3,
    linestyle='-',
    linecolor='k',
    rasterized=False,
    edgecolor='w',
    capstyle='projecting',
)

# Hide the upper triangle (diagonal included) so each pair is shown once.
upper_triangle = np.triu(exam_corr)

plt.figure(figsize=(8, 6))
sns.heatmap(exam_corr, cmap='Purples', annot=True, mask=upper_triangle, **kwargs)

plt.show()

### Document Takeaways:

- Exam 1 seems to be the most predictive of final grade
- Everything seems to correlate with everything (multicollinearity)

### Test Hypothesis(es)

In [None]:
# scipy.stats.pearsonr returns the Pearson correlation coefficient together
# with the p-value for exam1 vs final_grade.
r, p_value = pearsonr(train.exam1, train.final_grade)
print(f'Correlation Coefficient by Pearson Test: {r}, p-value: {p_value}')

# Significance level for the test.
alpha = 0.05
verdict = (
    'We can reject the null hypothesis'
    if p_value < alpha
    else 'we cannot reject the null hypothesis'
)
print(verdict)

In [None]:
# Since my variables are not normally distributed, I should really choose Spearman instead.
# spearmanr returns the rank correlation coefficient and its p-value.

r, p_value = spearmanr(train.exam1, train.final_grade)
print(f'Correlation Coefficient by Spearman Test: {r}, p-value: {p_value}')

# set alpha : 0.05
alpha = 0.05
if p_value < alpha:
    print('We can reject the null hypothesis')
else:
    print('we cannot reject the null hypothesis')

### What other kinds of visualizations could we have made?

### `sns.relplot()`

In [None]:
# Relational plot of exam1 against final_grade (a scatter plot by default).
sns.relplot(x='exam1', y='final_grade', data=train)
plt.show()

### `sns.lmplot()`

In [None]:
# Scatter plot with a fitted regression line.
# I can really pop that line color if I want: line_kws styles the fitted line.
sns.lmplot(x='exam1', y='final_grade', data=train, line_kws={'color': 'red'})
plt.show()

### `sns.jointplot()`

In [None]:
# exam1 vs final_grade with each variable's distribution drawn in the margins;
# kind='reg' adds a regression fit to the joint plot.
sns.jointplot(x='exam1', y='final_grade', data=train, kind='reg')
plt.show()

### `sns.pairplot()`

In [None]:
# Do we have a really large (long) dataset?
# If so, try plotting a random subset of rows instead:
#     sns.pairplot(train.sample(n))   # n = some number smaller than the rows we have

In [None]:
# Draw 25 random rows from train.
train.sample(25)

In [None]:
# Pairwise plots for the columns of train; corner=True leaves out the
# upper triangle of the grid, which would only repeat the lower one.
sns.pairplot(train, corner=True)
# Save the current figure as somefile.png (relative path).
plt.savefig('somefile.png')

### Q2: Is there a cutoff in grade that makes sense to investigate? Passing/failing/letter grades?

In [None]:
# Label each exam score: above 75 is a 'good grade', anything else a 'bad grade'.
for exam in ('exam1', 'exam2', 'exam3'):
    is_good = train[exam] > 75
    train[f'{exam}_desc'] = np.where(is_good, 'good grade', 'bad grade')

In [None]:
# Display the train DataFrame.
train

In [None]:
# Distribution of final_grade, colored by the exam1 good/bad label.
sns.histplot(data=train, x='final_grade', hue='exam1_desc', palette=['C0', 'C1'])

In [None]:
# Mean final_grade within each exam1_desc group.
train.groupby('exam1_desc').final_grade.mean()

In [None]:
# Distribution of final_grade, colored by the exam2 good/bad label.
sns.histplot(data=train, x='final_grade', hue='exam2_desc', palette=['C0', 'C1'])

In [None]:
# Mean final_grade within each exam2_desc group.
train.groupby('exam2_desc').final_grade.mean()

In [None]:
# Distribution of final_grade, colored by the exam3 good/bad label.
# NOTE(review): the palette order is reversed relative to the exam1/exam2
# plots - presumably to keep each label's color consistent; confirm.
sns.histplot(data=train, x='final_grade', hue='exam3_desc', palette=['C1', 'C0'])

In [None]:
# Mean final_grade within each exam3_desc group.
train.groupby('exam3_desc').final_grade.mean()

### Takeaways
- People who made a good grade on exam1 and exam2 tend to make good final grades.
- exam3 is a little messier in its relationship with final_grade
- Given the multicollinearity present, we may want to just focus on one exam. 
- exam1 had the highest linear correlation in the correlation plot, so we can work with that one

In [None]:
# Bonus visualization - Pairplots can also accept hue categories
# Here the points are colored by the exam1 good/bad label.
sns.pairplot(train, hue='exam1_desc')

### Follow Up Question: How do people who eventually fail the class perform on exam1?

In [None]:
# Boolean flag: True when the final grade is below 70.
train['fails_class'] = train.final_grade < 70

In [None]:
# Counts of fails_class values (columns) for each exam1_desc label (rows).
pd.crosstab(train.exam1_desc, train.fails_class)

Interesting. Everyone who made a good grade on exam1 (above a 75) passed the course.

In [None]:
# Same table, but normalize='index' turns each row into proportions that sum to 1.
pd.crosstab(train.exam1_desc, train.fails_class, normalize='index')

Almost a third (32.5%) of students who get a bad grade (<=75) on the first exam end up failing the class.

We haven't even built a model yet and we already have an actionable recommendation. 