In [1]:
import os
from pydataset import data
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math as m
from scipy import stats
from env import gdb
from env import Percent
from env import output_chi2_contingency
from sklearn.model_selection import train_test_split
import acquire
import prepare
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
import plotly.express as px

# Acquire and Prep - Wrangle

In the following lessons, we will walk through the data science pipeline using the following scenario:

I'm a university teacher, and I want to know when to worry about a student's progress.  I want to be able to work with any students who are at high risk of failing the class, so that I can try to prevent that from happening.  I have the grades of the three exams and the final grade from last semester's class.  I'm hoping I can build a prediction model that will be able to use these exams to predict the final grade within 5 points average per student.

In [None]:
# import warnings
# warnings.filterwarnings('ignore')

## Acquire the Data

Let's use pandas to read our csv into a pandas DataFrame.

In [2]:
# Read csv file into pandas DataFrame.

df = pd.read_csv('student_grades.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   student_id   104 non-null    int64  
 1   exam1        103 non-null    float64
 2   exam2        104 non-null    int64  
 3   exam3        104 non-null    object 
 4   final_grade  104 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.2+ KB


### Sample and Summarize

Let's take a look at the DataFrame we brought in and document our initial findings.

In [3]:
df.exam1.value_counts(dropna=False)

70.0     16
100.0     8
98.0      8
85.0      8
83.0      8
93.0      8
79.0      8
92.0      8
73.0      8
58.0      8
57.0      8
62.0      7
NaN       1
Name: exam1, dtype: int64

In [5]:
df

Unnamed: 0,student_id,exam1,exam2,exam3,final_grade
0,1,100.0,90,95.0,96
1,2,98.0,93,96.0,95
2,3,85.0,83,87.0,87
3,4,83.0,80,86.0,85
4,5,93.0,90,96.0,97
5,6,79.0,70,85.0,81
6,7,92.0,89,94.0,93
7,8,73.0,70,75.0,76
8,9,70.0,65,78.0,77
9,10,,70,79.0,70


In [6]:
# 104 rows and 5 columns coming in.

df.size, df.shape

(520, (104, 5))

In [7]:
# Display readable summary statistics for numeric columns. Why isn't exam3 showing up?

df.describe()

Unnamed: 0,student_id,exam1,exam2,final_grade
count,104.0,103.0,104.0,104.0
mean,52.5,78.621359,77.307692,81.692308
std,30.166206,14.260955,10.295703,10.918122
min,1.0,57.0,65.0,65.0
25%,26.75,70.0,70.0,72.0
50%,52.5,79.0,75.0,81.0
75%,78.25,92.0,89.0,93.0
max,104.0,100.0,93.0,97.0


In [8]:
# Running .info() shows us that the exam3 column is not a numeric data type; it's an object.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   student_id   104 non-null    int64  
 1   exam1        103 non-null    float64
 2   exam2        104 non-null    int64  
 3   exam3        104 non-null    object 
 4   final_grade  104 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.2+ KB


#### Acquire and Summarize Takeaways

- Missing value in exam1.
- Exam data types are reading in differently. Why?
- Take a look at the exam3 column with the object data type.
- Only seem to be missing a small amount of data; we might just drop those observations.
- Why is exam1 reading in as a float; do we need that?
- Would we want to do anything with student_id?

___

## Prepare the Data

### Finding Null Values

Let's check out some other ways to find Null values when you are dealing with a larger dataframe, especially one with more attributes and more missing values.

- np.nan values have a float data type. When a column you expect to have an integer data type reads in as a float, this may be signaling that there is one or more Null values present.

#### `.isnull().sum()`

In [10]:
# Find the total number of Null values in each column of our DataFrame.

for i in df:
    print(df[i].isnull())

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
30     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50     False
51     False
52     False
53     False
54     False
55     False
56     False
57     False
58     False
59     False
60     False
61     False
62     False
63     False
64     False
65     False
66     False
67     False
68     False
69     False
70     False
71     False
72     False
73     False
74     False
75     False
76     False

In [13]:
df.isnull().sum(axis=1)

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      1
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     0
43     0
44     0
45     0
46     0
47     0
48     0
49     0
50     0
51     0
52     0
53     0
54     0
55     0
56     0
57     0
58     0
59     0
60     0
61     0
62     0
63     0
64     0
65     0
66     0
67     0
68     0
69     0
70     0
71     0
72     0
73     0
74     0
75     0
76     0
77     0
78     0
79     0
80     0
81     0
82     0
83     0
84     0
85     0
86     0
87     0
88     0
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     0
100    0
101    0
102    0
103    0
dtype: int64

In [14]:
df.isnull().sum(axis=0)

student_id     0
exam1          1
exam2          0
exam3          0
final_grade    0
dtype: int64

#### `.isnull().any()`

In [15]:
# Check for any Null values in each column of our DataFrame.

df.isnull().any()

student_id     False
exam1           True
exam2          False
exam3          False
final_grade    False
dtype: bool

In [18]:
# Return the names for any columns in our DataFrame with any Null values.

df.columns[df.isnull().any()]

Index(['exam1'], dtype='object')

### Finding Odd Values

Let's find the odd value in `exam3` that is causing this numeric column to be coerced into an object data type.

In [21]:
# Check out the values and their frequencies from exam3 column.

df.exam3.describe()

count     104
unique     11
top        96
freq       16
Name: exam3, dtype: object

In [28]:
df.exam3.value_counts(ascending=True)

       1
95     8
87     8
86     8
85     8
94     8
79     8
70     8
75    15
96    16
78    16
Name: exam3, dtype: int64

In [None]:
# regex mini crash course
# ^  == Starting with
# $  == Ending with
# \s == any whitespace
# *  == zero or more times

In [29]:
# Replace a whitespace sequence or empty with a NaN 
# value and reassign this manipulation to df.

df = df.replace(r'^/s*$', np.nan, regex=True)

Verify that our empty string has been replaced by a null

In [30]:
# Now .info() shows us that exam3 has a Null value instead of a whitespace disguised as a non-null value.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   student_id   104 non-null    int64  
 1   exam1        103 non-null    float64
 2   exam2        104 non-null    int64  
 3   exam3        104 non-null    object 
 4   final_grade  104 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.2+ KB


### Drop Null Values

Let's drop observations that have any Null values; in this case, we have so few that we can simply drop rows instead of imputing values to save observations.

In [None]:
# Drop all rows with any Null values, assign to df, and verify.



### Convert Data Types

Let's convert any data types we need to at this point.

In [None]:
# Change all column data tyes to int64, reassign to df, and verify.



You may want to fill your missing values with a value instead of dropping the rows. One way to do this is to apply the `.fillna()` method to your dataframe. 
```python
# Default arguments for value and method parameters.

df.fillna(value=None, method=None)
```

When running `.describe()`, we should now see `exam3` listed since we have converted it to a numeric type.

___

## Visualize Distributions

We can plot histograms and/or boxplots to see the distributions of single variables and check for skewness, outliers, and unit scales. *Note, we don't have to split our data before exploring single variables. We DO have to split our data before performing bi- and multi-variate exploration.*

#### `sns.displot()`

We can use Seaborn's `displot` to display the binned values from a column.

In [None]:
# The default is bins=10.



#### `plt.subplot()` & `.hist()`

Here we'll loop through each of the numeric columns of interest and show the distribution of each on a separate subplot.

In [None]:
plt.figure(figsize=(16, 3))

# List of columns
cols = ['exam1', 'exam2', 'exam3', 'final_grade']

    
    # i starts at 0, but plot nos should start at 1
    
    # Create subplot.
    
    # Title with column name.
    
    # Display histogram for column.
    
    # Hide gridlines.


#### `sns.boxplot()`

Seaborn's `.boxplot` will default to plotting *all* the numeric variables if we don't specify specific x and y values. 

In [None]:
# We don't want to plot the `student_id` column.

plt.figure(figsize=(8,4))

# Create boxplots for all but student_id.


#### Distribution Takeaways

- Exam2 doesn't look to be super helpful in predicting final_grade.
- Exam3 has the highest median and Exam2 has the lowest median score.
- Exam1 has the largest range in scores.
- All numeric columns are bimodal distributions, no normal.

___

## Pipeline Function

We finalize these data wrangling steps (acquire and prepare) by writing a function that will reproduce the DataFrame with the necessary changes.

In [None]:
def wrangle_grades():
    '''
    Read student_grades csv file into a pandas DataFrame,
    drop student_id column, replace whitespaces with NaN values,
    drop any rows with Null values, convert all columns to int64,
    return cleaned student grades DataFrame.
    '''
    # Acquire data from csv file.
    
    # Replace white space values with NaN values.
    
    # Drop all rows with NaN values.
    
    # Convert all columns to int64 data types.
    


In [None]:
# Let's test out or wrangle function from above.



___