In [1]:
import numpy as np
from numpy.random import randn

import pandas as pd
from pandas import DataFrame, Series

import scipy
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

## Basic NumPy Operations

In [10]:
# Limit NumPy's printed floats to two digits after the decimal point.
np.set_printoptions(precision=2)

In [6]:
# `rand` is never imported in this notebook (only `randn` is), so the bare call
# raised NameError on a fresh kernel. Go through the `np.random` namespace instead.
uniform_draw = np.random.rand()  # one float drawn uniformly from [0, 1)
print(f'{uniform_draw:.2f}')

0.57


In [None]:
# Draw 50 samples from the standard normal distribution (numpy.random.randn).
randn(50)

array([-0.44, -0.24, -0.02, -0.57,  1.98,  0.13, -0.55,  0.05, -1.75,
        0.07,  0.17,  1.11,  0.31, -1.61, -1.85,  1.37,  0.58, -0.28,
        1.06, -0.38,  0.89,  1.49, -1.69, -0.45,  0.01,  2.07, -0.14,
        0.23, -0.28, -0.46, -0.6 , -1.73,  1.7 ,  0.31, -0.47,  0.09,
       -0.03, -0.79,  1.06, -0.58,  0.06,  0.15, -0.2 ,  0.02,  0.94,
        1.27, -0.44, -0.84,  0.51,  2.15])

In [16]:
# Integers 1 through 34; the stop value (35) is exclusive.
d = np.arange(start=1, stop=35)
d

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])

In [19]:
# Integers 1 through 19; the stop value (20) is exclusive.
c = np.arange(start=1, stop=20)
c

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [17]:
# Multiplying an array by a scalar is applied element-wise.
d*10

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300, 310, 320, 330, 340])

In [18]:
# Adding a scalar to an array is applied element-wise.
d+2

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36])

# Numerical Data

**Descriptive statistics** provide a quantitative summary of a variable and the data points that comprise it.   
They can be used to get an understanding of a variable and the attributes that it represents. There are two categories of descriptive statistics:  
- Descriptive statistics that describe the *values of the observations* in a variable (sum, median, mean, and max).
- Descriptive statistics that describe a *variable's spread* (standard deviation, variance, counts, or quartiles).

Descriptive statistics can be used to:
- detect outliers
- plan data preparation requirements for machine learning
- select features for use in machine learning. 

In [2]:
# Build a synthetic 55-row roster used by every example below.
# A dedicated, seeded generator makes the tables reproducible on
# "Restart Kernel -> Run All"; the previous unseeded draws produced different
# data (and therefore mutually inconsistent outputs) on every run.
N_PLAYERS = 55
rng = np.random.default_rng(42)

data = DataFrame({
    # 55 consecutive code points starting at 'A' (65). NOTE: the range runs past
    # 'Z', so some names are punctuation ('[', '\\', ...) and lowercase letters.
    'Name': [chr(code) for code in range(65, 65 + N_PLAYERS)],
    'Age': rng.integers(15, 35, N_PLAYERS),        # upper bound exclusive: 15..34
    'Height': rng.integers(155, 200, N_PLAYERS),   # 155..199
    'Gender': rng.choice(['Male', 'Female'], size=N_PLAYERS),
    'Team': rng.choice(['Red', 'Blue', 'Black', 'Yellow', 'Green'], size=N_PLAYERS),
    'Score': rng.integers(50, 100, N_PLAYERS),     # 50..99
})

# Experience = (Age - 15) plus a random offset in 0..4, so it is strongly
# correlated with Age by construction.
data['Experience'] = data['Age'] - 15 + rng.integers(0, 5, N_PLAYERS)

data.head()

Unnamed: 0,Name,Age,Height,Gender,Team,Score,Experience
0,A,26,168,Female,Blue,92,14
1,B,33,161,Male,Green,74,21
2,C,23,187,Male,Green,74,12
3,D,17,161,Female,Black,50,2
4,E,16,191,Female,Blue,90,1


## Summary Statistics

### Variable values

In [10]:
# Column-wise totals; numeric_only=True leaves out the string columns.
data.sum(numeric_only=True)

Age           1278
Height        9639
Score         4098
Experience     566
dtype: int64

In [11]:
# axis=1 adds up the numeric columns of each row, giving one total per row.
data.sum(axis =1 , numeric_only=True)

0     291
1     276
2     302
3     233
4     329
5     306
6     295
7     309
8     286
9     298
10    268
11    281
12    315
13    271
14    304
15    289
16    282
17    305
18    292
19    268
20    300
21    295
22    249
23    269
24    266
25    237
26    286
27    255
28    306
29    281
30    304
31    308
32    282
33    282
34    312
35    275
36    285
37    280
38    246
39    248
40    264
41    281
42    232
43    273
44    326
45    274
46    293
47    290
48    267
49    291
50    285
51    276
52    298
53    308
54    257
dtype: int64

In [14]:
# Median of each numeric column.
data.median(numeric_only=True)

Age            23.0
Height        174.0
Score          73.0
Experience     10.0
dtype: float64

In [15]:
# Arithmetic mean of each numeric column.
data.mean(numeric_only=True)

Age            23.236364
Height        175.254545
Score          74.509091
Experience     10.290909
dtype: float64

In [16]:
# Without numeric_only, max() also covers the string columns, where the
# "largest" value is the last one in lexicographic order.
data.max()

Name             w
Age             33
Height         199
Gender        Male
Score           99
Experience      21
dtype: object

In [18]:
# Row label at which each numeric column first reaches its maximum.
data.idxmax(numeric_only=True)

Age           53
Height         6
Score          4
Experience    16
dtype: int64

In [21]:
# Look the row label up from the data instead of hard-coding 53: the frame is
# randomly generated, so the label of the oldest row changes whenever the data
# is regenerated and a copied constant silently points at the wrong row.
data.loc[data['Age'].idxmax(), 'Age']

np.int64(33)

### Variable Distribution

In [23]:
# Sample standard deviation of each numeric column (pandas defaults to ddof=1).
data.std(numeric_only=True)

Age            5.915966
Height        13.174355
Score         13.853563
Experience     6.205352
dtype: float64

In [24]:
# Sample variance of each numeric column (pandas defaults to ddof=1).
data.var(numeric_only=True)

Age            34.998653
Height        173.563636
Score         191.921212
Experience     38.506397
dtype: float64

In [25]:
# Number of rows for each Gender value, most frequent first.
data['Gender'].value_counts()

Gender
Female    31
Male      24
Name: count, dtype: int64

In [22]:
# count, mean, std, min, quartiles and max for each numeric column in one table.
data.describe()

Unnamed: 0,Age,Height,Score,Experience
count,55.0,55.0,55.0,55.0
mean,23.236364,175.254545,74.509091,10.290909
std,5.915966,13.174355,13.853563,6.205352
min,15.0,155.0,50.0,0.0
25%,17.0,166.0,63.5,5.0
50%,23.0,174.0,73.0,10.0
75%,28.0,186.5,86.5,16.0
max,33.0,199.0,99.0,21.0


# Categorical Data

There are three main ways to describe categorical variables:
- Counts
- Variable description
- Grouping

In [3]:
# NOTE(review): on the full frame this counts unique *rows*; every `Name` is
# distinct, so each combination appears exactly once. Select the categorical
# columns (as in the next cell) to get meaningful counts.
data.value_counts()

Name  Age  Height  Gender  Team    Score  Experience
A     26   168     Female  Blue    92     14            1
B     33   161     Male    Green   74     21            1
C     23   187     Male    Green   74     12            1
D     17   161     Female  Black   50     2             1
E     16   191     Female  Blue    90     1             1
F     33   162     Female  Blue    96     20            1
G     20   165     Male    Green   59     6             1
H     17   169     Male    Yellow  85     4             1
I     25   186     Male    Red     94     13            1
J     16   197     Male    Blue    75     5             1
K     15   182     Male    Red     72     1             1
L     28   159     Male    Yellow  71     16            1
M     34   173     Male    Yellow  98     19            1
N     16   158     Male    Red     78     4             1
O     25   164     Female  Yellow  86     14            1
P     34   187     Male    Yellow  63     21            1
Q     29   187     

In [4]:
# Number of rows for each (Gender, Team) combination, most frequent first.
data[['Gender','Team']].value_counts()

Gender  Team  
Male    Red       10
Female  Blue       7
Male    Yellow     7
Female  Red        7
Male    Green      6
Female  Yellow     5
        Black      4
        Green      3
Male    Black      3
        Blue       3
Name: count, dtype: int64

In [5]:
# Number of rows for each Gender value.
data['Gender'].value_counts()

Gender
Male      29
Female    26
Name: count, dtype: int64

In [6]:
# Number of rows for each Team value.
data['Team'].value_counts()

Team
Red       17
Yellow    12
Blue      10
Green      9
Black      7
Name: count, dtype: int64

In [9]:
# Group the rows by team so later cells can summarise each team separately.
data_teams = data.groupby(by='Team')

In [10]:
# Per-team summary statistics (count, mean, std, min, quartiles, max) for each
# numeric column.
data_teams.describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Height,Height,...,Score,Score,Experience,Experience,Experience,Experience,Experience,Experience,Experience,Experience
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Black,7.0,22.142857,5.669467,15.0,17.5,23.0,26.5,29.0,7.0,177.714286,...,75.0,86.0,7.0,9.142857,7.12808,0.0,4.0,9.0,14.5,18.0
Blue,10.0,24.5,6.059886,16.0,19.5,25.5,28.5,33.0,10.0,172.9,...,92.75,97.0,10.0,11.5,6.276057,1.0,6.75,12.0,16.5,20.0
Green,9.0,26.222222,6.41829,18.0,20.0,26.0,33.0,33.0,9.0,177.0,...,89.0,99.0,9.0,13.222222,6.359595,5.0,6.0,13.0,19.0,21.0
Red,17.0,24.176471,5.626225,15.0,20.0,25.0,28.0,34.0,17.0,177.352941,...,81.0,99.0,17.0,11.176471,5.801242,1.0,6.0,13.0,15.0,20.0
Yellow,12.0,24.833333,6.464354,17.0,18.5,26.0,29.25,34.0,12.0,175.666667,...,94.5,99.0,12.0,12.166667,6.436167,3.0,6.75,13.5,17.5,21.0


In [13]:
# Contingency table: number of rows for each (Team, Gender) combination.
pd.crosstab(data['Team'], data['Gender'])

Gender,Female,Male
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Black,4,3
Blue,7,3
Green,3,6
Red,7,10
Yellow,5,7


In [15]:
# Contingency table: number of rows for each (Team, Age) combination.
pd.crosstab(data['Team'], data['Age'])

Age,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Black,1,0,1,1,0,0,0,1,1,0,0,0,0,2,0,0,0,0
Blue,0,2,0,1,0,0,0,0,1,1,1,1,0,1,0,1,1,0
Green,0,0,0,1,1,1,0,1,0,0,1,0,0,0,0,1,3,0
Red,1,1,1,0,0,3,1,0,1,1,1,1,2,1,1,1,0,1
Yellow,0,0,3,0,1,0,1,0,0,1,0,1,1,1,1,0,0,2


In [16]:
# Contingency table: number of rows for each (Gender, Age) combination.
pd.crosstab(data['Gender'], data['Age'])

Age,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,33,34
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Female,0,1,2,2,0,2,0,0,3,1,3,3,1,2,1,1,3,1
Male,2,2,3,1,2,2,2,2,0,2,0,0,2,3,1,2,1,2


In [54]:
# Five copies of each label 1..5 in one flat 25-element integer array,
# then shuffled in place using NumPy's global random state.
aa = np.repeat(np.arange(1, 6), 5)
np.random.shuffle(aa)

# Cast to strings so the values are counted as category labels.
pd.Series(data=aa, dtype=str).value_counts()

2    5
4    5
5    5
1    5
3    5
Name: count, dtype: int64

Pearson correlation analysis
Selecting transcript lines in this section will navigate to timestamp in the video
- [Instructor] Let's talk about parametric correlation analysis. Parametric correlation analysis is a method you can use to find correlation between linearly related continuous numeric variables. Don't worry if you don't exactly understand what that means because I'm going to show you how to figure this out in just a minute. First, I want to explain one important point about correlation. Correlation does not imply causation. Let me explain. Imagine you're a doctor studying regional obesity trends. You have two data sets: One on store size reported by zip code, and two on national obesity prevalence broken down by zip codes. In the course of your investigation, you apply the Pearson correlation method, that's the method I'm about to show you, and you find that there's a very strong positive correlation between grocery store size and obesity. The bigger the grocery stores, the more obesity there tends to be. Of course, the size of the store doesn't cause obesity, but they're correlated, and that correlation is quantifiable through the Pearson Method. Pearson correlation is measured by the correlation coefficient, R. If you have a Pearson R that's close to one, then that's a strong positive relationship. And if you had an R value that is close to negative one, then you've got a strong negative relationship. If you have an R value equal to zero, or close to it, then you're basically seeing that your variables are not linearly correlated. Now, the Pearson correlation assumes that your data is normally distributed, that you have continuous numeric variables, and that your variables are linearly related. A really important note that I wanted to add here is how do you use the Pearson correlation? So it's safe to use Pearson correlation to uncover linear relationships between variables, but you can not use it to rule out the possibility of non-linear relationships between variables. 
For this demonstration, we're going to be bringing in our standard libraries, pandas and numpy, but also please note that I've imported matplotlib and seaborn, as well as the rcParams. We're also going to be using scipy in this demonstration. So all of these are already loaded in our notebook, and I've also preloaded the mtcars data set that we've been working with, and set the plotting parameters for matplotlib. We covered all of these things in previous lectures, but the one thing I want to point out here is that we are importing Pearson R from the scipy stats package. So you have to first start off just by running these. And like I said, mtcars is ready to go. So all we need to do is run this to load it into our environment. Cool. So the column names are all set, and let's just go ahead and start by generating a pairplot using the seaborn library. To do that, we'll use the pairplot function, so that's sns.pairplot, and we'll pass in cars data frame and run this. Move it up a bit. It's just thinking for a little while. You can tell what Python's doing just by seeing this moving blue scroll at the top, and also this icon here saying how long it's taken for Python to process the request. Okay, wow. So we have a lot of data here that's been plotted out for us. And as you can see, if you were to count them up, we actually have 11 numeric variables in the cars data set. This basically takes up a lot of space. I went ahead and selected some variables for our analysis, and I'll go ahead and generate a scatter plot matrix of those in order to show you what about them is desirable for the Pearson correlation. And I'm going to take you over into another screen to explain really quickly. But before I do that, let's just make this second scatter plot. So we'll call it X, and we'll set X equal to our cars data frame, but we only want to select four columns, which are mpg, hp, qsec, and wt. Add the single quotes here. 
And then, again, we use the pairplot function, so that's sns.pairplot, pass in X, and run this. Okay, so that was a lot faster, and here we have a smaller pairplot. Now let me take you over to the other screen to explain what all this means. So let's consider the model assumptions for the Pearson correlation analysis. Pearson correlation assumes that your data is normally distributed, that variables are linearly related, and that the variables are continuous numeric variables. Let's look here at the normally distributed requirement. A normally distributed requirement is going to give a shape like a bell curve in a histogram. I wouldn't say that all these variables are exactly normally distributed, but they could possibly be close enough in order to generate some sort of correlation using the Pearson correlation method, so I'm going to go with these. Now, let's look at the requirement for a linear relationship. Do these variables have a linear relationship between them? In other words, does one increase while the other decreases? Based on the shape of the distribution of points between the variables, it looks like most of these have a distribution that could be at least close to linear, so I'm going to test them out with the Pearson correlation method. The last requirement is that the variables be continuous numeric variables. The best way for me to show you why I think that these are continuous numeric variables is to show you what a variable looks like when it's not a continuous numeric variable. If you look over at the scatterplot on the right, these variables over here are not continuous numeric variables. These are categorical variables because they can only assume a fixed number of positions, like we just discussed in the last section. So this variable can assume one of two values. That makes it a binomial variable. In the gear variable, it can assume three values; three, four, or five. That makes it a multinomial variable. 
These are not continuous numerical variables. When you see continuous numerical variables, the scatterplot of the variables is much more randomly and evenly distributed. The end conclusion here is that the variables that are shown on the right would not qualify for the Pearson R correlation analysis. Okay, great. So let's get back to our coding demonstration and use scipy to calculate Pearson correlation coefficients. Now, let's look at how to use scipy to calculate the Pearson correlation coefficient. Okay, so let's start by creating some variables we can use here. So we'll create an mpg variable, and we'll set that equal to cars.mpg. And then let's create an hp variable that's equal to cars hp. We'll create a qsec variable and we'll set that equal to cars qsec. And then, a wt variable, which will be directly from our cars data frame, the weight variable here. Okay. So let's start first by taking the Pearson R coefficient of the mpg and hp variable pair. So to do that, we're going to say pearsonr_coefficient and P value. We're going to set these equal to the Pearson R function, and we'll pass in our mpg and our hp. And then, let's print out the label. So we'll say print, and let our label be Pearson R correlation coefficient. Okay. And then, say %0.3f. And then, give another percentage sign, create a tuple here, and pass in our Pearson R coefficient object. Okay, I'm going to look this over really quick for any typos. Okay, yeah, so one issue is that I needed to close out the string here after the F, so I'm going to add a single quote and remove the single quote from there, and then we should be good to go. So let's run this. And then, what I'm going to do to calculate the Pearson R for the other variable pairs is just to copy this little chunk of code and paste it down here, and just change the variables out. Once we have the Pearson R coefficients, then we will discuss. So the second variable pair is going to be mpg and qsec. 
And the third variable pair will be mpg and weight. So let me run this. And this. Okay, great. So now we have our Pearson R values. Let's just look at what this means. Based on the Pearson correlation coefficient of these three variable pairs, the mpg weight variable pair appears to have the strongest linear correlation. The mpg qsec variable pair has a moderate degree of linear correlation. And you may be wondering, "Well, what do I do "with this information once I have it?" When you're doing machine learning, or other forms of advanced statistical analysis, these models often have assumptions that either the features are independent of one another or that they exhibit a degree of correlation, and you're going to see that later in this course. So you can use the Pearson R correlation coefficient to establish whether or not your variable pairs meet the requirements of more advanced models. Now, that you've seen the long form way of calculating the Pearson R value, let me show you some shortcuts. We will start by using pandas to calculate Pearson R correlation coefficient. So let me just notate that here. You can also generate some Pearson R statistics by using this corr method, so let's do it really quick. Using pandas, you can also generate Pearson R statistics by using the corr method. So let's do that real quick. We'll say that corr here is equal to x.corr, called the corr method. And then we'll print this out. And as you can see, it's really quickly generated all of the Pearson R values for each of the variable pairs in our smaller subset. The last way you can do this is using seaborn, and that would be with its heatmap function. So we'll just say sns.heatmap, and then we'll pass in our corr variable, and then we'll create some tick labels. So our xticklabels will be equal to the column values in our corr data frame. So we're going to say corr.columns.values, and then our yticklabels will be equal to the columns in the corr data frame. 
So corr.columns.values, and then we run this. That looks nice, but what does it mean? Well, the darker shades of red indicate a strong degree of positive correlation, as you can see from the legend. Based on what we see, the hp weight variable pair has the highest degree of positive linear correlation. Judging by the darker hues in the grid, the mpg weight variable pair appears to have the strongest degree of negative linear correlation. You'll of course see here that when mpg is plotted against itself, then it has an absolute value of one. It correlates 100% with itself, that's why these are solid cream colors here. And then the sort of fuchsia color here, the weight qsec variable pair is not linearly correlated. Keep in mind, that doesn't mean there's no correlation between these variables whatsoever. In the next video, I'm going to show you some methods you can use to establish correlation between non-linearly related variables.