In [2]:
import numpy as np
from numpy.random import randn

import pandas as pd
from pandas import DataFrame, Series

import scipy
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

## Basic Nmupy Operations

In [10]:
np.set_printoptions(precision=2) #2 decimal places

In [6]:
print(f'{rand():.2f}')

0.57


In [None]:
randn(50)

array([-0.44, -0.24, -0.02, -0.57,  1.98,  0.13, -0.55,  0.05, -1.75,
        0.07,  0.17,  1.11,  0.31, -1.61, -1.85,  1.37,  0.58, -0.28,
        1.06, -0.38,  0.89,  1.49, -1.69, -0.45,  0.01,  2.07, -0.14,
        0.23, -0.28, -0.46, -0.6 , -1.73,  1.7 ,  0.31, -0.47,  0.09,
       -0.03, -0.79,  1.06, -0.58,  0.06,  0.15, -0.2 ,  0.02,  0.94,
        1.27, -0.44, -0.84,  0.51,  2.15])

In [16]:
d = np.arange(1,35)
d

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])

In [19]:
c = np.arange(1,20)
c

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [17]:
d*10

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260,
       270, 280, 290, 300, 310, 320, 330, 340])

In [18]:
d+2

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36])

# Numerical Data

**Descriptive statistics** provide a quantitative summary of a variable and the data points that comprise it.   
It can be used to get an understanding of a variable and the attributes that it represents. There are two categories of descriptive statistics.  
- Descriptive statistics that describe the *values of an observation* in a variable.[sum, median, mean, and max]
- Descriptive statistics that describe a *variable's spread*.  [standard deviation, variance, counts or quartiles]

Descriptive statistics can be used to:
- detect outliers
- plan data preparation requirements for machine learning
- select features for use in machine learning. 

## Summary Statistics

In [27]:
data = DataFrame({
    'Name':[chr(i) for i in range(65, 120)],
    'Age': np.random.randint(15,35,55),
    'Height': np.random.randint(155,200,55),
    'Gender': [['Male', 'Female'][np.random.randint(2)] for i in range(55)],
    'Team': [['Red', 'Blue', 'Black', 'Yellow', 'Green'][np.random.randint(5)] for i in range(55)],
    'Score': np.random.randint(50,100,55)
})

data['Experience'] = data['Age'] - 15 + np.random.randint(0,5,55)

data.head()

Unnamed: 0,Name,Age,Height,Gender,Team,Score,Experience
0,A,15,160,Female,Red,63,2
1,B,17,194,Female,Black,63,4
2,C,26,175,Male,Yellow,80,13
3,D,16,156,Male,Red,97,3
4,E,28,184,Male,Red,73,16


### Variable values

In [10]:
data.sum(numeric_only=True)

Age           1278
Height        9639
Score         4098
Experience     566
dtype: int64

In [11]:
data.sum(axis =1 , numeric_only=True)

0     291
1     276
2     302
3     233
4     329
5     306
6     295
7     309
8     286
9     298
10    268
11    281
12    315
13    271
14    304
15    289
16    282
17    305
18    292
19    268
20    300
21    295
22    249
23    269
24    266
25    237
26    286
27    255
28    306
29    281
30    304
31    308
32    282
33    282
34    312
35    275
36    285
37    280
38    246
39    248
40    264
41    281
42    232
43    273
44    326
45    274
46    293
47    290
48    267
49    291
50    285
51    276
52    298
53    308
54    257
dtype: int64

In [14]:
data.median(numeric_only=True)

Age            23.0
Height        174.0
Score          73.0
Experience     10.0
dtype: float64

In [15]:
data.mean(numeric_only=True)

Age            23.236364
Height        175.254545
Score          74.509091
Experience     10.290909
dtype: float64

In [16]:
data.max()

Name             w
Age             33
Height         199
Gender        Male
Score           99
Experience      21
dtype: object

In [18]:
data.idxmax(numeric_only=True)

Age           53
Height         6
Score          4
Experience    16
dtype: int64

In [21]:
data.loc[53,'Age']

np.int64(33)

### Variable Distribution

In [23]:
data.std(numeric_only=True)

Age            5.915966
Height        13.174355
Score         13.853563
Experience     6.205352
dtype: float64

In [24]:
data.var(numeric_only=True)

Age            34.998653
Height        173.563636
Score         191.921212
Experience     38.506397
dtype: float64

In [25]:
data['Gender'].value_counts()

Gender
Female    31
Male      24
Name: count, dtype: int64

In [22]:
data.describe()

Unnamed: 0,Age,Height,Score,Experience
count,55.0,55.0,55.0,55.0
mean,23.236364,175.254545,74.509091,10.290909
std,5.915966,13.174355,13.853563,6.205352
min,15.0,155.0,50.0,0.0
25%,17.0,166.0,63.5,5.0
50%,23.0,174.0,73.0,10.0
75%,28.0,186.5,86.5,16.0
max,33.0,199.0,99.0,21.0


# Categorical Data

There are three main ways to describe categorical variables:
- Counts
- Variable description
- Grouping