## Coding Exercise #0207

### 1. Statistics with Pandas:

In [None]:
import pandas as pd
import numpy as np
import os

#### 1.1. Read data into a DataFrame:

In [None]:
# Download the iris CSV into the working directory; --no-clobber skips the download if the file already exists.
!wget --no-clobber https://raw.githubusercontent.com/tn-220/SIC-Machine-Learning/main/SIC_ML_Coding_Exercises/SIC_ML_Chapter_03_Coding_Exercises/data_iris.csv

In [None]:
# Load the CSV into a DataFrame; header='infer' (the default) takes the column names from the first line.
df = pd.read_csv('data_iris.csv', header='infer')

In [None]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

In [None]:
# Print a summary: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Show the first 3 rows.
df.head(3)

In [None]:
# Show the last 3 rows.
df.tail(3)

#### 1.2. Statistics using DataFrame methods:

In [None]:
# Column averages.
# 'Species' holds strings, so it is dropped first: pandas >= 2.0 no longer
# skips non-numeric columns silently and raises a TypeError instead.
df.drop(columns='Species').mean(axis=0)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Correlation coefficient between sepal length and sepal width.
df['Sepal.Length'].corr(df['Sepal.Width'])

In [None]:
# Correlation coefficient between sepal length and petal length.
df['Sepal.Length'].corr(df['Petal.Length'])

In [None]:
# Correlation matrix of the numeric columns, rounded to 3 decimals.
# The string column 'Species' is dropped first because pandas >= 2.0 no longer
# ignores non-numeric columns in DataFrame.corr() and raises an error instead.
df.drop(columns='Species').corr().round(3)

In [None]:
# Correlation of every numeric column with respect to one variable ('Petal.Length').
# 'Species' is dropped first: pandas >= 2.0 no longer drops non-numeric
# columns automatically, so the string column would break the computation.
df.drop(columns='Species').corrwith(df.loc[:, 'Petal.Length'])

#### 1.3. Slice and then apply a statistical method:

In [None]:
# Column averages for the 'setosa' rows only.
# The string column 'Species' is dropped before averaging because pandas >= 2.0
# raises a TypeError on non-numeric columns instead of skipping them.
df[df.Species == 'setosa'].drop(columns='Species').mean(axis=0)

In [None]:
# Column averages for the 'virginica' rows only.
# The string column 'Species' is dropped before averaging because pandas >= 2.0
# raises a TypeError on non-numeric columns instead of skipping them.
df[df.Species == 'virginica'].drop(columns='Species').mean(axis=0)

In [None]:
# Column averages for the 'versicolor' rows only.
# The string column 'Species' is dropped before averaging because pandas >= 2.0
# raises a TypeError on non-numeric columns instead of skipping them.
df[df.Species == 'versicolor'].drop(columns='Species').mean(axis=0)

Frequency table:

In [None]:
# Number of rows for each distinct value of 'Species'.
df.Species.value_counts()

### 2. Missing values:

First, we deliberately introduce a few missing values:

In [None]:
# Overwrite a handful of cells, given as (row position, column position), with NaN.
for row_pos, col_pos in [(10, 0), (2, 2), (17, 2), (2, 3), (129, 2)]:
    df.iloc[row_pos, col_pos] = np.nan

In [None]:
# Show the first 3 rows; the row at position 2 now has NaN at column positions 2 and 3.
df.head(3)

#### 2.1. Statistics of the missing values:

In [None]:
# Count the non-missing values in each column (axis=0 counts down the rows).
df.count(axis=0)

In [None]:
# Calculate the column averages; missing values are skipped because skipna=True is the default.
df.drop(columns='Species').mean(axis=0)                                 # 'Species' column is left out.

In [None]:
# Calculate the averages without skipping the missing values:
# every column that contains a NaN gets NaN as its average.
df.drop(columns='Species').mean(axis=0, skipna=False)                  # 'Species' column is left out.

#### 2.2. Detect and process the missing values:

In [None]:
# Number of missing values in each column (True counts as 1 when summed).
df.isna().sum()

In [None]:
# Fraction of missing values in each column (mean of the boolean mask).
df.isna().mean()

In [None]:
# Drop the rows with at least one missing value (df itself is left unchanged).
df2 = df.dropna(axis=0)

In [None]:
# (rows, columns) remaining after the rows with missing values were dropped.
df2.shape

In [None]:
# Show the first 3 rows of the result.
df2.head(3)

In [None]:
# Drop the rows with fewer than 4 non-missing values
# (thresh is the minimum number of non-missing values a row needs to be kept).
df2 = df.dropna(axis=0, thresh = 4)
df2.shape

In [None]:
# Drop the columns with at least one missing value (axis=1 selects columns).
df3 = df.dropna(axis=1)
df3.shape

In [None]:
# Show the first 3 rows of the columns that remain.
df3.head(3)

In [None]:
# Drop the columns with fewer than 149 non-missing values
# (thresh is the minimum number of non-missing values a column needs to be kept).
df3 = df.dropna(axis=1, thresh = 149)
df3.shape

In [None]:
# Show the first 3 rows of the columns that remain.
df3.head(3)

#### 2.3. Fill the missing values:

In [None]:
# Fill the missing values with 0 in a new DataFrame (df itself is not modified).
df4 = df.fillna(value=0)
df4.count()                         # axis=0 by default.

In [None]:
# Replace the missing values in each affected column with that column's average
# (mean() skips NaN by default, so the average comes from the observed values).
for col_name in ('Sepal.Length', 'Petal.Length', 'Petal.Width'):
    df[col_name] = df[col_name].fillna(value=df[col_name].mean())
df.count()                          # axis=0 by default.

In [None]:
# Show the first 3 rows to inspect the filled-in values.
df.head(3)