# Lesson 1

## Library loading

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Feature selection based on variance.

Here we will use variance threshold to remove some features that very low variance, ie those columns have almost the same values in all the rows. Hence such features do not explain anything about the target variable 

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

In [3]:
from sklearn.feature_selection import VarianceThreshold

Remember the definition of the variance:

$\sigma_{col}^{2}=\frac{1}{n}\sum_{i=1}^{n}(x_{i}-\bar{X})^2$

In [4]:
sel = VarianceThreshold(threshold=(.9))
temp = sel.fit_transform(numerical)
temp = pd.DataFrame(temp)
print(numerical.shape)
print(temp.shape)

(95412, 315)
(95412, 305)


As it can be clearly seen, the VarianceThreshold method reduces the size of the dataframe droping those columns for which the variance is smaller than 0.9.

To check which columns were removed we can do:

In [5]:
list(sel.get_support())

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True

In [6]:
list(sel.variances_)

[909809.7483785483,
 208.8158971989076,
 2.8881932699556185,
 7.320489004283561,
 86.61745725511418,
 25.65924724942382,
 131.57223253672964,
 227.9397116710221,
 313.6076948242227,
 19.286818754804564,
 26.281939824591404,
 17.265241830723728,
 8.311701637927996,
 32984198.86101045,
 2126043.3424667185,
 4507490.409069261,
 2249.6547195922,
 974.8079246455828,
 1603.0129942634196,
 30.97534865272938,
 33.27364006327841,
 441.57831545183956,
 278.5789169921517,
 12.03604563865652,
 49.980973211488966,
 190.05348704784367,
 0.4614096996326538,
 4.9825142559714894,
 6.419788645235147,
 5.332371515424055,
 1.007061136251383,
 1.1111593939063544,
 1.368116911416873,
 128.44765349637763,
 1.6713672732722233,
 10.295326351943883,
 11.32502530691183,
 69.48179133914489,
 68.05635235184752,
 65.75686276277202,
 52.72468559024539,
 48.441176213956375,
 47.3710295874623,
 56.47911443932282,
 67.7692564686937,
 40.71093052337281,
 33.95626692614386,
 36.82299864542428,
 60.64881975087444,
 38.576

In [7]:
my_list_of_tuples = [(index,value) for (index,value) in enumerate(list(sel.variances_)) if value < 0.9]

In [8]:
cols_nul_variance =  [list(numerical.columns)[index] for index in [x[0] for x in my_list_of_tuples]]
cols_nul_variance

['ETH6',
 'TPE6',
 'TPE7',
 'ANC5',
 'ANC6',
 'ANC11',
 'ANC15',
 'HC15',
 'MHUC2',
 'HPHONE_D']

## Activity 1

Fill the missing code:


In [9]:
X = numerical
y = targets['TARGET_B']

In [10]:
targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET_B  95412 non-null  int64  
 1   TARGET_D  95412 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
kbest = SelectKBest(chi2, k=10).fit_transform(X, y)
# Here we chose 10 so that is easier to analyze results later, as we will see
selected = pd.DataFrame(kbest)
selected.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,992.0,264.0,479.0,635.0,0.0,318.0,12883.0,240.0,95515.0
1,1.0,3611.0,940.0,5468.0,5218.0,4480.0,1096.0,36175.0,47.0,148535.0
2,1.0,7001.0,2040.0,497.0,546.0,0.0,292.0,11576.0,202.0,15078.0
3,0.0,640.0,160.0,1000.0,1263.0,9340.0,388.0,15130.0,109.0,172556.0
4,0.0,2520.0,627.0,576.0,594.0,5000.0,250.0,9836.0,254.0,7112.0


In [12]:
# To check the scores
model = SelectKBest(chi2, k=10).fit(X, y)
df =pd.DataFrame(data = model.scores_, columns = ['score'])
df['Column'] = numerical.columns
# Sorting data
print(df.sort_values(by = ['score'], ascending = False).head(10))
print()

# Just to check the columns, we can use the following code
cols = df.sort_values(by = ['score'], ascending = False).head(10)['Column']
print(cols)

             score    Column
311  527716.426176  CONTROLN
140  187983.976667       IC5
83    49855.611718       HV1
84    49561.067003       HV2
0     39087.069814     TCODE
133   26891.429352       MSA
13    17167.230879    POP901
137    2921.367106       IC2
14     2811.233301    POP902
303    2756.199364  RAMNTALL

311    CONTROLN
140         IC5
83          HV1
84          HV2
0         TCODE
133         MSA
13       POP901
137         IC2
14       POP902
303    RAMNTALL
Name: Column, dtype: object


# Lesson 2

## Recursive feature elimination

In [13]:
from sklearn.feature_selection import RFE
from sklearn import linear_model

# We can use any model, LM, K-NN,...
# For classification problems, the metric used to compare models is Accuracy
# For regression problems, the metric used to compare models is RMSE
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=20, verbose=False)
rfe.fit(X, y)

RFE(estimator=LinearRegression(), n_features_to_select=20, verbose=False)

In [14]:
# After we run the algorithm, it labels the top features as 1 and the rest are marked in an increasing order of importance. 
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = X.columns
df[df['Rank']==1]

Unnamed: 0,Rank,Column_name
2,1,INCOME
16,1,POP90C1
17,1,POP90C2
18,1,POP90C3
30,1,ETH10
76,1,DW3
77,1,DW4
78,1,DW5
121,1,RHP4
122,1,HUPA1


## Activity 2

While both methods are used for reducing the number of features in a dataset, there is an important difference.
Feature selection is simply selecting and excluding given features without changing them.
Dimensionality reduction transforms features into a lower dimension.

# Lesson 3

# Lesson 4

## Hypothesis testing

We want to test if our the **sample mean** is not equal to the **population mean** = 80.94. We also know that our **sample** has a size of 25 individuals.

$t = \frac{(\bar{X}-\mu)}{\hat{\sigma}/\sqrt{n}}$

where:

* $\bar{X}$ is the **sample mean**
* $\mu$ is the **population mean**
* $\hat{\sigma}$ is the **sample standard deviation**
* $n$ is the number of measures in our sample

In [15]:
import math

sample_mean = 80.94
pop_mean = 85
sample_std = 11.6
n = 25
statistic = (sample_mean - pop_mean)/(sample_std/math.sqrt(n))
print("Statistic is: ", statistic)

Statistic is:  -1.750000000000001


In [16]:
from scipy import stats
from numpy.random import normal


samples = {}

for i in range(10):
    sample_name = "sample_" + str(i)
    samples[sample_name] = normal(loc = 80.94, scale = 11.6, size = 25)
    sample_mean = "sample_" + str(i) + "_mean"
    samples[sample_mean] = np.mean(samples[sample_name])
    sample_std = "sample_" + str(i) + "_std"
    samples[sample_std] = np.std(samples[sample_name],ddof=1)
    sample_statistic = "sample_" + str(i) + "_t-statistic"
    samples[sample_statistic] = (samples[sample_mean]- pop_mean)/(samples[sample_std]/math.sqrt(n)) 
    print("The t-statistic for the sample {} is: {}".format(i,samples[sample_statistic]))


The t-statistic for the sample 0 is: -3.4284294948596377
The t-statistic for the sample 1 is: -1.509158531895055
The t-statistic for the sample 2 is: -2.8013761371641563
The t-statistic for the sample 3 is: -1.172859692101228
The t-statistic for the sample 4 is: -2.8859289922906903
The t-statistic for the sample 5 is: -3.4991171944791604
The t-statistic for the sample 6 is: -0.07494189415869
The t-statistic for the sample 7 is: -1.5597905329045525
The t-statistic for the sample 8 is: -3.1097582238639383
The t-statistic for the sample 9 is: -3.398390852679428


Now that we have the t-statistic for each random sample, let's make the two tails test. Why two tails? Because we are looking what is the probability that we get a **sample mean** which deviates from the **population mean** more than out t-statistic. We don't care if the our **sample mean** is bigger or smaller than the **population mean**.

Therefore, we can ask ourselves what is the probability of having a deviation within -t and t.


In [17]:
print("Assuming a significance level of 0.05")
print()

for i in range(10):
    sample_name = "sample_" + str(i)
    print("The p-value of sample {} is: {:-5.3}".format(i,stats.ttest_1samp(samples[sample_name],85)[1]))
    if ( stats.ttest_1samp(samples[sample_name],85)[1] < 0.05 ):
        print("Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample {} given Ho.".format(i))
    print()

Assuming a significance level of 0.05

The p-value of sample 0 is: 0.0022
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 0 given Ho.

The p-value of sample 1 is: 0.144

The p-value of sample 2 is: 0.0099
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 2 given Ho.

The p-value of sample 3 is: 0.252

The p-value of sample 4 is: 0.00812
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 4 given Ho.

The p-value of sample 5 is: 0.00185
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 5 given Ho.

The p-value of sample 6 is: 0.941

The p-value of sample 7 is: 0.132

The p-value of sample 8 is: 0.00477
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 8 given Ho.

The p-value of sample 9 is: 0.00237
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 9 given Ho.



### Lab | Inferential statistics

In [18]:
import math

sample_mean = 130.1
pop_mean = 120
sample_std = 21.21
n = 100
statistic = (sample_mean - pop_mean)/(sample_std/math.sqrt(n))
print("Statistic is: ", statistic)

Statistic is:  4.761904761904759


In [19]:
from scipy import stats
from numpy.random import normal


samples = {}

for i in range(10):
    sample_name = "sample_" + str(i)
    samples[sample_name] = normal(loc = 80.94, scale = 11.6, size = 25)
    sample_mean = "sample_" + str(i) + "_mean"
    samples[sample_mean] = np.mean(samples[sample_name])
    sample_std = "sample_" + str(i) + "_std"
    samples[sample_std] = np.std(samples[sample_name],ddof=1)
    sample_statistic = "sample_" + str(i) + "_t-statistic"
    samples[sample_statistic] = (samples[sample_mean]- pop_mean)/(samples[sample_std]/math.sqrt(n)) 
    print("The t-statistic for the sample {} is: {}".format(i,samples[sample_statistic]))

The t-statistic for the sample 0 is: -31.88633545826595
The t-statistic for the sample 1 is: -29.558989440375488
The t-statistic for the sample 2 is: -43.63272360454621
The t-statistic for the sample 3 is: -30.752294028708775
The t-statistic for the sample 4 is: -40.54306209979111
The t-statistic for the sample 5 is: -37.08645557710017
The t-statistic for the sample 6 is: -39.01222132174635
The t-statistic for the sample 7 is: -36.60695059651024
The t-statistic for the sample 8 is: -39.113317241056954
The t-statistic for the sample 9 is: -40.19640223285329


In [20]:
print("Assuming a significance level of 0.05")
print()

for i in range(10):
    sample_name = "sample_" + str(i)
    print("The p-value of sample {} is: {:-5.3}".format(i,stats.ttest_1samp(samples[sample_name],85)[1]))
    if ( stats.ttest_1samp(samples[sample_name],85)[1] < 0.05 ):
        print("Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample {} given Ho.".format(i))
    print()

Assuming a significance level of 0.05

The p-value of sample 0 is: 0.539

The p-value of sample 1 is: 0.0609

The p-value of sample 2 is: 0.00802
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 2 given Ho.

The p-value of sample 3 is: 0.0124
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 3 given Ho.

The p-value of sample 4 is: 0.0197
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 4 given Ho.

The p-value of sample 5 is: 0.0255
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 5 given Ho.

The p-value of sample 6 is: 0.00116
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 6 given Ho.

The p-value of sample 7 is: 0.341

The p-value of sample 8 is: 0.0227
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 8 given Ho.

The p-value of sample 9 is: 0.0249
Therefore we discard the null hypothesis Ho,