In [69]:
import pandas as pd
import numpy as np

1. [Exploratory Data Analysis (EDA)](#Exploratory_Data_Analysis)
2. [Principal Component Analysis (PCA)](#Principal_component_analysis)



<a id='Exploratory_Data_Analysis'></a>
#### Exploratory Data Analysis (EDA)

In [70]:
def nan_processor(df, replacement_str):
    """
    Turn a placeholder string into missing values and drop incomplete rows.

    Every cell equal to ``replacement_str`` is swapped for ``np.nan``;
    afterwards any row holding at least one ``np.nan`` is discarded.
    Both steps return new objects, so the input frame is not modified.

    :param df: Input data frame (pandas.DataFrame)
    :param replacement_str: string to find and replace by np.nan
    :returns: DataFrame where the occurrences of replacement_str have been
        replaced by np.nan and subsequently all rows containing np.nan have
        been removed
    """
    with_missing = df.replace(to_replace=replacement_str, value=np.nan)
    complete_rows = with_missing.dropna()
    return complete_rows

In [71]:
# Toy frame in which the string 'blah' plays the role of a missing value.
data_df = pd.DataFrame(
    {
        'A': [0.5, 0.2, 0.1, 0.7],
        'B': [0.3, 0.1, 'blah', 0.2],
        'C': ['blah', 5, 6, 1],
    }
)
data_df.head()

Unnamed: 0,A,B,C
0,0.5,0.3,blah
1,0.2,0.1,5
2,0.1,blah,6
3,0.7,0.2,1


In [72]:
# Rows 0 and 2 each contain 'blah', so only rows 1 and 3 survive.
nan_processor(df=data_df, replacement_str='blah')

Unnamed: 0,A,B,C
1,0.2,0.1,5.0
3,0.7,0.2,1.0


In [73]:
def feature_cleaner(df, low, high):
    """
    Trim each feature to its (low, high) percentile band, then standardise.

    Take a dataframe where columns are all numerical and non-constant.
    For each feature, mark the values that are not strictly between the given
    percentiles (low-high) as np.nan. A value that is exactly on the high or
    low percentile is marked as nan too.

    Then, remove all rows containing np.nan.
    Finally, the columns are scaled to have zero mean and unit variance
    (without sklearn; the sample standard deviation, pandas' default
    ``ddof=1``, is used).

    The input DataFrame is NOT modified: all work happens on new objects,
    so calling the function repeatedly on the same frame gives the same
    result.

    :param df:      Input DataFrame (with numerical columns)
    :param low:     Lowest percentile  (0.0<low<1.0)
    :param high:    Highest percentile (low<high<1.0)
    :returns:       Scaled DataFrame where elements that are outside of the
                    desired percentile range have been removed
    """
    # One threshold per column; both are Series indexed by column label.
    lower_bounds = df.quantile(low)
    upper_bounds = df.quantile(high)

    # Strict inequalities: values sitting exactly on a percentile are rejected.
    # NaN compares False on both sides, so pre-existing NaNs are rejected too.
    inside_band = df.gt(lower_bounds, axis=1) & df.lt(upper_bounds, axis=1)

    # `where` keeps the values flagged True and puts NaN everywhere else.
    trimmed = df.where(inside_band).dropna()

    return (trimmed - trimmed.mean(axis=0)) / trimmed.std(axis=0)

In [74]:
# Three numeric features (labelled 0, 1, 2), each with obvious extreme values.
data2_df = pd.DataFrame(
    {
        0: [0.1, 5, 0.2, 0.3, -0.1, -10],
        1: [0.2, 15, 0.3, 0.2, -0.2, 0.4],
        2: [25, 1, 0.5, 0.01, -0.9, 50],
    }
)
data2_df.head()

Unnamed: 0,0,1,2
0,0.1,0.2,25.0
1,5.0,15.0,1.0
2,0.2,0.3,0.5
3,0.3,0.2,0.01
4,-0.1,-0.2,-0.9


In [75]:
# Pass a copy so this cell can be re-run (and `data2_df` re-displayed)
# without depending on whether feature_cleaner touches its argument.
feature_cleaner(data2_df.copy(), low=0.01, high=0.99)

Unnamed: 0,0,1,2
0,-1.0,-0.57735,1.154531
2,-2.775558e-16,1.154701,-0.560119
3,1.0,-0.57735,-0.594412


In [76]:
def get_feature(df):
    """
    Find the feature whose range/variance ratio differs most between classes.

    Take a dataframe where all columns are numerical (no NaNs) and not constant.
    One of the columns, named "CLASS", is either 0 or 1 (any non-zero value is
    treated as class 1).

    Within each class, for each feature compute the ratio (R) of the
    range over the variance (the range is the gap between the smallest
    and largest value; the variance is the population variance, i.e.
    numpy's default ``ddof=0``).

    For each column, compute the ratio (say K) of the larger R to the smaller R.
    Return the name of the column for which this last ratio K is largest.

    Rows are selected with a boolean mask, so the result does not depend on
    the DataFrame's index labels, and the returned name is looked up among
    the feature columns only, so the position of "CLASS" does not matter.

    :param df:  Input DataFrame (with numerical columns)
    :returns:   Name of the column with largest K
    :raises ValueError: if one of the two classes has no rows
    """
    is_class_one = df['CLASS'].to_numpy() != 0

    if is_class_one.all() or not is_class_one.any():
        raise ValueError("both classes (0 and 1) must be present in 'CLASS'")

    # Drop CLASS so it is neither scored nor able to shift column positions.
    features = df.drop(columns='CLASS')
    values = features.to_numpy()

    def range_over_variance(rows):
        # Column-wise (max - min) / variance for the given block of rows.
        return (rows.max(axis=0) - rows.min(axis=0)) / rows.var(axis=0)

    r_0 = range_over_variance(values[~is_class_one])
    r_1 = range_over_variance(values[is_class_one])

    k = np.maximum(r_0, r_1) / np.minimum(r_0, r_1)

    # Index into the CLASS-less columns: `k` has one entry per feature.
    return features.columns[np.argmax(k)]

In [77]:
# Same three features as before plus a binary CLASS label.
data3_df = pd.DataFrame(
    {
        'A': [0.1, 5, 0.2, 0.3, -0.1, -10],
        'B': [0.2, 15, 0.3, 0.2, -0.2, 0.4],
        'C': [25, 1, 0.5, 0.01, -0.9, 50],
        'CLASS': [0, 0, 1, 1, 0, 0],
    }
)
data3_df.head()

Unnamed: 0,A,B,C,CLASS
0,0.1,0.2,25.0,0
1,5.0,15.0,1.0,0
2,0.2,0.3,0.5,1
3,0.3,0.2,0.01,1
4,-0.1,-0.2,-0.9,0


In [78]:
# Name of the feature whose range/variance ratio differs most between classes.
get_feature(df=data3_df)

'B'

In [79]:
def one_hot_encode(label_to_encode, labels):
    """
    One-hot-encode a label against a list of possible labels.

    The result has one entry per element of ``labels``: 1 where that
    element equals ``label_to_encode`` and 0 everywhere else. A label that
    is absent from ``labels`` therefore encodes to all zeros.

    :param label_to_encode: the label to encode
    :param labels: a list of all possible labels
    :return: a list of 0s and 1s, as long as ``labels``
    """
    encoding = []
    for candidate in labels:
        if label_to_encode == candidate:
            encoding.append(1)
        else:
            encoding.append(0)
    return encoding

In [80]:
## Test the function
# Two labels that are present, then one ("f") that is not in the list.

print(one_hot_encode(label_to_encode="pink", labels=["blue", "red", "pink", "yellow"]))
print(one_hot_encode(label_to_encode="b", labels=["a", "b", "c", "d", "e"]))
print(one_hot_encode(label_to_encode="f", labels=["a", "b", "c", "d", "e"]))

[0, 0, 1, 0]
[0, 1, 0, 0, 0]
[0, 0, 0, 0, 0]


<a id='Principal_component_analysis'></a>
#### Principal Component Analysis (PCA)



In [81]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [82]:
def get_cumulated_variance(df, scale):
    """Apply PCA on a DataFrame and return the cumulated explained variance.

    The returned DataFrame has a single row. Column ``PCk`` holds the
    share of the total variance explained by the first ``k`` principal
    components together, expressed as a percentage, so the last column
    is 100 (up to rounding).

    There is one column per component actually fitted by PCA. That number
    is taken from the fitted model rather than from ``df.shape[1]``, so
    inputs with fewer rows than columns (where PCA keeps fewer components
    than there are features) are handled as well.

    If scale is True, it standardises the data first. The input DataFrame
    is not modified.

    :param df: pandas DataFrame
    :param scale: boolean, whether to scale or not
    :return: a new DataFrame with cumulated variance in percent
    """
    data = StandardScaler().fit_transform(df) if scale else df

    pca = PCA()
    pca.fit(data)

    cumulated_pct = np.cumsum(pca.explained_variance_ratio_) * 100
    # Name the columns after the components that exist, not after the features.
    component_names = ["PC" + str(i) for i in range(1, cumulated_pct.size + 1)]

    return pd.DataFrame(cumulated_pct.reshape(1, -1), columns=component_names)

In [83]:
df = pd.DataFrame(
    {
        'A': [1.3, 27, 3.3, 9.3],
        'B': [1.2, 2.1, 6.8, 3.2],
    }
)

# Cumulated explained variance (in %) on standardised data.
get_cumulated_variance(df, scale=True)

Unnamed: 0,PC1,PC2
0,64.818777,100.0


In [84]:
def get_coordinates_of_first_two(df, scale):
    """Apply PCA on a given DataFrame df and return a new DataFrame
    containing the coordinates of the first two principal components
    expressed in the original basis (with the original columns).

    The result has two rows, indexed "PC1" and "PC2", and one column per
    column of ``df``.

    If scale is True, it standardises the data first. Standardisation is
    done on a separate copy: the caller's DataFrame is never written to.

    :param df: pandas DataFrame
    :param scale: boolean, whether to scale or not
    :return: a new DataFrame with coordinates of PC1 and PC2
    """
    data = df
    if scale:
        # Build a fresh frame instead of assigning back into `df`.
        data = pd.DataFrame(StandardScaler().fit_transform(df),
                            index=df.index, columns=df.columns)

    pca = PCA(n_components=2)
    pca.fit(data)

    return pd.DataFrame(pca.components_, columns=df.columns, index=["PC1", "PC2"])


In [85]:
df = pd.DataFrame(
    {
        'A': [1.3, 27, 3.3, 9.3],
        'B': [1.2, 2.1, 6.8, 3.2],
    }
)

# Rows are PC1 / PC2, columns are the original features.
get_coordinates_of_first_two(df, scale=True)

Unnamed: 0,A,B
PC1,-0.707107,0.707107
PC2,-0.707107,-0.707107


In [86]:
def get_most_important_two(df, scale):
    """Apply PCA on a given DataFrame df and use it to determine the
    'most important' features in your dataset. To do so we will focus
    on the principal component that exhibits the highest explained
    variance (that's PC1).

    PC1 can be expressed as a vector with weight on each of the original
    columns. Here we want to return the names of the two features that
    have the highest weights in PC1 (in absolute value).

    The two names are always distinct columns: features are ranked by
    position, so when two weights tie exactly the column that comes first
    in ``df`` is ranked first.

    If scale is True, it standardises the data first. The caller's
    DataFrame is never modified.

    :param df: pandas DataFrame
    :param scale: boolean, whether to scale or not
    :return: names of the two most important features as a tuple
    :raises ValueError: if ``df`` has fewer than two columns
    """
    if df.shape[1] < 2:
        raise ValueError("at least two features are required")

    data = df
    if scale:
        # Build a fresh frame instead of assigning back into `df`.
        data = pd.DataFrame(StandardScaler().fit_transform(df),
                            index=df.index, columns=df.columns)

    pca = PCA(n_components=1)
    pca.fit(data)

    weights = np.abs(pca.components_[0])
    # Stable descending sort: ranks by position rather than by value lookup,
    # so tied weights can never resolve to the same column twice.
    first, second = np.argsort(-weights, kind="stable")[:2]

    return (df.columns[first], df.columns[second])

In [87]:
df_2 = pd.DataFrame(
    {
        'A': [1.3, 27, 3.3, 9.3],
        'B': [1.2, 2.1, 6.8, 3.2],
        'C': [0.1, 1.2, 23, 4.5],
    }
)
# Two features carrying the largest absolute weight in PC1.
get_most_important_two(df_2, scale=True)

('C', 'B')

In [88]:
def distance_in_n_dimensions(df, point_a, point_b, n, scale):
    """The function applies PCA on a given DataFrame df in order to find
    a new subspace of dimension n.

    Transform the two points point_a and point_b to be represented into that
    n dimensions space, compute the Euclidean distance between the points in
    that space and return it.

    If scale is True, it standardises the data first; the two points are
    standardised with the same scaler (fitted on ``df``) before projection.

    The caller's DataFrame is never modified: scaling and PCA operate on
    plain arrays derived from it.

    :param df: pandas DataFrame
    :param point_a: a numpy vector expressed in the same basis as df
    :param point_b: a numpy vector expressed in the same basis as df
    :param n: number of dimensions of the new space
    :param scale: whether to scale data or not
    :return: distance between points in the subspace
    """
    # The points are bare arrays, so the training data is handled as a bare
    # array as well: both go through the scaler/PCA in the same form.
    data = np.asarray(df)
    points = np.stack([point_a, point_b], axis=0)

    if scale:
        scaler = StandardScaler().fit(data)
        data = scaler.transform(data)
        points = scaler.transform(points)

    pca = PCA(n_components=n)
    pca.fit(data)
    projected = pca.transform(points)

    return np.linalg.norm(projected[0, :] - projected[1, :])


In [89]:
df_2 = pd.DataFrame(
    {
        'A': [1.3, 2.0, 3.3, 5.1],
        'B': [1.2, 2.1, 6.8, 3.2],
        'C': [0.1, 1.2, 23.4, 4.5],
    }
)
point_a = np.array([1, 2, 3])
point_b = np.array([2, 3, 4])
n = 2
scale = False
# Distance between the two points once projected on the first n components.
distance_in_n_dimensions(df_2, point_a, point_b, n=n, scale=scale)


1.6509894302597583

In [90]:
def find_outliers_pca(df, n, scale):
    """Apply PCA on a given DataFrame df and transform all the data to be expressed
    on the first principal component.

    With all those points in a one-dimension space, find outliers by looking for points
    that lie at more than n standard deviations from the mean.

    It returns a new dataframe containing all the rows of the original dataset
    (original, unscaled values and original index labels) that have been found
    to be outliers when projected. When nothing is an outlier the result is an
    empty DataFrame with the same columns as ``df``.

    If scale is True, it standardises the data first. The standardised values
    are only used for the projection; the caller's DataFrame is never modified.

    :param df: pandas DataFrame
    :param n: number of standard deviations from the mean to be considered outlier
    :param scale: whether to scale data or not
    :return: pandas DataFrame containing outliers only
    """
    data = StandardScaler().fit_transform(df) if scale else df

    # fit_transform returns shape (n_rows, 1); flatten to one value per row.
    projected = PCA(n_components=1).fit_transform(data).ravel()

    distance_from_mean = np.abs(projected - projected.mean())
    is_outlier = distance_from_mean > n * projected.std()

    # Positional selection keeps the original values, labels and dtypes, and
    # yields an empty frame with the right columns when no row is flagged.
    return df.iloc[np.flatnonzero(is_outlier)]

In [91]:
df_2 = pd.DataFrame(
    {
        'A': [1.3, 2.0, 3.3, 5.1],
        'B': [1.2, 2.1, 6.8, 3.2],
        'C': [0.1, 1.2, 23.4, 4.5],
    }
)
# Rows lying more than one standard deviation from the mean along PC1.
find_outliers_pca(df_2, n=1, scale=False)

Unnamed: 0,A,B,C
2,3.3,6.8,23.4
