### Import libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import kmapper as km
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Read Data

In [89]:
# Load the signals table from a CSV file located next to the notebook.
datos_b1 = pd.read_csv(filepath_or_buffer="señales_b1_100.csv")
# Preview the first five rows.
datos_b1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,231,232,233,234,235,236,237,238,239,tipo_señal
0,0.699886,0.673249,0.633735,0.607163,0.581062,0.44021,0.42723,0.384592,0.379101,0.370899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.677619,0.671661,0.660272,0.617593,0.580183,0.510488,0.464481,0.43789,0.415127,0.396168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.389672,1.299218,1.068889,0.966674,0.856888,0.816201,0.715938,0.705629,0.676231,0.620521,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.602958,0.506011,0.476734,0.424379,0.412444,0.410826,0.408294,0.393165,0.392688,0.384632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.57869,0.522158,0.4282,0.407731,0.39946,0.361513,0.333549,0.326281,0.325906,0.314948,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Descriptive Analysis

In [90]:
# Number of rows per value of the label column.
datos_b1.loc[:, "tipo_señal"].value_counts()

1.0    57
0.0    43
Name: tipo_señal, dtype: int64

### Mapper

In [91]:
# Features: the first five columns that remain once the label column is removed.
X = datos_b1.drop(columns=["tipo_señal"]).iloc[:, :5]
# Labels: kept aside; later used only to colour the Mapper graph.
y = datos_b1["tipo_señal"]
X

Unnamed: 0,0,1,2,3,4
0,0.699886,0.673249,0.633735,0.607163,0.581062
1,0.677619,0.671661,0.660272,0.617593,0.580183
2,1.389672,1.299218,1.068889,0.966674,0.856888
3,0.602958,0.506011,0.476734,0.424379,0.412444
4,0.578690,0.522158,0.428200,0.407731,0.399460
...,...,...,...,...,...
95,0.441519,0.402134,0.395426,0.385906,0.379282
96,0.571815,0.485590,0.469890,0.456732,0.422064
97,0.550898,0.519087,0.462780,0.435269,0.432664
98,0.533210,0.488191,0.476843,0.441431,0.437161


In [92]:
# Row-wise mean of the signal columns, computed in one vectorized call instead
# of a Python loop over X.loc[i] (which also required the index labels to be
# exactly 0..n-1). A pre-existing "promedio" column is left out of the average
# so re-running the cell never folds an earlier result back in.
promedios = X.drop(columns="promedio", errors="ignore").mean(axis=1).tolist()

# Stored as an extra (sixth) column; it is the column later used as the Mapper lens.
X["promedio"] = promedios
X

Unnamed: 0,0,1,2,3,4,promedio
0,0.699886,0.673249,0.633735,0.607163,0.581062,0.639019
1,0.677619,0.671661,0.660272,0.617593,0.580183,0.641466
2,1.389672,1.299218,1.068889,0.966674,0.856888,1.116268
3,0.602958,0.506011,0.476734,0.424379,0.412444,0.484505
4,0.578690,0.522158,0.428200,0.407731,0.399460,0.467248
...,...,...,...,...,...,...
95,0.441519,0.402134,0.395426,0.385906,0.379282,0.400853
96,0.571815,0.485590,0.469890,0.456732,0.422064,0.481218
97,0.550898,0.519087,0.462780,0.435269,0.432664,0.480140
98,0.533210,0.488191,0.476843,0.441431,0.437161,0.475367


In [93]:
# Normalize the data so that no variable outweighs the others merely because
# of its units: each column is min-max scaled independently.
scaler = MinMaxScaler()
features_scaled = scaler.fit(X).transform(X)

In [94]:
# Peek at the first five scaled rows (all columns).
features_scaled[:5, :]

array([[0.18235304, 0.2941293 , 0.27344117, 0.28959644, 0.27696116,
        0.24445382],
       [0.16834964, 0.29260447, 0.29997348, 0.30004172, 0.27606397,
        0.24667523],
       [0.61616239, 0.89550125, 0.70851145, 0.64963735, 0.55842756,
        0.67772045],
       [0.12139483, 0.13346351, 0.11647083, 0.1065431 , 0.10489479,
        0.10418003],
       [0.1061324 , 0.1489759 , 0.06794598, 0.08987106, 0.09164514,
        0.08851301]])

In [95]:
# Alternative lens (disabled): PCA projection with n_components=1
#pca  =PCA(n_components=1)
#projected_variable =pca.fit_transform(features_scaled)

In [96]:
# Choose the lens (the variable the data is projected onto) for Mapper.
# A PCA projection was tried as an alternative and is left disabled:
# pca = PCA(n_components=1)
mapper = km.KeplerMapper(verbose=1)

# The lens is column 5 of the scaled matrix, i.e. the "promedio" (row mean) feature.
projected_data = mapper.fit_transform(
    features_scaled,
    projection=[5],
)

KeplerMapper(verbose=1)
..Composing projection pipeline of length 1:
	Projections: [5]
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (100, 6)

..Projecting data using: [5]

..Scaling with: MinMaxScaler()



In [106]:
# Cover of the lens: 25 cubes with perc_overlap set to 0.4.
covering = km.Cover(
    n_cubes=25,
    perc_overlap=0.4,
)
covering

Cover(n_cubes=25, perc_overlap=0.4, limits=None, verbose=0)

In [107]:
# Build the Mapper graph from the lens values and the scaled features, using
# KMeans (3 clusters) as the clusterer and the cover defined above.
# random_state pins the KMeans initialisation so that node membership is the
# same on every run; n_init is given explicitly because its default value
# differs between scikit-learn versions.
G = mapper.map(
    projected_data,
    features_scaled,
    clusterer=KMeans(n_clusters=3, n_init=10, random_state=42),
    cover=covering,
)

Mapping on data shaped (100, 6) using lens shaped (100, 1)

Creating 25 hypercubes.

Created 19 edges and 24 nodes in 0:00:00.149750.




In [108]:
# Write the interactive Mapper graph to an HTML file, colouring nodes by the
# signal-type label. The trailing semicolon suppresses the cell's output.
mapper.visualize(
    G,
    path_html="signal_classifier.html",
    title="Signal Data",
    color_values=y,
    color_function_name="Tipo de señal",
    # Disabled option:
    # node_color_function=np.array(['average','std','sum','max','min']),
);


Wrote visualization to: signal_classifier.html


In [75]:
# Rows selected by hard-coded index labels.
# NOTE(review): presumably the members of Mapper node "cube1_cluster1" copied
# by hand from an earlier graph — confirm they still match the current G.
cube1_cluster1 = datos_b1.loc[
    [
        0, 1, 11, 15, 29, 31, 45, 48, 49, 50, 54,
        59, 61, 65, 68, 71, 72, 79, 84, 87, 93,
    ],
    :,
]
cube1_cluster1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,231,232,233,234,235,236,237,238,239,tipo_señal
0,0.699886,0.673249,0.633735,0.607163,0.581062,0.44021,0.42723,0.384592,0.379101,0.370899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.677619,0.671661,0.660272,0.617593,0.580183,0.510488,0.464481,0.43789,0.415127,0.396168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.731365,0.698141,0.564119,0.542347,0.500152,0.47626,0.471536,0.44767,0.412011,0.407808,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.775419,0.680265,0.454023,0.442204,0.42914,0.39508,0.366099,0.361722,0.339771,0.33479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.708303,0.590241,0.533331,0.509515,0.471952,0.465069,0.436911,0.387608,0.369683,0.363782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
31,0.606018,0.575366,0.546076,0.544704,0.495159,0.468371,0.454737,0.449365,0.435091,0.426117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45,0.723857,0.511645,0.499891,0.489035,0.476363,0.47003,0.438924,0.422375,0.413281,0.407637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.696219,0.618173,0.539224,0.490262,0.408012,0.398252,0.385942,0.384291,0.383941,0.383898,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
49,0.572731,0.547718,0.535877,0.519229,0.491318,0.484636,0.473698,0.443262,0.438402,0.429615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50,0.720984,0.519558,0.494582,0.492852,0.486618,0.429416,0.413095,0.406397,0.385881,0.361088,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [76]:
# Share of each signal type among the selected rows.
cube1_cluster1.loc[:, "tipo_señal"].value_counts(normalize=True)

1.0    0.666667
0.0    0.333333
Name: tipo_señal, dtype: float64

In [80]:
# Rows selected by hard-coded index labels.
# NOTE(review): presumably the members of Mapper node "cube0_cluster0" copied
# by hand from an earlier graph — confirm they still match the current G.
cube0_cluster0 = datos_b1.loc[
    [
        3, 4, 5, 6, 9, 12, 16, 18, 19, 20, 21, 23, 25, 26, 27,
        28, 30, 32, 35, 36, 37, 38, 39, 42, 44, 46, 47, 49, 51, 52,
        53, 57, 60, 62, 63, 64, 81, 82, 85, 89, 92, 94, 96, 97, 98,
    ],
    :,
]
cube0_cluster0

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,231,232,233,234,235,236,237,238,239,tipo_señal
3,0.602958,0.506011,0.476734,0.424379,0.412444,0.410826,0.408294,0.393165,0.392688,0.384632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.57869,0.522158,0.4282,0.407731,0.39946,0.361513,0.333549,0.326281,0.325906,0.314948,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.552357,0.487443,0.422451,0.408986,0.399443,0.39287,0.382605,0.38253,0.358681,0.348446,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.579677,0.565156,0.445428,0.439333,0.432906,0.42003,0.41482,0.374058,0.371835,0.369285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.495428,0.485779,0.458985,0.447552,0.410092,0.380189,0.379969,0.340997,0.338303,0.3247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.53603,0.526618,0.476196,0.467635,0.467511,0.436237,0.430693,0.406378,0.359762,0.34388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.550927,0.544683,0.498911,0.477979,0.409102,0.405551,0.395569,0.362424,0.36176,0.348006,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
18,0.514047,0.452049,0.444575,0.429735,0.425331,0.409155,0.404699,0.389588,0.380412,0.376022,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.51926,0.489966,0.472609,0.456364,0.429751,0.404253,0.402516,0.400418,0.394568,0.392696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,0.60138,0.534656,0.521672,0.489134,0.438742,0.415094,0.415061,0.406108,0.39923,0.391542,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [82]:
# Share of each signal type among the selected rows.
cube0_cluster0.loc[:, "tipo_señal"].value_counts(normalize=True)

1.0    0.555556
0.0    0.444444
Name: tipo_señal, dtype: float64