In [1]:
import numpy as np
import pandas as pd
# from sklearn import svm
from scipy.stats import kurtosis
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import math

# inicializacion de las variables
presion = np.zeros(0)
altitud = np.zeros(0)
humedad = np.zeros(0)
temperatura = np.zeros(0)


def buildDataF(dataF):
    dfTemp = pd.DataFrame()

    # Divide el dataframe en n grupos, n viene siendo una division del tamño de renglones entre 10
    for data in np.array_split(dataF, math.floor(np.shape(dataF)[0]/10)):
        # Calculos de la presion
        presion = data["Presion"].values
        presion = presion[np.logical_not(np.isnan(presion))]
        if presion.any():
            presionMean = presion.mean()
            presionStd = np.std(presion)
            presionKrt = kurtosis(presion)

            # Calculos de la altitud
            altitud = data["Altitud"].values
            altitud = altitud[np.logical_not(np.isnan(altitud))]
            altitudMean = altitud.mean()
            altitudStd = np.std(altitud)
            altitudKrt = kurtosis(altitud)

            # Calculos de la humedad
            humedad = data["Humedad"].values
            humedad = humedad[np.logical_not(np.isnan(humedad))]
            humedadMean = humedad.mean()
            humedadStd = np.std(humedad)
            humedadKrt = kurtosis(humedad)

            # Calculos de la temperatura
            temperatura = data["Temperatura"].values
            # temperaturaNan = data["Temperatura"].values
            temperatura = temperatura[np.logical_not(np.isnan(temperatura))]
            temperaturaMean = temperatura.mean()
            temperaturaStd = np.std(temperatura)
            temperaturaKrt = kurtosis(temperatura)
        else:
            print("something is null")

        # Junta los datos que se meteran en un renglon del dataframe
        data = [[presionMean, presionStd, presionKrt,
                altitudMean, altitudStd, altitudKrt,
                humedadMean, humedadStd, humedadKrt,
                temperaturaMean, temperaturaStd, temperaturaKrt, dataF.Ocupacion.iloc[0]]]

        names = ["presionMean", "presionStd", "presionKrt",
                "altitudMean", "altitudStd", "altitudKrt",
                "humedadMean", "humedadStd", "humedadKrt",
                "temperaturaMean", "temperaturaStd", "temperaturaKrt", "Ocupacion"]

        # Une este renglon al dataframe que se enviara al final
        dfTemp = dfTemp.append(pd.DataFrame(data, columns=names))

    return dfTemp

In [2]:
df = pd.read_csv("Sensado_GYM_Completo.csv", usecols=[
                 "Fecha", "Presion", "Altitud", "Humedad", "Temperatura", "Ocupacion"])
df = df.dropna()

# Filtra el dataframe para solo contenga el día
df['Fecha'] = pd.to_datetime(df['Fecha']).dt.strftime(
    '%d')  # dt.strftime('%d/%m/%Y %H:%M')

dfLow = df[df["Ocupacion"] == 'L']
dfMed = df[df["Ocupacion"] == 'M']
dfHigh = df[df["Ocupacion"] == 'H']

dfFinal = pd.DataFrame()

while not dfLow.empty:  # Mientras no esten vacias
    # obtiene solo los valores con la primera fecha y lo envía a la funcion para
    dfFinal = dfFinal.append(buildDataF(
        dfLow[dfLow["Fecha"] == dfLow.iloc[0, 0]]))
                                                                    # que construya el dataframe y lo una al dataframe que tendra todos los datos
    # Remueve todos los datos que contengan la primera fecha
    dfLow = dfLow[dfLow.Fecha != dfLow.iloc[0, 0]]

# print(dfFinal)

while not dfMed.empty:  # Mientras no esten vacias
    # obtiene solo los valores con la primera fecha y lo envía a la funcion para
    dfFinal = dfFinal.append(buildDataF(
        dfMed[dfMed["Fecha"] == dfMed.iloc[0, 0]]))
                                                    # que construya el dataframe
    # Remueve todos los datos que contengan la primera fecha
    dfMed = dfMed[dfMed.Fecha != dfMed.iloc[0, 0]]

while not dfHigh.empty:  # Mientras no esten vacias
    # obtiene solo los valores con la primera fecha y lo envía a la funcion para
    dfFinal = dfFinal.append(buildDataF(
        dfHigh[dfHigh["Fecha"] == dfHigh.iloc[0, 0]]))
                                                    # que construya el dataframe
    # Remueve todos los datos que contengan la primera fecha
    dfHigh = dfHigh[dfHigh.Fecha != dfHigh.iloc[0, 0]]


In [3]:
dfFinal

Unnamed: 0,presionMean,presionStd,presionKrt,altitudMean,altitudStd,altitudKrt,humedadMean,humedadStd,humedadKrt,temperaturaMean,temperaturaStd,temperaturaKrt,Ocupacion
0,95173.331818,1.502754,-1.181816,525.237273,0.132534,-1.182429,61.012727,0.045345,-0.999435,27.100000,0.004264,2.500000,L
0,95172.134545,1.494355,-0.115611,525.342727,0.128847,-0.140854,60.891818,0.061173,0.388098,27.084545,0.004979,-1.966667,L
0,95173.137273,1.788164,-1.130988,525.255455,0.156517,-1.143531,60.926364,0.117958,-1.309023,27.093636,0.008814,-0.768900,L
0,95173.272000,1.152457,-0.648982,525.244000,0.100020,-0.669249,61.243000,0.044057,-0.952888,27.122000,0.007483,-1.153061,L
0,95174.345000,1.178196,-0.216864,525.150000,0.103150,-0.276097,61.254000,0.072691,0.651563,27.145000,0.005000,-2.000000,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,94919.749000,1.037771,0.726515,547.474000,0.090133,0.642208,56.430000,0.037417,-1.128571,29.711000,0.024678,-1.079554,H
0,94913.905000,4.538525,-0.613220,547.986000,0.399079,-0.620622,56.635000,0.111915,0.683361,29.741000,0.022561,-1.051355,H
0,94910.914000,1.502912,1.235757,548.249000,0.133300,1.294699,56.410000,0.180222,-1.750375,29.721000,0.009434,-0.028532,H
0,94910.961000,1.590273,-1.501453,548.228000,0.139341,-1.491765,56.011000,0.448273,-1.481737,29.654000,0.031686,-1.218330,H


In [4]:
# ---------------------------------- Clasificacion de instancias usando una SVM-------------------------------#
df = dfFinal

training_set, test_set=train_test_split(df, test_size=0.2, random_state=1)

X_train = training_set.iloc[ :, 0:12 ].values
Y_train = training_set.iloc[ :, 12 ].values
X_test = test_set.iloc[ :, 0:12 ].values
Y_test = test_set.iloc[ :, 12 ].values

In [5]:
#----------------- PCA --------------------------#
pca = PCA(n_components=3) # Aca podemos se puede varian el num de comp para buscar el mejor balance (accuracy - n_of_components)

print("xtrain.SHAPE antes del PCA: ", X_train.shape)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
for n in pca.explained_variance_ratio_:
    print( n )
print("xtrain.SHAPE despues del PCA: ", X_train.shape)

xtrain.SHAPE antes del PCA:  (807, 12)
0.9983601405746273
0.0012757490461757655
0.00017369534729943067
xtrain.SHAPE despues del PCA:  (807, 3)


In [6]:
# ------------------ Oversampling ---------------------------------#
# print("Group By: \n",df.groupby('Ocupacion').count())
print("Antes del resampling")
print("Xtrain: ", np.size(X_train))
print("Ytrain: ", np.size(Y_train))

Antes del resampling
Xtrain:  2421
Ytrain:  807


In [7]:
#------------------Tecnicas de Resampling---------------------------#
# sm = RandomOverSampler(random_state = 0)
# sm = SMOTE(random_state = 0)
sm=ADASYN(random_state=0)
X_train, Y_train=sm.fit_sample(X_train, Y_train)

print("Despues del resampling")
print("Xtrain: ", np.size(X_train))
print("Ytrain: ", np.size(Y_train))

classifier=SVC(kernel='rbf', random_state=1)
classifier.fit(X_train, Y_train)
Y_pred=classifier.predict(X_test)

test_set["Predictions"]=Y_pred

cm=confusion_matrix(Y_test, Y_pred)
print("CM SVM: \n", cm)
accuracy=float(cm.diagonal().sum())/len(Y_test)

print("\nAccuracy Of SVM: ", accuracy, "\n")

Despues del resampling
Xtrain:  3843
Ytrain:  1281
CM SVM: 
 [[30 19  0]
 [ 0 54  0]
 [ 0  0 99]]

Accuracy Of SVM:  0.905940594059406 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [8]:
# ---------------------------------- Clasificacion de instancias usando una Weighted KNN-------------------------------#
classifier=KNeighborsClassifier(n_neighbors=15)
classifier.fit(X_train, Y_train)

Y_pred=classifier.predict(X_test)

test_set["Predictions"]=Y_pred

cm=confusion_matrix(Y_test, Y_pred)
print("CM KNN: \n", cm)
accuracy=float(cm.diagonal().sum())/len(Y_test)

print("\nAccuracy Of WKNN: ", accuracy, "\n")

CM KNN: 
 [[43  6  0]
 [ 3 51  0]
 [ 0  0 99]]

Accuracy Of WKNN:  0.9554455445544554 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [9]:
# ---------------------------------- Clasificacion de instancias usando una Random Forest-------------------------------#
classifier=RandomForestClassifier(
    n_estimators=100, max_depth=2, random_state=0)

classifier.fit(X_train, Y_train)

Y_pred=classifier.predict(X_test)


# print(classifier.feature_importances_)
cm=confusion_matrix(Y_test, Y_pred)
print("CM RF: \n", cm)
accuracy=float(cm.diagonal().sum())/len(Y_test)

print("\nAccuracy Of RF: ", accuracy, "\n")

CM RF: 
 [[16 28  5]
 [ 9 45  0]
 [15 18 66]]

Accuracy Of RF:  0.6287128712871287 

