In [1]:
import numpy as np
from matplotlib.cm import ScalarMappable
import matplotlib.pyplot as plt
import pickle
import csv

In [2]:
n = 20 # Samples
m = 5   # Features

### Ejercicio 1: Normalización

In [3]:
def z_score(data):
    mu = np.mean(data,axis=0)
    s = np.std(data,axis=0)
    return (data-mu)/s

In [4]:
raw_data = np.random.uniform(low=0., high=100., size=(n,m))

print("raw_data:")
print(raw_data)
print("\nz_score(row_data):")
print(z_score(raw_data))

raw_data:
[[66.22530199 21.48968688 92.12262192 60.83010203  2.90627139]
 [ 5.15565561  9.03021729 69.35782157 92.32209899 80.0404363 ]
 [48.31157841 26.17918858  6.04208661 93.36896706 95.21412824]
 [11.54621655  8.94518893 27.20091044 97.78073771 68.44859134]
 [29.66733977 58.33918456 77.26557674 45.66324025 23.72643303]
 [82.66096189  3.03034782 13.33240102 49.02138926  5.57498645]
 [20.52515538 19.7471349  92.81114849  3.88930712 80.13590577]
 [18.25313422 82.53102992 82.16121593  4.97351504 51.09523854]
 [11.31216042 18.70672532 81.1404418  95.99194861 35.1226027 ]
 [50.3250608  96.27354423 22.09614859 17.2667588  28.17328034]
 [78.70600157  6.91739943  0.50090567 30.74955062 58.71053292]
 [21.93828821 80.29544278 83.8674341  25.6539376  56.58335008]
 [ 0.77795332 55.73983671 91.98087852  5.39780178 93.1712913 ]
 [19.39112261 60.62860971 87.98105507 55.71291241 14.64979884]
 [29.93743491  6.21245766 10.08199327 53.24086448 45.62126718]
 [10.10704039 41.5263623  68.16037958 30.0664

### Ejercicio 2: Remover filas y columnas con NaNs en un dataset

In [5]:
def remove_nan(data,axis=0):
    # axis: 0 delete row; 1 delete col; -1 delete both
    if axis == 1 or axis == 2:
        nan_values = np.argwhere(np.isnan(data))[:,axis]
        return np.delete(data,nan_values,axis=axis)
    if axis == -1:
        nan_row = np.argwhere(np.isnan(data))[:,0]
        nan_col = np.argwhere(np.isnan(data))[:,1]
        data = np.delete(data,nan_row,axis=0)
        return np.delete(data,nan_col,axis=1)
    return data

In [6]:
raw_data = np.random.uniform(low=0., high=100., size=(n,m))
raw_data[np.random.choice(raw_data.shape[0], 3, replace=False),
         np.random.choice(raw_data.shape[1], 3, replace=False)] = np.nan

print("raw_data:")
print(raw_data)
print("\nremove rows with nan:")
print(remove_nan(raw_data))
print("\nremove cols with nan:")
print(remove_nan(raw_data,axis=1))
print("\nremove both with nan:")
print(remove_nan(raw_data,axis=-1))

raw_data:
[[23.20300922 17.02339726 94.47678114 95.48174874 34.18366095]
 [49.76321503 40.97962999 65.07629899         nan 60.95142422]
 [ 5.20847421 49.94664738  4.83279259 35.78120582 83.71301127]
 [31.22075022 39.35920252  4.25992228 86.73868296  6.53073004]
 [10.2474854          nan 34.61455123 34.27580718 27.80112036]
 [64.63339349  7.87198596 19.2804688  82.94815302 82.36597888]
 [43.69871827 35.25034661 53.47497025 70.84343108         nan]
 [34.9794164  65.0114345  62.87219192 68.05013261  2.08638596]
 [85.53248529  7.87499761 34.43450652 41.86246039 18.3025156 ]
 [28.0397521   4.70794403  8.36992418 90.48253341 76.50488715]
 [ 2.13252713 40.01429529 32.87949545 42.47847379 72.69498986]
 [74.47393696 56.99882577 61.01650876 80.85879624 49.01505409]
 [20.77795887 51.28204578 68.90493471 71.40877816 66.3657499 ]
 [55.71867555 68.02950175  6.5949601  22.66908402 15.87815371]
 [52.46710135 53.16035392 76.48786513 31.60821909 95.96159906]
 [12.37232031 11.7222518  53.69401017 87.1577

### Ejercicio 3: Reemplazar NaNs por la media de la columna

In [7]:
def nan2mean(data):
    nan_values = np.argwhere(np.isnan(data))
    nan_cols = nan_values[:,1]
    mu = np.nanmean(data,axis=0)
    
    print("\nmu = "+str(mu)+"\n")
    
    for i in nan_values:
        data[tuple(i)]=mu[i[1]]
        
    return np.array(data)

In [8]:
raw_data = np.random.uniform(low=0., high=100., size=(n,m))
raw_data[np.random.choice(raw_data.shape[0], 10, replace=True),
         np.random.choice(raw_data.shape[1], 10, replace=True)] = np.nan

print("raw_data:")
print(raw_data)
print("\nremove rows with nan:")
print(nan2mean(raw_data))

raw_data:
[[47.7672396  73.20680021 15.32966327 78.93804348  5.68950668]
 [75.8907377  63.59367533 97.5712286  57.92219336  0.65270007]
 [58.50055201 39.29156846 80.63090516 49.54351937 92.38912868]
 [93.1450585  51.35655736 48.54879241 39.00530418 56.48278841]
 [87.25066421         nan 71.12964247         nan 10.99047463]
 [49.32802875  1.61028799 84.4586977  68.21444115         nan]
 [96.55816459 58.17984037 59.109651   95.7770222  73.43529396]
 [97.76471076 20.37448664 78.11025569 93.16947391 49.25709938]
 [        nan 17.8835761  77.68389486         nan 71.50349252]
 [76.10319779  3.71914605 80.42457535 85.66620517 86.02242063]
 [31.26099702 89.40146911 28.04052877  5.04641313 48.43598202]
 [79.7729821  58.48732057  2.53715977 38.06502777 46.91948558]
 [72.71628588         nan 40.89017446 15.01700883 77.97626847]
 [65.84789111         nan 93.95464165         nan 96.15805033]
 [75.68209219 12.11916983 19.56666449 68.36588343 23.21722512]
 [71.60559471 79.81557649 50.18731169 42.0694

### Ejercicio 4: Dado un dataset X separarlo en 70 / 20 / 10

In [9]:
def split(data,test=.1,val=.2):  
    total_len = raw_data.shape[0]
    test_len = int(test*total_len)
    val_len = int(val*total_len)
    train_len = int(total_len-(test_len+val_len))
    
    np.random.shuffle(data)        
    
    return np.array(data[0:test_len]), np.array(data[test_len:test_len+val_len]), np.array(data[test_len+val_len:total_len])                      

In [10]:
raw_data = np.random.uniform(low=0., high=100., size=(20,m))

train, val, test = split(raw_data)
print("Shape raw_data: "+str(raw_data.shape))
print("Shape train: "+str(train.shape))
print("Shape val: "+str(val.shape))
print("Shape test: "+str(test.shape))

Shape raw_data: (20, 5)
Shape train: (2, 5)
Shape val: (4, 5)
Shape test: (14, 5)


### Ejercicio 5: A partir del dataset de consigna, aplicar los conceptos de regresión lineal.