<a href="https://colab.research.google.com/github/CarlosMejia07/Quices-Lab/blob/main/QuicesLab3y4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quices Lab 3 y 4

Lab 3.1 Manually use a predictive model

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image, display
from sklearn.datasets import make_moons

from local.lib import mlutils
%matplotlib inline

In [None]:
# Load the small trilotropicos dataset from the course's local data folder.
d = pd.read_csv("local/data/trilotropicos_small.csv")
# Features are the first two columns of the frame; the label is its last column.
X,y = d.values[:,:2], d.values[:,-1]
# Sanity check: overall shapes plus the first few feature rows and labels.
print (d.shape, X.shape, y.shape)
print (X[:5])
print (y[:5])
d.head()

In [None]:
# One scatter layer per class: label 0 is drawn in blue, label 1 in red.
for cls, colour, tag in ((0, "blue", "X bug"), (1, "red", "Z bug")):
    members = X[y==cls]
    plt.scatter(members[:,0], members[:,1], color=colour, label=tag)
plt.xlabel("width")
plt.ylabel("length")
plt.legend()
plt.grid();

In [None]:
def predict(X, t):
    """Classify rows of X with a two-threshold rule.

    A row is labelled 0 when its first column is strictly below t[0] AND its
    second column is strictly above t[1]; every other row is labelled 1.

    Parameters
    ----------
    X : 2D array whose first two columns are the features.
    t : sequence holding the two thresholds (t[0], t[1]).

    Returns
    -------
    Integer array with one 0/1 label per row of X.
    """
    first_below = X[:, 0] < t[0]
    second_above = X[:, 1] > t[1]
    in_corner = np.logical_and(first_below, second_above)
    return np.logical_not(in_corner).astype(int)

In [None]:
# First hand-picked guess for the two thresholds of the rule.
t = np.r_[.5,.3]
y_hat = predict(X, t)
# Only the last expression of a cell is echoed, so intermediate results are
# printed explicitly instead of being silently discarded.
print(y_hat)
print(np.mean(y==y_hat))
# plt.show() flushes the figure after each boundary so that successive plots
# in this cell are rendered separately (assuming the helper draws on the
# current figure -- harmless if it creates its own).
mlutils.plot_2Ddata_with_boundary(lambda X: predict(X,t), X, y); plt.grid(); plt.show()

# Second guess: same first threshold, higher second threshold.
t = np.r_[.5,.8]
mlutils.plot_2Ddata_with_boundary(lambda X: predict(X,t), X, y); plt.grid(); plt.show()
print(np.mean(y==predict(X,t)))

# Boundaries learned by off-the-shelf sklearn classifiers on the same data.
from sklearn.linear_model import LogisticRegression
mlutils.plot_2Ddata_with_boundary(LogisticRegression().fit(X,y).predict, X, y); plt.grid(); plt.show()
from sklearn.tree import DecisionTreeClassifier
mlutils.plot_2Ddata_with_boundary(DecisionTreeClassifier(max_depth=5).fit(X,y).predict, X, y); plt.grid(); plt.show()
from sklearn.svm import SVC
mlutils.plot_2Ddata_with_boundary(SVC(gamma=50).fit(X,y).predict, X, y); plt.grid(); plt.show()

Lab 3.2 Fit the model

In [None]:
import itertools
def fit(X,y):
    """Grid-search the two thresholds of the rule-based classifier.

    Every pair (t0, t1) taken from 11 evenly spaced values in [0, 1] is tried,
    with t0 varying slowest. The first pair reaching the highest accuracy on
    (X, y) is returned.

    Parameters
    ----------
    X : 2D array whose first two columns are the features.
    y : array of 0/1 labels, one per row of X.

    Returns
    -------
    Array [t0, t1] with the best thresholds found, or None when no candidate
    scored above the initial accuracy of -1.
    """
    def predict(X, t):
        # Label 0 inside the corner (first column < t[0], second > t[1]), 1 elsewhere.
        return (~((X[:,0] < t[0]) & (X[:,1] > t[1]))).astype(int)

    thresholds = np.linspace(0, 1, 11)
    best_t, best_acc = None, -1
    for first in thresholds:
        for second in thresholds:
            candidate = np.array([first, second])
            score = np.mean(predict(X, candidate) == y)
            # Strict comparison: on ties the earliest candidate is kept.
            if score > best_acc:
                best_t, best_acc = candidate, score
    return best_t

In [None]:
# Thresholds found by the grid search on the trilotropicos data.
t = fit(X,y)
# plt.show() flushes the figure so the second boundary plot in this cell is
# rendered on its own (assuming the helper draws on the current figure).
mlutils.plot_2Ddata_with_boundary(lambda X: predict(X,t), X, y); plt.grid(); plt.show()
# Mid-cell expressions are not echoed, so the accuracy is printed explicitly.
print(np.mean(y==predict(X,t)))

from sklearn.datasets import make_blobs
from sklearn.preprocessing import MinMaxScaler

# Same procedure on two random blobs rescaled into [0.1, 0.9], so that the
# data lies inside the [0, 1] grid searched by fit.
bX, by = make_blobs(100,n_features=2, centers=2)
bX = MinMaxScaler(feature_range=(0.1,.9)).fit_transform(bX)
bt = fit(bX, by)
mlutils.plot_2Ddata_with_boundary(lambda X: predict(X,bt), bX, by); plt.grid(); plt.show()
np.mean(by==predict(bX,bt))

Lab 3.3 Make an sklearn compatible class with your model

In [None]:
def SimpleModel():
    """Build a fresh, unfitted rule-based classifier exposing fit/predict.

    The returned object stores the two thresholds in its attribute ``t``
    (None until ``fit`` has been called).
    """
    class _SimpleModel:

        def __init__(self):
            # Holds the two thresholds [t0, t1] once the model is fitted.
            self.t = None

        def _predict_with_t(self, X, t):
            """Apply the rule with explicit thresholds t instead of self.t."""
            inside_corner = (X[:,0] < t[0]) & (X[:,1] > t[1])
            return (~inside_corner).astype(int)

        def fit(self, X, y):
            """Grid-search both thresholds over 11 values in [0, 1]; returns self."""
            grid = np.linspace(0, 1, 11)
            top_score = -1

            for pair in itertools.product(grid, repeat=2):
                candidate = np.array(pair)
                score = np.mean(self._predict_with_t(X, candidate) == y)
                # Strict comparison: on ties the earliest candidate is kept.
                if score > top_score:
                    top_score, self.t = score, candidate

            return self

        def predict(self, X):
            """Label rows of X with the thresholds stored by fit."""
            return self._predict_with_t(X, self.t)

    return _SimpleModel()

In [None]:
# Fit the class-based model on the trilotropicos data.
m = SimpleModel()
m.fit(X,y)
# Mid-cell expressions are not echoed, so predictions and accuracy are
# printed explicitly.
print(m.predict(X))
# plt.show() flushes the figure so the second boundary plot in this cell is
# rendered on its own (assuming the helper draws on the current figure).
mlutils.plot_2Ddata_with_boundary(m.predict, X, y); plt.grid(); plt.show()
print(np.mean(y==m.predict(X)))

from sklearn.datasets import make_moons

# Same model on the two-moons dataset.
mX, my = make_moons(100, noise=.1)
m = SimpleModel()
m.fit(mX,my)

mlutils.plot_2Ddata_with_boundary(m.predict, mX, my); plt.grid(); plt.show()
np.mean(my==m.predict(mX))

Lab 3.21 Timeseries model - Build a time series training dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from local.lib import timeseries as ts
import pandas as pd
import os
from IPython.display import Image
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib inline

In [None]:
# Date separating the part plotted as "train" from the part plotted as "test".
date_split = "2018-03-01"

# One sample every 6 hours over the first quarter of 2018.
idx = pd.date_range("2018-01-01", "2018-03-31", freq="6h")
# Random start and end of the abscissa give a different curve on every run.
i = np.linspace(np.random.random()*5-5,np.random.random()*5+2,len(idx))
# Smooth oscillating base curve, plus Gaussian noise, rescaled around 30.
t = np.log(i**2+.3)*np.cos(4*i)
t += (np.random.normal(size=len(idx))*.4)
t = np.round(t*3+30,3)
d = pd.DataFrame(np.r_[[t]].T, columns=["signal"], index=idx)
d.index.name="date"

plt.figure(figsize=(15,3))
# Line width is given as a number, the type matplotlib documents for `lw`.
plt.plot(d[:date_split].index, d[:date_split].signal, color="black", lw=2, label="train");
plt.plot(d[date_split:].index, d[date_split:].signal, color="red", lw=2, label="test");
plt.axvline(date_split, color="grey"); plt.legend();plt.grid();
signal = d

In [None]:
def make_timeseries_dataset(signal, n_timesteps_lookback):
    """Turn a signal into a supervised look-back dataset.

    Parameters
    ----------
    signal : DataFrame with a "signal" column.
    n_timesteps_lookback : number of lagged copies of "signal" to add.

    Returns
    -------
    A new DataFrame with the original columns, then "signal-1" ...
    "signal-<n>" (the signal shifted 1..n steps into the past) and "signal+1"
    (the signal one step ahead). Rows containing any NaN are dropped. The
    input frame is not modified.
    """
    base = signal["signal"]
    extra = {f"signal-{lag}": base.shift(lag) for lag in range(1, n_timesteps_lookback + 1)}
    # The prediction target is added last so it ends up as the final column.
    extra["signal+1"] = base.shift(-1)
    return signal.assign(**extra).dropna()

In [None]:
# Preview the look-back dataset built from the synthetic signal with three lagged steps.
make_timeseries_dataset(d, n_timesteps_lookback=3).head(10)


Lab 3.22 Manually apply a regression model to create predictions

In [None]:
def apply_linear_regression_model(td, w):
    """Evaluate a linear model on a look-back dataset.

    The features are "signal" followed by every "signal-<k>" column ordered by
    increasing k, whatever the column order of td; other columns are ignored.

    Parameters
    ----------
    td : DataFrame with a "signal" column and "signal-<k>" lag columns.
    w : weights; w[0] is the intercept and w[1:] holds one coefficient per
        feature, in the feature order described above.

    Returns
    -------
    Array with one prediction per row of td.
    """
    lag_columns = [name for name in td.columns if name.startswith("signal-")]
    # Numeric sort, so that e.g. "signal-10" comes after "signal-2".
    lag_columns.sort(key=lambda name: int(name.split("-")[1]))
    features = td[["signal"] + lag_columns].values
    r = w[0] + features @ w[1:]
    return r

In [None]:
# Random look-back depth (2 to 4) and shuffled column order, to check that the
# model picks its features by name rather than by position.
td = make_timeseries_dataset(d, n_timesteps_lookback=np.random.randint(3)+2)
td = td[np.random.permutation(td.columns)]
# Mid-cell expressions are not echoed, so they are displayed explicitly.
display(td.head())
# One random weight per column of td.
w = np.random.random(len(td.columns))
print(w)
apply_linear_regression_model(td[np.random.permutation(td.columns)], w)[:5]

Lab 3.23 Measure trend prediction

In [None]:
def measure_trend_accuracy(td, preds):
    """Fraction of rows where the predicted trend matches the actual trend.

    A row counts as a hit when the prediction and the real next value
    ("signal+1") are both strictly above the current "signal", or both at or
    below it.

    Parameters
    ----------
    td : DataFrame with "signal" and "signal+1" columns.
    preds : predicted next values, one per row of td.

    Returns
    -------
    Mean of the per-row hits, a value between 0 and 1.
    """
    current = td["signal"]
    upcoming = td["signal+1"]
    both_rise = (upcoming > current) & (preds > current)
    both_stay_or_fall = (upcoming <= current) & (preds <= current)
    r = (both_rise | both_stay_or_fall).mean()
    return r

In [None]:
# First ten rows of a look-back dataset with a random depth (2 to 4).
td = make_timeseries_dataset(d, n_timesteps_lookback=np.random.randint(3)+2).iloc[:10]
# Mid-cell expressions are not echoed, so they are displayed explicitly.
display(td)
# Toy predictions: the current signal shifted by one random offset in [-2, 2).
preds = td['signal'] + np.round(np.random.random()*4-2,3)
display(preds)
measure_trend_accuracy(td, preds)

Lab 4.1 Cleaning Data - FillNA in risk with corresponding city average

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
import numpy as np
import seaborn as sns

In [None]:
# Synthetic customer table: n rows with a random city, age, number of children
# and a risk score.
n = 20
place = np.r_[["Medellin", "Bogota", "Madrid"]][(np.random.randint(3, size=n))]
age = np.random.randint(50, size=n)+10
# Ages under 30 get 0-1 children, the others 0-3.
children = np.r_[[(np.random.randint(2) if i<30 else (np.random.randint(4))) for i in age]]
# Risk is drawn from [0, 0.2) for Medellin and from [0, 0.8) elsewhere.
risk = np.r_[[np.random.random()*(.2 if i=="Medellin" else .8) for i in place]].round(3)
# Blank out five randomly chosen risk values to simulate missing data.
risk[np.random.permutation(len(risk))[:5]]=np.nan
# Build the frame column by column so each column keeps its own dtype; a
# transposed list of arrays would turn every column into object dtype.
d01 = pd.DataFrame({"age": age, "risk": risk, "children": children, "place": place})
d01.to_csv("risk.csv", index=False)
d01
# Optional per-city view of the risk values, kept disabled:
#k = d01[d01.place=="Bogota"]["risk"].dropna()
#plt.scatter(k, [0]*len(k), label="Bogota")
#k = d01[d01.place=="Medellin"]["risk"].dropna()
#plt.scatter(k, [1]*len(k), label="Medellin")
#k = d01[d01.place=="Madrid"]["risk"].dropna()
#plt.scatter(k, [2]*len(k), label="Madrid")
#plt.grid();
#plt.xlabel("risk level")
#plt.ylabel("city")
#plt.legend()


In [None]:
# Assemble the frame, then replace every missing risk with the mean risk of
# the row's own city (transform("mean") returns one group mean per row).
r01 = (
    pd.DataFrame(dict(place=place, age=age, children=children, risk=risk))
    .assign(risk=lambda frame: frame["risk"].fillna(
        frame.groupby("place")["risk"].transform("mean")))
)

r01.head()

Lab 4.2 Standardize age so that min=0, max=1

$$s_i = \frac{x_i - \min}{\max - \min}$$

In [None]:
# Min-max scaling of age on a copy of r01: the smallest age maps to 0 and the
# largest to 1.
r02 = r01.assign(
    age=lambda frame: (frame["age"] - frame["age"].min())
    / (frame["age"].max() - frame["age"].min())
)

Lab 4.3 Standardize age so that $\mu=0$ and $\sigma=1$

$$s_i = \frac{x_i - \mu}{\sigma}$$

In [None]:
# Z-score of age on a copy of r01: subtract the mean, divide by the standard
# deviation. pandas' Series.std() uses the sample estimate (ddof=1) by default.
r03 = r01.assign(
    age=lambda frame: frame["age"].sub(frame["age"].mean()).div(frame["age"].std())
)

Lab 4.4 Create a one-hot encoding for place

In [None]:
# One-hot encode `place` on top of the z-scored frame: get_dummies replaces
# the column with one indicator column per distinct city.
r04 = pd.get_dummies(r03, columns=["place"])

Lab 4.21 Building Datasets

In [5]:
from IPython.display import Image
# from local.lib import labutils # Removing this as local files are not accessible
import numpy as np
# bid, date = labutils.biddate_for_student(student.user_id) # Removing this as labutils is not available
# print ("your building_id", bid)
# print ("your date       ", date)

In [3]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '.'
!chmod 600 ./kaggle.json
!kaggle competitions download -c ashrae-energy-prediction
!unzip ashrae-energy-prediction.zip > /dev/null
!wc *.csv

ashrae-energy-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)
      1450       2177      45527 building_metadata.csv
  41697601   41697601  447562511 sample_submission.csv
  41697601   83395201 1462461085 test.csv
  20216101   40432201  678616640 train.csv
    277244     554487   14787908 weather_test.csv
    139774     279547    7450075 weather_train.csv
 104029771  166361214 2610923746 total


In [6]:
import pandas as pd

# --- 1. Load the files ---
train = pd.read_csv("train.csv", parse_dates=["timestamp"])
buildings = pd.read_csv("building_metadata.csv")
weather = pd.read_csv("weather_train.csv", parse_dates=["timestamp"])

# --- 2. Select the building and the day ---
b_id = 921
date = "2016-02-14"

# Keep only meter 0 of that building on that calendar day. normalize()
# truncates every timestamp to midnight, so the day comparison stays
# vectorised instead of creating one Python date object per row of train.
df = train[
    (train["building_id"] == b_id) &
    (train["meter"] == 0) &
    (train["timestamp"].dt.normalize() == pd.Timestamp(date).normalize())
].copy()

# --- 3. Add metadata and weather ---
# Building information
df = df.merge(buildings, on="building_id", how="left")

# Weather (matched on site_id and timestamp)
df = df.merge(weather, on=["site_id", "timestamp"], how="left")

# --- 4. Keep only the requested columns ---
cols = [
    "meter_reading", "site_id", "air_temperature", "cloud_coverage",
    "dew_temperature", "precip_depth_1_hr", "sea_level_pressure",
    "wind_direction", "wind_speed", "square_feet", "year_built"
]
df = df[cols]

# --- 5. Fill missing values with 0 ---
df = df.fillna(0)

# --- 6. Compute the sums (truncated to integers) ---
target_sum = int(df["meter_reading"].sum())
features_sum = int(df.drop(columns=["meter_reading"]).sum().sum())

print(f"Target sum (meter_reading): {target_sum}")
print(f"Features sum: {features_sum}")
print(f"Total rows: {len(df)}")

Target sum (meter_reading): 10087
Features sum: 6319125
Total rows: 24


Lab 4.22 Time series missing data fix

In [None]:
# Ask the course helper for the date assigned to the student (presumably a
# per-student value; the helper's semantics are not visible here).
# NOTE(review): `student` is not defined in any cell visible in this notebook,
# and the following code cell assigns `date` a literal value -- confirm
# whether this cell is still meant to run.
from local.lib import labutils
_, date = labutils.biddate_for_student(student.user_id)
print ("your date       ", date)

In [7]:
import pandas as pd

# Load the weather data
weather = pd.read_csv("weather_train.csv", parse_dates=["timestamp"])

# Select one site and one calendar day
site_id = 3
date = "2016-02-14"

# normalize() truncates every timestamp to midnight, keeping the day
# comparison vectorised.
df = weather[
    (weather["site_id"] == site_id) &
    (weather["timestamp"].dt.normalize() == pd.Timestamp(date).normalize())
].copy()

# Sort by time so the forward fill follows the temporal order
df = df.sort_values("timestamp")

# Column of interest
ts = df["cloud_coverage"]

# Fill NaN values by repeating the last known value
fixed_ts = ts.ffill().tolist()

# A forward fill cannot fix NaN values at the very start (nothing precedes
# them): those take the first non-null value, or 0 if the whole series is
# missing. The emptiness guard avoids an IndexError when no row matches the
# selected site and date.
if fixed_ts and pd.isna(fixed_ts[0]):
    first_valid = next((x for x in fixed_ts if pd.notna(x)), 0)
    fixed_ts = [first_valid if pd.isna(x) else x for x in fixed_ts]

print(fixed_ts)
print(f"Longitud: {len(fixed_ts)}")

[0.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 2.0, 2.0, 0.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 4.0, 6.0, 6.0, 6.0]
Longitud: 24


Lab 4.23 Build a time series predictive dataset

In [8]:
import numpy as np
ts = np.array(fixed_ts)
n = 3  # number of previous values used to predict the next one

# Each window start s yields one sample: the n consecutive values ts[s:s+n]
# are the input and the value right after them, ts[s+n], is the expected output.
starts = range(len(ts) - n)
X = np.array([ts[s:s + n] for s in starts])
y = np.array([ts[s + n] for s in starts])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("\nEjemplo X[0]:", X[0])
print("Ejemplo y[0]:", y[0])

X shape: (21, 3)
y shape: (21,)

Ejemplo X[0]: [0. 2. 2.]
Ejemplo y[0]: 2.0
