# Regression
- Code sources:
  1. [Linear regression](https://www.kaggle.com/code/abdulbasitniazi/linear-regression-pytorch-for-beginner-s/notebook)
  2. [Multiple regression](https://www.kaggle.com/code/nadzmiagthomas/pytorch-regression)

- [knowledge](https://www.geeksforgeeks.org/machine-learning/regression-in-machine-learning/)

## pre-process

In [None]:
!pip install kaggle

In [None]:
#from google.colab import files
#files.upload()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
import numpy as np
#data vis:
import pandas as pd
import seaborn as sns #used by the histogram and loss-curve cells
import missingno as msno #visualize missing vals
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib

import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f #activation func
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset, SubsetRandomSampler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

## Linear regression

### Data processing

In [None]:
!kaggle datasets download -d mirichoi0218/insurance

In [None]:
!unzip random-linear-regression.zip -d random-linear-regression/

In [None]:
import os
for dirname, _, filenames in os.walk('/content/insurance_data'):
  for filename in filenames:
    print(os.path.join(dirname, filename))

In [None]:
# Read the insurance table from disk
df = pd.read_csv('/content/insurance_data/insurance.csv')

# 80/20 train/test split (fixed seed), then discard rows with missing values
train, test = (
  part.dropna()
  for part in train_test_split(df, test_size=0.2, random_state=42)
)

# Features = every column but the last; target = last column kept as an (n, 1) array
x_train = train.iloc[:, :-1].to_numpy()
y_train = train.iloc[:, [-1]].to_numpy()

x_test = test.iloc[:, :-1].to_numpy()
y_test = test.iloc[:, [-1]].to_numpy()

In [None]:
train.shape, test.shape

In [None]:
train.describe().T, test.describe().T

In [None]:
# Keep only the numeric columns: every numeric column except the last one is a
# feature, and the last numeric column is the regression target.
train_numeric = train.select_dtypes(include=np.number)
test_numeric = test.select_dtypes(include=np.number)

x_train_numeric = train_numeric.iloc[:, :-1].values
y_train_numeric = train_numeric.iloc[:, -1].values.reshape(-1, 1)  # column vector

x_test_numeric = test_numeric.iloc[:, :-1].values
y_test_numeric = test_numeric.iloc[:, -1].values.reshape(-1, 1)

# float32 tensors, matching the default dtype of nn.Linear parameters
x_train_tensor = torch.from_numpy(x_train_numeric).float()
y_train_tensor = torch.from_numpy(y_train_numeric).float()

x_test_tensor = torch.from_numpy(x_test_numeric).float()
y_test_tensor = torch.from_numpy(y_test_numeric).float()

dataset_train = TensorDataset(x_train_tensor, y_train_tensor)
dataset_test = TensorDataset(x_test_tensor, y_test_tensor)

# Reshuffle the training batches every epoch so mini-batch SGD does not see the
# rows in the same fixed order each time; evaluation order does not matter.
train_loader = DataLoader(dataset=dataset_train, batch_size=50, shuffle=True)
test_loader = DataLoader(dataset=dataset_test, batch_size=60)

### model

In [None]:
#generic function
def make_train_step(model, loss_fn, optimizer):
  """Build a function that performs one optimisation step on a mini-batch.

  Args:
    model: module called as ``model(x)`` to produce predictions.
    loss_fn: callable invoked as ``loss_fn(prediction, target)``.
    optimizer: optimizer holding the parameters of ``model``.

  Returns:
    ``train_step(x, y)``, which updates the model in place and returns the
    batch loss as a Python float.
  """
  def train_step(x, y):
    model.train()
    # Clear gradients first so anything left over from an earlier backward
    # pass cannot leak into this update.
    optimizer.zero_grad()
    yhat = model(x)
    # PyTorch losses take (input, target); the order matters for any loss
    # that is not symmetric in its arguments.
    loss = loss_fn(yhat, y)
    loss.backward()
    optimizer.step()
    return loss.item()
  return train_step

In [None]:
from sklearn.metrics import r2_score

# Hyper-parameters for the linear model
device = 'cpu'
lr = 1e-6
epochs = 1000

# Size the input layer from the feature tensor instead of hard-coding it, so the
# model keeps matching the data if the set of numeric columns changes.
n_features = x_train_tensor.shape[1]
model = nn.Sequential(nn.Linear(n_features, 1)).to(device)
loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training loop: one pass over train_loader per epoch, followed by an evaluation
# pass over test_loader. Per-epoch metrics are collected in the three lists below.
train_step = make_train_step(model, loss_fn, optimizer)
trainning_losses = []
test_losses = []
accuracies = []  # per-epoch R2 on the test set

for epoch in range(epochs):
  # --- training pass
  batch_losses = []
  for x_batch, y_batch in train_loader:
    batch_losses.append(train_step(x_batch.to(device), y_batch.to(device)))

  trainning_loss = np.mean(batch_losses)
  trainning_losses.append(trainning_loss)

  # --- evaluation pass
  model.eval()  # train_step switches back to train mode on the next epoch
  test_batch_losses = []
  test_batch_sizes = []
  test_targets = []
  test_predictions = []
  with torch.no_grad():  # no gradients are needed for evaluation
    for x_test_batch, y_test_batch in test_loader:
      x_test_batch = x_test_batch.to(device)
      y_test_batch = y_test_batch.to(device)
      yhat = model(x_test_batch)
      test_batch_losses.append(loss_fn(yhat, y_test_batch).item())
      test_batch_sizes.append(len(y_test_batch))
      test_targets.append(y_test_batch.cpu().numpy())
      test_predictions.append(yhat.cpu().numpy())

  # Weight each batch by its size: the last batch may be smaller, so a plain
  # mean of batch means would not equal the loss over the whole test set.
  test_loss = np.average(test_batch_losses, weights=test_batch_sizes)
  test_losses.append(test_loss)

  # R2 is not additive across batches, so score all test predictions at once.
  test_r2 = r2_score(np.concatenate(test_targets), np.concatenate(test_predictions))
  accuracies.append(test_r2)

  if epoch % 50 == 0:
    print(f'epochs: {epoch} || trainning loss: {trainning_loss:.4f}, test loss: {test_loss:.4f} || R2: {test_r2:.4f}')

In [None]:
# plotting res:
print(model.state_dict())
plt.style.use('ggplot')
plt.figure(figsize=(8,6))

# Predict from the numeric feature tensor the model was trained on. The raw
# x_test array also holds the non-numeric columns, so it has the wrong width
# and cannot be converted to a tensor.
model.eval()
with torch.no_grad():
  predicted = model(x_test_tensor.to(device))[:, 0].cpu().numpy()

age = x_test_numeric[:, 0]  # first numeric feature, shown on the x-axis
order = np.argsort(age)     # draw the line left to right instead of in row order

plt.scatter(age, y_test_numeric[:, 0], s=10)
plt.plot(age[order], predicted[order], 'r', linewidth=0.4)
plt.xlabel('Age')
plt.ylabel('Charges')
plt.title('Age vs Charges with Linear Regression')
plt.show()

## Multiple regression

### Data processing

In [None]:
import kagglehub
kagglehub.login()

In [None]:
!unzip house-prices-advanced-regression-techniques.zip -d house_prices_data/

In [None]:
import os
for dirname, _, filenames in os.walk('/content/sample_data/'):
  for filename in filenames:
    print(os.path.join(dirname, filename))

In [None]:
df_train_raw = pd.read_csv('/content/sample_data/california_housing_test.csv')
df_train_raw.head() #first 5 rows*

In [None]:
df_train_raw.shape

In [None]:
sns.histplot(df_train_raw['median_house_value'], kde=True)

- The label is heavily skewed, so a log transform is applied to it later to compress its range.

In [None]:
msno.matrix(df_train_raw) #check missing vals

In [None]:
#preprocessing func
def preprocess(df, target_col='SalePrice', id_col='id'):
  """Return a model-ready copy of `df`: scaled numerics plus one-hot categoricals.

  The input frame is not modified. The id column and the target column are
  left out of the result when they are present.

  Args:
    df: raw data frame.
    target_col: label column to exclude from the features.
    id_col: identifier column to drop.

  Returns:
    DataFrame of float columns: z-scored numeric columns (missing values and
    constant columns become 0) followed by one-hot columns for every
    object/category column, including one indicator column for missing values.
  """
  def _standardize(col):
    std = col.std()
    if not std > 0:  # constant or single-row column: std is 0 or NaN
      return col * 0.0
    return (col - col.mean()) / std

  df = df.copy() # never touch the caller's frame
  df = df.drop(columns=[id_col], errors='ignore')

  numeric_cols = df.select_dtypes(include=np.number).columns.drop(target_col, errors='ignore')
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns

  # After z-scoring, filling NaN with 0 is the same as imputing the column mean.
  df_numerical = df[numeric_cols].apply(_standardize).fillna(0)

  if len(categorical_cols) == 0:
    # Nothing to encode; skip get_dummies on an empty column selection.
    return df_numerical

  # dtype=float keeps the whole frame numeric so `.values` can go straight to a tensor.
  df_categorical = pd.get_dummies(df[categorical_cols], dummy_na=True, dtype=float)
  return pd.concat([df_numerical, df_categorical], axis=1)

In [None]:
import os
os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1"
from torch.cuda.amp import autocast, GradScaler



In [None]:
# Log-transform the skewed target. np.log(..., where=...) without `out=` leaves the
# masked entries uninitialised, so instead keep only the rows with a strictly
# positive price, for which the logarithm is defined.
has_positive_price = df_train_raw["median_house_value"] > 0
df_train_y = np.log(df_train_raw.loc[has_positive_price, "median_house_value"])

# Basic preprocessing for this dataset: keep the numeric columns of the same rows
# and drop the target, so features and labels stay aligned.
df_train_x = (
  df_train_raw.loc[has_positive_price]
  .select_dtypes(include=np.number)
  .drop(columns=['median_house_value'])
)

In [None]:
df_train_x.head()

In [None]:
sns.histplot(df_train_y, kde=True)

In [None]:
df_train_x, df_val_x, df_train_y, df_val_y = train_test_split(df_train_x, df_train_y, train_size=0.8, random_state=42)

### model building

In [None]:
class Model(nn.Module):
  """Fully connected regression network with Softplus activations.

  Args:
    D_in: number of input features.
    H: width of every hidden layer.
    D_out: number of outputs.
    Hn: number of hidden layers (at least 1).

  Raises:
    ValueError: if `Hn` is smaller than 1.
  """
  def __init__(self, D_in=331, H=222, D_out=1, Hn=4):
    super().__init__()
    if Hn < 1:
      raise ValueError(f"Hn must be >= 1, got {Hn}")
    self.Hn = Hn
    self.activation = nn.Softplus()

    # ModuleList (not a plain list) so every layer's parameters are registered.
    self.layers = nn.ModuleList([nn.Linear(D_in, H), self.activation])  # first hidden layer
    for _ in range(Hn - 1):                                             # remaining hidden layers
      self.layers.extend([nn.Linear(H, H), self.activation])
    self.layers.append(nn.Linear(H, D_out))                             # linear output, no activation

  def forward(self, x):
    """Apply the layers in order and return the output of the last one."""
    for layer in self.layers:
      x = layer(x)
    return x


In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch RNGs and request deterministic kernels.

  Args:
    seed: integer seed applied to every generator.
  """
  import random

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)  # covers every GPU
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False  # auto-tuning may pick different kernels per run
  # This is a function and has to be called; assigning True to it would only
  # overwrite the attribute. warn_only avoids hard failures on ops that have no
  # deterministic implementation.
  torch.use_deterministic_algorithms(True, warn_only=True)

set_seed(42)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Standardise the features with statistics from the training split only (no
# leakage from validation). The raw columns differ by orders of magnitude, which
# makes plain SGD unstable.
feature_mean = df_train_x.mean()
feature_std = df_train_x.std().replace(0, 1)  # constant columns: avoid dividing by zero

#data for validation
y_val = torch.tensor(df_val_y.values).float().to(device).unsqueeze(1)  # (n, 1) to match the model output
x_val = torch.tensor(((df_val_x - feature_mean) / feature_std).fillna(0).values).float().to(device)

#data for batch train
y_train = torch.tensor(df_train_y.values).float().to(device).unsqueeze(1)
x_train = torch.tensor(((df_train_x - feature_mean) / feature_std).fillna(0).values).float().to(device)

dataset = TensorDataset(x_train, y_train) #pair x and y so the DataLoader can iterate them together
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# The input width must match the feature matrix; the class default of 331 does not.
model = Model(D_in=x_train.shape[1]).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=.002, momentum=0.9, weight_decay=0.001)
criterion = nn.MSELoss()

In [None]:
# Per-epoch history: mean training batch loss and loss on the validation set.
train_losses = []
val_losses = []
epochs = 250

for epoch in range(epochs):
  # --- training pass
  model.train()
  running_loss = 0.0
  n_batches = 0

  for x, y in dataloader:
    y_pred = model(x)
    loss = criterion(y_pred, y)

    optimizer.zero_grad() #clear old gradients
    loss.backward() #backprop => gradients
    optimizer.step() #adjust params

    running_loss += loss.item()
    n_batches += 1

  # one averaged training loss per epoch (guarded against an empty loader)
  train_losses.append(running_loss / max(n_batches, 1))

  # --- validation pass
  model.eval()
  with torch.no_grad():
    # Store a plain float: keeping the tensor would hold on to its autograd
    # graph for every epoch and cannot be plotted directly.
    val_loss = criterion(model(x_val), y_val).item()
  val_losses.append(val_loss)


In [None]:
#model evaluation
def plot_loss(losses, axes=None, epoch_start=0):
  """Draw the loss curve from epoch `epoch_start` onward with seaborn.

  Args:
    losses: sequence with one loss value per epoch (floats or 0-d tensors).
    axes: matplotlib axes to draw on; None is passed through to seaborn as-is.
    epoch_start: number of leading epochs to leave out of the plot.
  """
  sns.set_theme(style="darkgrid")
  # float() also accepts single-element tensors, so either kind of history plots.
  shown = [float(value) for value in losses[epoch_start:]]
  epoch_numbers = list(range(1 + epoch_start, len(losses) + 1))  # 1-based labels
  # seaborn has no `plot` function; `lineplot` draws the curve.
  sns.lineplot(ax=axes, x=epoch_numbers, y=shown)

def plot_epoch_loss(train_loss, test_loss, epoch1=0, epoch2=10, epoch3=50, epoch4=150):
  """Plot both loss histories on a 2x2 grid, each panel starting at a later epoch.

  Args:
    train_loss: per-epoch training losses.
    test_loss: per-epoch validation/test losses.
    epoch1, epoch2, epoch3, epoch4: first epoch shown in the top-left,
      top-right, bottom-left and bottom-right panel respectively.
  """
  fig, axes = plt.subplots(2,2,figsize=(12,6), constrained_layout=True)
  # axes.flat walks the grid row by row, matching the epoch1..epoch4 order.
  for panel, start in zip(axes.flat, (epoch1, epoch2, epoch3, epoch4)):
    panel.set_title('epch start at'+ str(start))
    # Plot the histories passed in, not module-level globals.
    plot_loss(train_loss, panel, start)
    plot_loss(test_loss, panel, start)

In [None]:
plot_epoch_loss(train_losses, val_losses)

In [None]:
#calc the errors
def numpy_error(model, x, y, exp=True):
  """Return the signed prediction errors of `model` on `x` as a flat NumPy array.

  Args:
    model: callable mapping a feature tensor to a prediction tensor.
    x: feature tensor.
    y: target tensor, broadcast-compatible with the model output.
    exp: if True, exponentiate both targets and predictions before taking the
      difference.

  Returns:
    1-D numpy array holding ``prediction - target``.
  """
  # Evaluation only: skip autograd bookkeeping rather than building a graph
  # that is thrown away immediately.
  with torch.no_grad():
    y_pred = model(x)
    if exp:
      y = torch.exp(y)
      y_pred = torch.exp(y_pred)
    residuals = (y_pred - y).flatten()

  return residuals.detach().cpu().numpy()

#calc MAE mean abs error
def mae(errors):
  """Return the mean absolute error of an array of residuals."""
  return np.abs(errors).mean()

# Backward-compatible alias for the original, misspelled name.
mea = mae

#calc RMSE
def rmse(errors):
  """Return the root mean squared error of an array-like of residuals."""
  # `.mean` must be called; without the parentheses np.sqrt receives the bound
  # method instead of a number.
  mse = (np.asarray(errors, dtype=float) ** 2).mean()
  return np.sqrt(mse)

In [None]:
# Report MAE and RMSE for the training split, then for the validation split.
# exp=False: the errors are taken without exponentiating targets or predictions.
for split_name, features, targets in (('train', x_train, y_train), ('test', x_val, y_val)):
  errors = numpy_error(model, features, targets, False)
  print(f'error on {split_name}: MAE: {mae(errors)} \nRMSE: {rmse(errors)}')
  if split_name == 'train':
    print()