## KNN SCRATCH

In [None]:
def minkowski_distance(point1, point2, p):
    """Return the Minkowski distance of order ``p`` between two points.

    The distance is ``(sum_i |a_i - b_i| ** p) ** (1 / p)``; ``p=1`` gives the
    Manhattan distance and ``p=2`` the Euclidean distance.

    Args:
        point1: Array-like of numeric coordinates (lists and tuples are
            accepted as well as NumPy arrays).
        point2: Array-like that broadcasts against ``point1``.
        p: Order of the norm; must be non-zero.

    Returns:
        A scalar for 1-D inputs. For higher-dimensional inputs the sum runs
        over the last axis, so a ``(n, d)`` matrix against a ``(d,)`` sample
        yields ``n`` distances.
    """
    # Convert up front so plain Python sequences support element-wise maths.
    diff = np.abs(np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float))
    return np.power(np.sum(diff ** p, axis=-1), 1 / p)

In [None]:
def predict_price(X_train, y_train, sample, k=5, p=2):
    """Predict a numeric target for ``sample`` with k-nearest-neighbour regression.

    Args:
        X_train: Training features, shape ``(n_samples, n_features)``; a
            DataFrame or any 2-D array-like.
        y_train: Training targets aligned row-by-row with ``X_train``.
        sample: Feature vector of length ``n_features`` to predict for.
        k: Number of nearest neighbours to average.
        p: Order of the Minkowski distance (``1`` Manhattan, ``2`` Euclidean).
            It needs a default because it follows the defaulted ``k``;
            without one the ``def`` line is a SyntaxError.

    Returns:
        The mean target value of the ``k`` closest training rows.
    """
    features = np.asarray(X_train, dtype=float)
    # Plain array => positional indexing. Indexing a pandas Series with the
    # argsort result would look rows up by *label*, which is wrong as soon as
    # the index is not 0..n-1 (e.g. after train_test_split).
    targets = np.asarray(y_train)
    query = np.asarray(sample, dtype=float)

    # Same formula as minkowski_distance, applied to every training row at once.
    distances = np.sum(np.abs(features - query) ** p, axis=1) ** (1 / p)

    # A stable sort makes tie-breaking deterministic (earlier rows win).
    nearest = np.argsort(distances, kind="stable")[:k]
    return np.mean(targets[nearest])

In [None]:
def predict_class(X_train, y_train, sample, k=5, p=2):
    """Predict a class label for ``sample`` by k-nearest-neighbour majority vote.

    Args:
        X_train: Training features, shape ``(n_samples, n_features)``; a
            DataFrame or any 2-D array-like.
        y_train: Training labels aligned row-by-row with ``X_train``. They must
            be non-negative integers because the vote uses ``np.bincount``.
        sample: Feature vector of length ``n_features`` to classify.
        k: Number of nearest neighbours that vote.
        p: Order of the Minkowski distance (``1`` Manhattan, ``2`` Euclidean).
            It needs a default because it follows the defaulted ``k``;
            without one the ``def`` line is a SyntaxError.

    Returns:
        The most frequent label among the ``k`` closest training rows; on a
        tied vote the smallest label wins (``np.argmax`` returns the first
        maximum).
    """
    features = np.asarray(X_train, dtype=float)
    # Plain array => positional indexing even when y_train is a pandas Series
    # whose index is not 0..n-1.
    labels = np.asarray(y_train)
    query = np.asarray(sample, dtype=float)

    # Same formula as minkowski_distance, applied to every training row at once.
    distances = np.sum(np.abs(features - query) ** p, axis=1) ** (1 / p)

    nearest = np.argsort(distances, kind="stable")[:k]
    class_counts = np.bincount(labels[nearest])
    return np.argmax(class_counts)

In [None]:
def calculate_mse_for_k(X_train, y_train, X_test, y_test, k, p):
    """Return the test-set mean squared error of the k-NN price predictor.

    Every row of ``X_test`` (which must provide ``.to_numpy()``, e.g. a
    DataFrame) is predicted with ``predict_price`` using ``k`` neighbours and
    Minkowski order ``p``; the predictions are then scored against ``y_test``
    with ``mean_squared_error``.
    """
    # NOTE(review): mean_squared_error is not imported in this part of the
    # file - presumably sklearn.metrics.mean_squared_error; confirm.
    predictions = [
        predict_price(X_train, y_train, row, k, p)
        for row in X_test.to_numpy()
    ]
    return mean_squared_error(y_test, predictions)

## DT SCRATCH 

In [None]:
def get_best_split(data):
    """Find the best threshold for a single-feature dataset.

    Args:
        data: 2-D array whose column 0 is the feature and column 1 the target.

    Returns:
        ``(results, best_point)`` where ``results`` maps each candidate
        threshold from ``splits`` to the value returned by ``MAE`` for that
        split, and ``best_point`` is the threshold whose entry is smallest.
    """
    feature, target = data[:, 0], data[:, 1]
    results = {}
    for candidate in splits(feature):
        # Only the target halves are needed to score a split.
        _, target_above, _, target_below = split_data(feature, target, candidate)
        results[candidate] = MAE(target, target_above, target_below)
    best_point = min(results, key=results.get)
    return results, best_point

In [None]:
def splits(x):
    """Return the midpoints between consecutive values of ``x`` in sorted order.

    For ``n`` input values the result has ``n - 1`` entries (an empty list when
    ``x`` has fewer than two values). Duplicate inputs are kept, so repeated
    values produce a midpoint equal to that value.
    """
    ordered = sorted(x)
    # Pair every value with its successor and take the average of each pair.
    return [(low + high) / 2 for low, high in zip(ordered, ordered[1:])]

In [None]:
def split_data(x, y, split_pt):
    """Partition ``x`` and ``y`` around a threshold on ``x``.

    Args:
        x: Feature values; must support element-wise comparison and boolean
            mask indexing (e.g. a NumPy array).
        y: Target values aligned with ``x``.
        split_pt: Threshold to split on.

    Returns:
        ``(x_true, y_true, x_false, y_false)`` where the "true" side holds the
        rows with ``x > split_pt`` and the "false" side holds all other rows.
    """
    mask = x > split_pt
    # Use the exact complement so rows with x == split_pt land on the "false"
    # side. A separate `x < split_pt` test dropped them from both sides.
    anti_mask = ~mask
    return x[mask], y[mask], x[anti_mask], y[anti_mask]

In [None]:
# Example call: partition column 0 (feature) and column 1 (target) of
# int_twoDlist at the threshold 70.
# NOTE(review): int_twoDlist is not defined in this part of the file -
# presumably a 2-D NumPy array created in another cell; confirm.
x_true, y_true, x_false , y_false =split_data(int_twoDlist[:,0],int_twoDlist[:,1], 70)

In [None]:
def MAE(y, y_true, y_false):
    """Compare the mean absolute error before and after a split.

    Each group is predicted by its own mean, and the error of a group is the
    mean absolute deviation from that mean.

    Args:
        y: All target values of the node being split.
        y_true: Targets that fall on the "true" side of the split.
        y_false: Targets that fall on the "false" side of the split.

    Returns:
        ``(old_mae, new_mae)``: the error of the unsplit node, and the
        size-weighted sum of the errors of the two sides.
    """
    n_total = len(y)

    def _weighted_error(part):
        # An empty side contributes nothing. Without this guard np.mean([])
        # yields nan, which turns new_mae into nan and breaks any later min().
        if len(part) == 0:
            return 0.0
        return len(part) / n_total * np.mean(np.absolute(part - np.mean(part)))

    old_mae = np.mean(np.absolute(y - np.mean(y)))
    new_mae = _weighted_error(y_true) + _weighted_error(y_false)
    return old_mae, new_mae

In [3]:
#WITH FEATURES
def get_best_split_features(data):
    """Find the best (feature, threshold) split across all feature columns.

    Args:
        data: 2-D array whose last column is the target and whose other
            columns are features.

    Returns:
        ``(results, best_point)`` where ``results`` maps every
        ``(feature_idx, threshold)`` pair to the value returned by ``MAE`` for
        that split, and ``best_point`` is the pair whose entry is smallest.
    """
    results = {}
    for feature_idx in range(data.shape[1] - 1):
        column = data[:, feature_idx]
        target = data[:, -1]
        for candidate in splits(column):
            # Only the target halves are needed to score a split.
            _, target_above, _, target_below = split_data(column, target, candidate)
            results[feature_idx, candidate] = MAE(target, target_above, target_below)
    best_point = min(results, key=results.get)
    return results, best_point

In [None]:
#split per column 
def splits_per_col(data):
    """Return the candidate split points of every column of ``data``.

    The result maps ``'col_<index>'`` to the list produced by ``splits`` for
    that column, in column order. Every column is included, whatever it holds.
    """
    return {
        f"col_{col_idx}": splits(data[:, col_idx])
        for col_idx in range(data.shape[1])
    }

## RF SCRATCH 

In [None]:
def simple_rf(data):
    """Fit one random-forest-style tree and predict its out-of-bag rows.

    A bootstrap sample and its out-of-bag (OOB) rows are drawn with
    ``bs_sampling``; ``int(sqrt(n_features))`` feature columns are chosen
    without replacement; a tree is fitted on those columns of the bootstrap
    sample (the last column of ``data`` is the target) and used to predict the
    OOB rows.

    Returns:
        The tree's predictions for the OOB rows.
    """
    # NOTE(review): `rand` is not defined in this part of the file -
    # presumably numpy.random or a NumPy Generator; confirm.
    bs, oob = bs_sampling(data)
    print(f"oob:", oob)

    n_features = data.shape[1] - 1
    n_selected = int(np.sqrt(n_features))
    selected_features = rand.choice(n_features, n_selected, replace=False)

    # Training columns = the chosen features plus the target (index -1).
    train_columns = np.append(selected_features, -1)
    tree = create_tree(bs[:, train_columns])
    return tree.predict(oob[:, selected_features])

In [None]:
def bs_sampling(data):
    """Draw a bootstrap sample of the rows of ``data`` and collect the OOB rows.

    Args:
        data: 2-D NumPy array with one observation per row.

    Returns:
        ``(bs, oob)``: ``bs`` has the same shape as ``data`` and holds rows
        drawn with replacement; ``oob`` holds the rows that were never drawn.
        ``oob`` is always 2-D, with zero rows when every row was drawn.
    """
    n_rows = data.shape[0]
    # Sample row *indices* rather than rows. This works with both
    # numpy.random.choice (which rejects a 2-D population) and a Generator,
    # and it makes out-of-bag membership exact.
    drawn = rand.choice(n_rows, n_rows, replace=True)

    # A row is out-of-bag iff its index was never drawn. The earlier
    # np.isin(row, unique_bs) test compared scalar values, not whole rows, so
    # a row counted as in-bag whenever any of its values appeared anywhere in
    # the sample.
    in_bag = np.zeros(n_rows, dtype=bool)
    in_bag[drawn] = True
    return data[drawn], data[~in_bag]

In [None]:
from sklearn.tree import DecisionTreeRegressor

def create_tree(data):
    """Fit a default ``DecisionTreeRegressor`` on ``data`` and return it.

    Every column but the last is used as a feature; the last column is the
    target.
    """
    features, target = data[:, :-1], data[:, -1]
    tree = DecisionTreeRegressor()
    tree.fit(features, target)
    return tree

In [None]:
### Extra: average proportion of out-of-bag (OOB) rows over repeated bootstrap draws
# NOTE(review): `data` and `rand` are not defined in this cell - presumably a
# 2-D NumPy array and numpy.random (or a Generator) from another cell; confirm.
oob_samples = []
for i in range(1, 100):  # 99 bootstrap draws
    # Sample row indices, not rows: numpy.random.choice rejects a 2-D
    # population, and counting unique *rows* undercounts the in-bag set
    # whenever `data` contains duplicate rows.
    drawn = rand.choice(data.shape[0], data.shape[0], replace=True)
    bs = data[drawn]
    n_unique_samples = np.unique(drawn).shape[0]
    oob = (data.shape[0] - n_unique_samples) / data.shape[0]  # proportion of OOB rows
    oob_samples.append(oob)
np.mean(oob_samples)

### FIRST TASTE ML 

In [None]:
# Feature matrix (four columns) and target taken from the `rent` frame.
# NOTE(review): `rent` is not defined in this part of the file - presumably a
# DataFrame loaded in another cell; confirm.
X = rent[['bedrooms','bathrooms','latitude','longitude']]
y = rent['price']
import numpy as np
# NOTE(review): unseen_sample is created but never used in this cell.
unseen_sample = np.array([2,2,40,-73])
# NOTE(review): `rf` is not created in this cell; a RandomForestRegressor named
# `rf` is fitted in the following cell, so this relies on cell execution order.
# If `rf` was fitted on these same rows, the figure below is a training error.
preds = rf.predict(X)
# NOTE(review): mean_absolute_error is not imported in this part of the file -
# presumably sklearn.metrics.mean_absolute_error; confirm.
e = mean_absolute_error(y,preds)
# Express the error as a percentage of the mean target value.
ep = e / y.mean() * 100 
print(f"${e:.0f} average error; {ep:.2f}% error")

In [None]:
# Hold out 25% of the rows, fit a 10-tree forest on the rest and report the
# mean absolute error on the held-out rows.
from sklearn.model_selection import train_test_split
# No random_state is passed to the split or the forest, so the printed numbers
# change from run to run.
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size = 0.25)
# NOTE(review): RandomForestRegressor is not imported in this part of the file -
# presumably from sklearn.ensemble; confirm.
rf = RandomForestRegressor(n_estimators = 10)
rf.fit(X_train,y_train)
preds = rf.predict(X_test)
e = mean_absolute_error(y_test,preds)
# NOTE(review): the percentage is relative to the mean of the full target `y`,
# not of `y_test` - confirm this is intended.
ep = e / y.mean() * 100
print(f"${e:.0f} average error; {ep:.2f}% error")

In [None]:
# Feature importance on the held-out rows.
# NOTE(review): `importances` and `plot_importances` presumably come from the
# rfpimp star import below; `rf_100` is not defined in this part of the file -
# presumably a forest fitted in another cell; confirm.
from rfpimp import *
features_ranking = importances(rf_100, X_test, y_test)
features_ranking
plot_importances(features_ranking)
# Second pass: longitude and latitude are passed as one nested list -
# presumably so they are scored together as a single feature group; confirm
# against the rfpimp documentation.
features_ranking = importances(rf_100, X_test, y_test , features = ['bedrooms','bathrooms',['longitude','latitude']])
plot_importances(features_ranking)

In [None]:
# Classification example on scikit-learn's breast-cancer dataset.
# This cell rebinds X, y, X_train, X_test, y_train, y_test and preds, which the
# rent-price cells also assign.
from sklearn.datasets import load_breast_cancer
import pandas as pd
cancer = load_breast_cancer()
cancer.keys()
X = cancer.data
y = cancer.target
df = pd.DataFrame(X, columns = cancer.feature_names)
df.head().T
# Keep only a hand-picked subset of seven feature columns.
features = ['radius error','texture error','concave points error','symmetry error',
          'worst texture','worst smoothness','worst symmetry']
df= df[features]
df.head()
# 80/20 split; no random_state, so results vary between runs.
# train_test_split is imported in an earlier cell, not in this one.
X_train, X_test, y_train , y_test = train_test_split(df,y, test_size = 0.20)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , confusion_matrix
cl = RandomForestClassifier(n_estimators=200)
cl.fit(X_train,y_train)
preds = cl.predict(X_test)
# Fraction of held-out rows classified correctly.
e = accuracy_score(y_test,preds)
print(f"{e*100:.2f}% accuracy")
confusion_matrix(y_test,preds)
# NOTE(review): `importances` presumably comes from the rfpimp star import in
# another cell; confirm.
features_ranking = importances(cl, X_test, y_test)
features_ranking

In [None]:
# Split `df` into numeric and non-numeric (object) columns.
# NOTE(review): the columns dropped below (GarageYrBlt, LotFrontage,
# MasVnrArea) are not among the breast-cancer features selected into `df`
# earlier in this file, so `df` here is presumably a different frame loaded in
# another cell; confirm.
df_num = df.select_dtypes(include=['number'])  # 'number' selects the numeric columns (int and float)
df_num
# Remove three numeric columns from the numeric frame.
df_num2 = df_num.drop(['GarageYrBlt' , 'LotFrontage', 'MasVnrArea'], axis = 1)  
df_num2
df_nonum = df.select_dtypes(include=['object'])
df_nonum

### CATEGORICAL VAR


Create a baseline model and get its out-of-bag (OOB) score.
For a regressor the OOB score is an $R^2$ value, which tells us that:
- $R^2 = 1$ means our model is perfect; 
- $R^2 \approx 0$ means our model does no better than just predicting the average;
- $R^2 < 0$ means our model does worse than just predicting the average (the more negative, the worse).

In [8]:
from rfpimp import *

In [9]:
def evaluate(X, y):
    """Fit a 100-tree random forest on ``(X, y)`` and report its OOB score.

    Prints the out-of-bag R^2 together with the values returned by
    ``rfnnodes`` and the median of ``rfmaxdepths`` for the fitted forest.

    Returns:
        ``(rf, oob)``: the fitted forest and its ``oob_score_``.
    """
    # NOTE(review): rfnnodes / rfmaxdepths presumably come from the rfpimp star
    # import; RandomForestRegressor and np are not imported in this cell.
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    forest.fit(X, y)
    oob_r2 = forest.oob_score_
    node_count = rfnnodes(forest)
    median_depth = np.median(rfmaxdepths(forest))
    print(f"OOB R^2 is {oob_r2:.5f} using {node_count:,d} tree nodes with {median_depth} median tree depth")
    return forest, oob_r2

In [10]:
def showimp(rf, X, y):
    """Plot feature importances with latitude and longitude passed as one group.

    The feature list is every column of ``X`` except ``'latitude'`` and
    ``'longitude'``, followed by the pair ``['latitude', 'longitude']`` as a
    single nested entry. ``X`` must contain both columns; ``list.remove``
    raises ``ValueError`` otherwise.
    """
    features = list(X.columns)
    for coordinate in ('latitude', 'longitude'):
        features.remove(coordinate)
    features.append(['latitude', 'longitude'])
    ranking = importances(rf, X, y, features=features)
    plot_importances(ranking, color='#4575b4')

In [None]:
# Keep the forest that evaluate() just fitted. Discarding the return value made
# showimp() plot whatever `rf` happened to be left over from an earlier cell,
# not the model whose OOB score was printed.
rf, oob_r2 = evaluate(X, y)
showimp(rf, X, y)

##### Permutation Importance

We can calculate the feature importances using a permutation method, which consists of the following steps:
- use all features and establish a baseline value for $R^2$;
- select one feature and randomly permute its values leaving all other features unchanged;
- calculate the new value for $R^2$ with this one feature permuted;
- calculate the change in $R^2$ from the baseline; and, 
- repeat for the other features.

In [12]:
def perm_importances(X, y):
    """Print the change in OOB R^2 when each column of ``X`` is shuffled.

    A baseline forest is fitted on the untouched data. Then, for each column
    in turn, a copy of ``X`` with that one column randomly permuted is used to
    refit the same forest, and the new OOB R^2 and its difference from the
    baseline are printed. Nothing is returned.
    """
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, random_state=999)
    forest.fit(X, y)
    baseline_r2 = forest.oob_score_
    print(f"Baseline R^2 with no columns permuted: {baseline_r2:.5f}\n")

    for col in X.columns:
        shuffled = X.copy()
        # .values strips the index; assigning the shuffled Series directly
        # would re-align on the index and undo the permutation.
        shuffled[col] = shuffled[col].sample(frac=1).values
        forest.fit(shuffled, y)
        permuted_r2 = forest.oob_score_
        drop = baseline_r2 - permuted_r2
        print(f"Permuting column {col}: new R^2 is {permuted_r2:.5f} and difference from baseline is {drop:.5f}")

In [None]:
# Print the permutation-based importance report for the current X and y.
perm_importances(X, y)

##### Dropped Column Importance

We can also calculate the importance of the features using a dropped column, which consists of the following steps:
- use all features and establish a baseline value for $R^2$;
- select one feature and remove it from the data;
- calculate the new value for $R^2$ with this one feature removed;
- calculate the change in $R^2$ from the baseline; and, 
- repeat for the other features.

In [15]:
def drop_importances(X, y):
    """Print the change in OOB R^2 when each column of ``X`` is removed.

    A baseline forest is fitted on all columns. Then, for each column in turn,
    the same forest is refitted on ``X`` without that column, and the new OOB
    R^2 and its difference from the baseline are printed. Nothing is returned.
    """
    forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, random_state=999)
    forest.fit(X, y)
    baseline_r2 = forest.oob_score_
    print(f"Baseline R^2 with no columns dropped: {baseline_r2:.5f}\n")

    for col in X.columns:
        # drop() returns a new frame, so X itself is left untouched.
        reduced = X.drop(col, axis=1)
        forest.fit(reduced, y)
        reduced_r2 = forest.oob_score_
        drop = baseline_r2 - reduced_r2
        print(f"Dropping column {col}: new R^2 is {reduced_r2:.5f} and difference from baseline is {drop:.5f}")

Explain RF from scratch (importance of bootstrap):
Random Forest is an ensemble learning algorithm that consists of a collection of decision trees. Each tree is trained on a bootstrap sample of the data, where data points are randomly selected with replacement. Bootstrap sampling allows for diversity among the trees, which helps in reducing overfitting and improving generalization performance.

Explain OOB score (difference from R², why it's better):
The out-of-bag (OOB) score is an estimate of a model's performance computed from the data points that were not included in the bootstrap sample used to train each individual tree in the Random Forest. Each observation is predicted only by the trees that never saw it, so the score serves as a built-in validation metric and provides an approximately unbiased estimate of the model's performance on unseen data. For a regressor the OOB score is itself an R²; the difference lies in which rows it is computed on. An R² measured on the training data is scored by trees that have already seen those rows and is therefore overly optimistic, whereas the OOB R² uses only held-out rows. This makes it a better metric in many scenarios, especially when cross-validation is computationally expensive or impractical.

Meaning of R² (when it's 0 the model is no better than an average):
R-squared (R²) quantifies the proportion of variance in the dependent variable that is explained by the independent variables in the regression model. When R² is 0, it indicates that the model does not explain any of the variability in the target variable and is no better than predicting the mean of the target variable.

Why create a baseline model (compare before and after denoising):
Creating a baseline model allows us to establish a point of reference for evaluating the effectiveness of subsequent model improvements, such as denoising. By comparing the performance metrics of the baseline model with those of the improved model, we can assess the impact of denoising techniques on model performance and determine whether the improvements are statistically significant.

How feature importance method works:
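Feature importance measures the contribution of each feature to the predictive performance of the model. In Random Forest models, the default feature importance is calculated based on the decrease in node impurity (e.g., Gini impurity or entropy for classification, variance for regression) when splitting on a particular feature. Features that result in the greatest decrease in impurity are considered the most important for making predictions. The code above uses two alternatives instead: permutation importance, which measures how much the score drops when one feature's values are randomly shuffled, and drop-column importance, which measures how much the score drops when the feature is removed and the model is retrained.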

What map/one-hot-encoding/dummy_variables do:
These techniques are used to convert categorical variables into numerical format, which is required for many machine learning algorithms. One-hot encoding and dummy variables create binary columns for each category in a categorical variable, while mapping assigns numerical values to categories. This allows models to interpret categorical data effectively.

Why accuracy is not appropriate compared to F1:
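Accuracy measures the proportion of correctly classified instances in a classification task, but it can be misleading when classes are imbalanced: a model that always predicts the majority class can score high accuracy while never detecting the minority class. The F1 score, on the other hand, is the harmonic mean of precision (the share of predicted positives that are truly positive) and recall (sensitivity, i.e. the true positive rate). It is more suitable for imbalanced datasets because it is high only when the model both finds the positive cases and avoids false alarms.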

Something about missing values (methods to deal with missing values, like replace with 0 or delete):
Dealing with missing values is essential to prevent bias and ensure the accuracy of models. Common methods include imputation (replacing missing values with a calculated estimate, such as the mean or median), deletion (removing rows or columns with missing values), or using advanced techniques like predictive modeling to estimate missing values based on other features in the dataset. Each method has its advantages and disadvantages depending on the specific context of the data.