<a href="https://colab.research.google.com/github/25je0551-spec/WOC-ML/blob/main/wocML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir data models utils



mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘models’: File exists
mkdir: cannot create directory ‘utils’: File exists


In [2]:
# List the current working directory (shell command via IPython's `!`).
!ls


 binary_submission.csv		   models		  test_nn.csv
 data				   sample_data		  utils
'Linear Regression Test (1).csv'   test_binary.csv
 linear_submission.csv		   test_multi_class.csv


In [None]:
from google.colab import files
# Call google.colab's upload helper; as the last expression in the cell,
# its return value is shown as the cell output.
files.upload()


In [None]:
# Move every CSV in the working directory into data/, then list data/ to
# confirm what is there.
!mv *.csv data/
!ls data


In [None]:
%%writefile utils/preprocessing.py
import numpy as np

def normalize(X):
    """Standardize X column-wise: subtract each column's mean and divide by
    its standard deviation plus 1e-8."""
    column_mean = X.mean(axis=0)
    # The small epsilon keeps zero-variance columns from dividing by zero.
    column_scale = X.std(axis=0) + 1e-8
    return (X - column_mean) / column_scale

def train_test_split(X, y, test_size=0.2):
    """Shuffle the row indices of X with NumPy's global RNG and split X and y
    at the same positions.

    Returns (X_first, X_rest, y_first, y_rest), where the first part holds
    int(len(X) * (1 - test_size)) rows.
    """
    n_rows = len(X)
    order = np.arange(n_rows)
    np.random.shuffle(order)
    cut = int(n_rows * (1 - test_size))
    first, rest = order[:cut], order[cut:]
    return X[first], X[rest], y[first], y[rest]


In [None]:
%%writefile utils/preprocessing.py
import numpy as np

def normalize(X):
    """
    Column-wise standardization: (X - mean) / (std + 1e-8)
    """
    centered = X - X.mean(axis=0)
    # Epsilon guards against division by zero for constant columns.
    spread = X.std(axis=0) + 1e-8
    return centered / spread

def train_test_split(X, y, test_size=0.2, seed=None):
    """
    Manual train-validation split.

    Parameters
    ----------
    X, y : array-like objects supporting integer-array indexing (e.g. NumPy
        arrays) with the same number of rows.
    test_size : fraction of rows placed in the validation part.
    seed : optional seed. When given, the shuffle uses a dedicated
        ``np.random.default_rng(seed)`` generator so the split is
        reproducible; when None (default) the global NumPy RNG is used,
        exactly as before.

    Returns
    -------
    X_train, X_val, y_train, y_val

    Raises
    ------
    ValueError
        If X and y have different lengths (they would be silently misaligned
        or fail with an unrelated IndexError otherwise).
    """
    n_rows = len(X)
    if len(y) != n_rows:
        raise ValueError(
            f"X and y must have the same length, got {n_rows} and {len(y)}"
        )

    if seed is None:
        idx = np.arange(n_rows)
        np.random.shuffle(idx)
    else:
        idx = np.random.default_rng(seed).permutation(n_rows)

    split = int(n_rows * (1 - test_size))
    train_idx = idx[:split]
    val_idx = idx[split:]

    return X[train_idx], X[val_idx], y[train_idx], y[val_idx]


In [None]:
# List utils/ to confirm the %%writefile cell above created preprocessing.py.
!ls utils


In [None]:
%%writefile models/linear_regression.py
import numpy as np

class LinearRegression:
    """Linear model y ~ X @ w + b fitted with full-batch gradient descent on
    the mean squared error."""

    def __init__(self, lr=0.01, epochs=1000):
        # lr: step size of each update; epochs: number of full-batch updates.
        self.lr, self.epochs = lr, epochs

    def fit(self, X, y):
        """Reset w (zeros, one entry per column of X) and b (0.0), then run
        `epochs` gradient-descent updates."""
        n_features = X.shape[1]
        self.w = np.zeros(n_features)
        self.b = 0.0

        for _epoch in range(self.epochs):
            residual = y - (X @ self.w + self.b)
            scale = -2 / len(y)

            # Gradients of mean((y - y_pred)^2) with respect to w and b.
            grad_w = scale * X.T @ residual
            grad_b = scale * np.sum(residual)

            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b

    def predict(self, X):
        """Return X @ w + b using the parameters set by fit()."""
        linear_part = X @ self.w
        return linear_part + self.b


In [None]:
%%writefile models/polynomial_regression.py
import numpy as np
from models.linear_regression import LinearRegression

class PolynomialRegression(LinearRegression):
    """LinearRegression applied to element-wise powers of the input.

    Each feature is raised to the powers 1..degree independently; no
    cross-feature product terms are generated.
    """

    def __init__(self, degree=2, lr=0.01, epochs=1000):
        self.degree = degree
        super().__init__(lr, epochs)

    def transform(self, X):
        """Return [X**1, X**2, ..., X**degree] stacked horizontally."""
        exponents = range(1, self.degree + 1)
        power_blocks = [X ** exponent for exponent in exponents]
        return np.hstack(power_blocks)

    def fit(self, X, y):
        """Expand X with transform() and fit the parent linear model on it."""
        super().fit(self.transform(X), y)

    def predict(self, X):
        """Expand X with transform() and predict with the parent model."""
        expanded = self.transform(X)
        return super().predict(expanded)


In [None]:
%%writefile models/logistic_regression.py
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : step size of each gradient update.
    epochs : number of full-batch updates performed by fit().
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def sigmoid(self, z):
        """Numerically stable logistic function 1 / (1 + exp(-z)).

        exp() is only ever evaluated on non-positive values, so large |z|
        cannot overflow (the naive form overflows for very negative z).
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — same value.
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def fit(self, X, y):
        """Reset w (zeros) and b (0.0), then run `epochs` gradient updates on
        the mean cross-entropy loss. y is expected to hold 0/1 labels."""
        self.w = np.zeros(X.shape[1])
        self.b = 0.0

        for _ in range(self.epochs):
            z = X @ self.w + self.b
            y_pred = self.sigmoid(z)

            # Gradient of the mean cross-entropy w.r.t. w and b.
            error = y_pred - y
            dw = X.T @ error / len(y)
            db = np.mean(error)

            self.w -= self.lr * dw
            self.b -= self.lr * db

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        return self.sigmoid(X @ self.w + self.b)

    def predict(self, X):
        """Return 0/1 labels by thresholding the class-1 probability at 0.5."""
        return (self.predict_proba(X) >= 0.5).astype(int)


In [None]:
%%writefile models/logistic_regression.py
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : step size of each gradient update.
    epochs : number of full-batch updates performed by fit().
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def sigmoid(self, z):
        """Numerically stable logistic function 1 / (1 + exp(-z)).

        exp() is only ever evaluated on non-positive values, so large |z|
        cannot overflow (the naive form overflows for very negative z).
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — same value.
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

    def fit(self, X, y):
        """Reset w (zeros) and b (0.0), then run `epochs` gradient updates on
        the mean cross-entropy loss. y is expected to hold 0/1 labels."""
        self.w = np.zeros(X.shape[1])
        self.b = 0.0

        for _ in range(self.epochs):
            z = X @ self.w + self.b
            y_pred = self.sigmoid(z)

            # Gradient of the mean cross-entropy w.r.t. w and b.
            error = y_pred - y
            dw = X.T @ error / len(y)
            db = np.mean(error)

            self.w -= self.lr * dw
            self.b -= self.lr * db

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        return self.sigmoid(X @ self.w + self.b)

    def predict(self, X):
        """Return 0/1 labels by thresholding the class-1 probability at 0.5."""
        return (self.predict_proba(X) >= 0.5).astype(int)


In [None]:
%%writefile models/neural_network.py
import numpy as np

class NeuralNetwork:
    """Fully connected classifier: ReLU hidden layers, softmax output,
    trained with full-batch gradient descent on cross-entropy.

    Parameters
    ----------
    layers : list of layer widths, e.g. [n_features, 64, 32, n_classes].
    lr : step size of each gradient update.
    """

    def __init__(self, layers, lr=0.01):
        self.layers = layers
        self.lr = lr
        self.weights = []
        self.biases = []

        for i in range(len(layers) - 1):
            # He-style scaling (std = sqrt(2 / fan_in)), drawn from the global NumPy RNG.
            w = np.random.randn(layers[i], layers[i+1]) * np.sqrt(2 / layers[i])
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def relu(self, x):
        """Element-wise max(0, x)."""
        return np.maximum(0, x)

    def relu_deriv(self, x):
        """Boolean mask of where ReLU has slope 1 (x > 0)."""
        return x > 0

    def softmax(self, z):
        """Row-wise softmax; the row maximum is subtracted first for
        numerical stability."""
        exp = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp / np.sum(exp, axis=1, keepdims=True)

    def forward(self, X):
        """Run a forward pass, caching pre-activations in self.z and
        activations in self.a (self.a[0] is X). Returns the softmax output."""
        self.a = [X]
        self.z = []

        for i in range(len(self.weights) - 1):
            z = self.a[-1] @ self.weights[i] + self.biases[i]
            self.z.append(z)
            self.a.append(self.relu(z))

        z = self.a[-1] @ self.weights[-1] + self.biases[-1]
        self.z.append(z)
        self.a.append(self.softmax(z))

        return self.a[-1]

    def backward(self, X, y):
        """Apply one gradient-descent update using the activations cached by
        the most recent forward() call. y must be one-hot with one row per
        row of X."""
        m = len(X)
        # Softmax + cross-entropy: gradient w.r.t. the output logits.
        dz = self.a[-1] - y

        for i in reversed(range(len(self.weights))):
            dw = self.a[i].T @ dz / m
            db = np.sum(dz, axis=0, keepdims=True) / m

            # Propagate the error to the previous layer BEFORE touching
            # weights[i]; using the already-updated weights here would give
            # earlier layers a gradient that does not match the forward pass.
            if i > 0:
                dz_prev = (dz @ self.weights[i].T) * self.relu_deriv(self.z[i-1])

            self.weights[i] -= self.lr * dw
            self.biases[i] -= self.lr * db

            if i > 0:
                dz = dz_prev

    def fit(self, X, y, epochs=1000):
        """Run `epochs` full-batch forward/backward passes on (X, y)."""
        for _ in range(epochs):
            self.forward(X)
            self.backward(X, y)

    def predict(self, X):
        """Return the index of the highest-probability class for each row."""
        return np.argmax(self.forward(X), axis=1)


In [None]:
import pandas as pd
from models.linear_regression import LinearRegression
from utils.preprocessing import normalize, train_test_split

# Hold-out check for LinearRegression on the linear-regression training file.
df = pd.read_csv("data/Linear Regression Train.csv")

# Every column but the last is a feature; the last column is the target.
# NOTE(review): normalize() is applied to all rows before the split, so the
# validation rows contribute to the mean/std used for scaling.
X = normalize(df.iloc[:, :-1].values)
y = df.iloc[:, -1].values

# Random split with the default test_size of train_test_split.
X_train, X_val, y_train, y_val = train_test_split(X, y)

model = LinearRegression()
model.fit(X_train, y_train)

# Mean squared error on the held-out rows.
preds = model.predict(X_val)
print("MSE:", ((y_val - preds) ** 2).mean())


In [None]:
from models.polynomial_regression import PolynomialRegression

# Hold-out check for PolynomialRegression. This cell relies on `pd`,
# `normalize` and `train_test_split` having been imported by an earlier cell.
df = pd.read_csv("data/poly_train.csv")

# Every column but the last is a feature; the last column is the target.
X = normalize(df.iloc[:, :-1].values)
y = df.iloc[:, -1].values

X_train, X_val, y_train, y_val = train_test_split(X, y)

# degree=3: each feature is expanded to its 1st, 2nd and 3rd powers.
model = PolynomialRegression(degree=3)
model.fit(X_train, y_train)

# Mean squared error on the held-out rows.
preds = model.predict(X_val)
print("MSE:", ((y_val - preds) ** 2).mean())


In [None]:
from models.logistic_regression import LogisticRegression

# Hold-out check for LogisticRegression. This cell relies on `pd`,
# `normalize` and `train_test_split` having been imported by an earlier cell.
df = pd.read_csv("data/train_binary.csv")

# Every column but the last is a feature; the last column is the label.
# NOTE(review): the model thresholds a sigmoid at 0.5 and emits 0/1, so the
# labels are presumably already encoded as 0/1 — confirm against the data.
X = normalize(df.iloc[:, :-1].values)
y = df.iloc[:, -1].values

X_train, X_val, y_train, y_val = train_test_split(X, y)

model = LogisticRegression()
model.fit(X_train, y_train)

# Fraction of held-out rows whose predicted label equals the true label.
acc = (model.predict(X_val) == y_val).mean()
print("Accuracy:", acc)


In [None]:
import pandas as pd

# Load the multi-class training file and print the number of missing values
# in each column.
df = pd.read_csv("data/train_multi_class.csv")

print(df.isna().sum())


In [None]:
# Drop rows whose last column (the label) is missing. This rebinds `df`, so
# the cell depends on the frame loaded by the previous cell.
df = df.dropna(subset=[df.columns[-1]])


In [None]:
import numpy as np
import pandas as pd
from utils.preprocessing import normalize
from models.neural_network import NeuralNetwork


# Train the neural network on the multi-class training file.
df = pd.read_csv("data/train_multi_class.csv")


# Keep only rows that have a label (last column).
df = df.dropna(subset=[df.columns[-1]])


# Every column but the last is a feature; the last column is the class label.
X = normalize(df.iloc[:, :-1].values)
labels = df.iloc[:, -1].values


# Map each distinct label to an integer index 0..num_classes-1
# (np.unique returns the labels sorted).
unique_labels = np.unique(labels)
label_to_int = {label: i for i, label in enumerate(unique_labels)}
y_raw = np.array([label_to_int[l] for l in labels])


# One-hot encode: row i of the identity matrix represents class i.
num_classes = len(unique_labels)
y = np.eye(num_classes)[y_raw]


# Two hidden layers (64 and 32 units) between the input and output layers.
nn = NeuralNetwork(
    layers=[X.shape[1], 64, 32, num_classes],
    lr=0.01
)

nn.fit(X, y, epochs=1500)


# NOTE(review): this accuracy is measured on the same rows the network was
# trained on, so it is a training accuracy, not a validation score.
preds = nn.predict(X)
accuracy = (preds == y_raw).mean()
print("Accuracy:", accuracy)


In [None]:
from google.colab import files
# Second upload prompt via google.colab's files helper.
# NOTE(review): presumably used for the test CSVs — confirm.
files.upload()


In [None]:
# Move files whose names end in "_test.csv" into data/, then list data/.
# NOTE(review): later cells read "test_binary.csv", "test_multi_class.csv" and
# "Linear Regression Test (1).csv" from the working directory; none of those
# names match the *_test.csv glob, so this mv likely moves nothing — confirm
# the intended pattern before changing it, since those cells rely on the
# files staying where they are.
!mv *_test.csv data/
!ls data


In [None]:
import numpy as np

def fit_normalize(X):
    """
    Return the per-column (mean, std + 1e-8) of X, for later use with
    apply_normalize.
    """
    column_mean = X.mean(axis=0)
    # Epsilon is folded into the returned std so callers can divide safely.
    column_std = X.std(axis=0) + 1e-8
    stats = (column_mean, column_std)
    return stats

def apply_normalize(X, mean, std):
    """
    Scale X with previously computed statistics: (X - mean) / std
    """
    centered = X - mean
    return centered / std


In [None]:
import numpy as np
import pandas as pd
from google.colab import files


In [None]:
from models.linear_regression import LinearRegression
import pandas as pd

# Train on the labelled TRAINING file (the same path the validation cell
# earlier in this notebook reads). The model was previously fitted on the test
# file, which made the later "test" MSE a training-set score.
df_train_lin = pd.read_csv("data/Linear Regression Train.csv")

# Every column but the last is a feature; the last column is the target.
X_train_lin = df_train_lin.iloc[:, :-1].values
y_train_lin = df_train_lin.iloc[:, -1].values

# Normalization statistics come from the training features only; mean_lin and
# std_lin are reused by the test cell to scale the test features identically.
mean_lin, std_lin = fit_normalize(X_train_lin)

X_train_lin = apply_normalize(X_train_lin, mean_lin, std_lin)

linear_model = LinearRegression()
linear_model.fit(X_train_lin, y_train_lin)

print("Linear Regression trained successfully")


In [None]:
df_test_lin = pd.read_csv("Linear Regression Test (1).csv")

# Decide whether the test file carries a target column by comparing its width
# with the number of features the model was trained on. The previous check
# compared the frame with a slice that had already dropped one column, so it
# was always true and an unlabelled test file would have lost a feature.
n_features_lin = linear_model.w.shape[0]

if df_test_lin.shape[1] == n_features_lin + 1:
    # Trailing column is the target.
    X_test_lin = df_test_lin.iloc[:, :-1].values
    y_test_lin = df_test_lin.iloc[:, -1].values
elif df_test_lin.shape[1] == n_features_lin:
    # Features only; nothing to score against.
    X_test_lin = df_test_lin.values
    y_test_lin = None
else:
    raise ValueError(
        f"Test file has {df_test_lin.shape[1]} columns but the model expects "
        f"{n_features_lin} features (optionally followed by a target column)"
    )

# Scale with the statistics computed on the training features.
X_test_lin = apply_normalize(X_test_lin, mean_lin, std_lin)
preds_lin = linear_model.predict(X_test_lin)

if y_test_lin is not None:
    mse = np.mean((y_test_lin - preds_lin) ** 2)
    print("Linear Regression Test MSE:", mse)

print("Linear predictions ready")


In [None]:
# Build the submission table: "id" is the positional index 0..len(preds)-1,
# "prediction" is the model output for that row.
linear_submission = pd.DataFrame({
    "id": np.arange(len(preds_lin)),
    "prediction": preds_lin
})

# index=False keeps pandas' own row index out of the CSV.
linear_submission.to_csv("linear_submission.csv", index=False)
print("linear_submission.csv created")


In [None]:
from models.logistic_regression import LogisticRegression
import pandas as pd

# Train on the labelled TRAINING file (the same path the validation cell
# earlier in this notebook reads). The model was previously fitted on the test
# file, which made the later "test" accuracy a training-set score.
df_train_bin = pd.read_csv("data/train_binary.csv")

# Every column but the last is a feature; the last column is the label.
X_train_bin = df_train_bin.iloc[:, :-1].values
y_train_bin = df_train_bin.iloc[:, -1].values

# Normalization statistics come from the training features only; mean_bin and
# std_bin are reused by the test cell to scale the test features identically.
mean_bin, std_bin = fit_normalize(X_train_bin)

X_train_bin = apply_normalize(X_train_bin, mean_bin, std_bin)

logistic_model = LogisticRegression(lr=0.01, epochs=2000)
logistic_model.fit(X_train_bin, y_train_bin)

print("Binary Logistic Regression trained successfully")


In [None]:
# Score the logistic model on the binary test file.
df_test_bin = pd.read_csv("test_binary.csv")

# Every column but the last is treated as a feature and scaled with the
# mean/std computed in the training cell.
X_test_bin = df_test_bin.iloc[:, :-1].values
X_test_bin = apply_normalize(X_test_bin, mean_bin, std_bin)

# NOTE(review): assumes the test file's last column is the true label —
# confirm; if the file is unlabelled this silently uses a feature as the label.
y_test_bin = df_test_bin.iloc[:, -1].values

# Fraction of rows whose predicted label equals the last-column value.
preds_bin = logistic_model.predict(X_test_bin)
accuracy = (preds_bin == y_test_bin).mean()

print("Binary Test Accuracy:", accuracy)


In [None]:
# Build the submission table: "id" is the positional index 0..len(preds)-1,
# "prediction" is the 0/1 label returned by the logistic model.
binary_submission = pd.DataFrame({
    "id": np.arange(len(preds_bin)),
    "prediction": preds_bin
})

# index=False keeps pandas' own row index out of the CSV.
binary_submission.to_csv("binary_submission.csv", index=False)
print("binary_submission.csv created")


In [None]:
from models.neural_network import NeuralNetwork
import pandas as pd
import numpy as np

# Load training data from the labelled TRAINING file (the same path the
# earlier multi-class cells read). The network was previously fitted on the
# test file, which made the later "test" accuracy a training-set score.
df_train_nn = pd.read_csv("data/train_multi_class.csv")

# Drop rows with missing labels
df_train_nn = df_train_nn.dropna(subset=[df_train_nn.columns[-1]])

# Every column but the last is a feature; the last column is the class label.
X_train_nn = df_train_nn.iloc[:, :-1].values
labels_train = df_train_nn.iloc[:, -1].values

# Normalization statistics come from the training features only; mean_nn and
# std_nn are reused by the test cell to scale the test features identically.
mean_nn, std_nn = fit_normalize(X_train_nn)
X_train_nn = apply_normalize(X_train_nn, mean_nn, std_nn)

# Encode labels as integers 0..num_classes-1 (np.unique returns them sorted).
# label_to_int is reused by the test cell to encode the test labels.
unique_labels = np.unique(labels_train)
label_to_int = {label: i for i, label in enumerate(unique_labels)}
y_raw = np.array([label_to_int[l] for l in labels_train])

# One-hot encoding: row i of the identity matrix represents class i.
num_classes = len(unique_labels)
y_train_nn = np.eye(num_classes)[y_raw]

# Initialize and train NN (two hidden layers of 64 and 32 units)
nn = NeuralNetwork(
    layers=[X_train_nn.shape[1], 64, 32, num_classes],
    lr=0.01
)

nn.fit(X_train_nn, y_train_nn, epochs=1500)

print("Neural Network trained successfully")


In [None]:
# Score the neural network on the multi-class test file.
df_test_nn = pd.read_csv("test_multi_class.csv")

# Remove rows whose last column (treated as the label) is missing.
# After this, row positions no longer match the row numbers in the CSV if
# any rows were dropped.
df_test_nn = df_test_nn.dropna(subset=[df_test_nn.columns[-1]])

# Every column but the last is treated as a feature and scaled with the
# mean/std computed in the training cell.
X_test_nn = df_test_nn.iloc[:, :-1].values
X_test_nn = apply_normalize(X_test_nn, mean_nn, std_nn)

# Encode test labels with the mapping built in the training cell; a label
# that is not a key of label_to_int raises KeyError here.
labels_test = df_test_nn.iloc[:, -1].values
y_test_nn = np.array([label_to_int[l] for l in labels_test])

# Fraction of rows whose predicted class index equals the encoded label.
preds_nn = nn.predict(X_test_nn)
accuracy_nn = (preds_nn == y_test_nn).mean()

print("Multiclass NN Test Accuracy:", accuracy_nn)


In [None]:
# Build the submission table: "id" is the positional index into preds_nn
# (0..len-1), "prediction" is the predicted class index (argmax output), not
# the original label value.
multiclass_submission = pd.DataFrame({
    "id": np.arange(len(preds_nn)),
    "prediction": preds_nn
})

# index=False keeps pandas' own row index out of the CSV.
multiclass_submission.to_csv("multiclass_submission.csv", index=False)
print("multiclass_submission.csv created")


In [None]:
# Hand the three submission CSVs written by the cells above to
# google.colab's download helper. `files` must already be imported from
# google.colab by an earlier cell.
files.download("linear_submission.csv")
files.download("binary_submission.csv")
files.download("multiclass_submission.csv")
