In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [23]:
data = pd.read_csv("0000000000002419_training_ccpp_x_y_train.csv")
df = pd.DataFrame(data)
df.head()

Unnamed: 0,# T,V,AP,RH,EP
0,8.58,38.38,1021.03,84.37,482.26
1,21.79,58.2,1017.21,66.74,446.94
2,16.64,48.92,1011.55,78.76,452.56
3,31.38,71.32,1009.17,60.42,433.44
4,9.2,40.03,1017.05,92.46,480.38


In [24]:
df.describe()

Unnamed: 0,# T,V,AP,RH,EP
count,7176.0,7176.0,7176.0,7176.0,7176.0
mean,19.629712,54.288154,1013.263032,73.275818,454.431293
std,7.475256,12.751468,5.964863,14.625093,17.134571
min,1.81,25.36,992.89,25.56,420.26
25%,13.47,41.74,1009.01,63.2025,439.7375
50%,20.315,52.05,1012.91,74.895,451.74
75%,25.72,66.54,1017.3025,84.925,468.6675
max,35.77,81.56,1033.3,100.16,495.76


In [25]:
def step_gradient(X, Y, learning_rate, m, c):
    m_slope = np.zeros(len(X.iloc[0]))
    c_slope = 0 
    N = len(X)
    for i in range(N):
        x = X.iloc[i]
        y = Y.iloc[i]
        pred = np.dot(m, x) + c
        error = y - pred
        m_slope += (-2 / N) * error * x
        c_slope += (-2 / N) * error
    new_m = m - learning_rate * m_slope
    new_c = c - learning_rate * c_slope
    return new_m, new_c

def gradient_descent(X, Y, learning_rate, num_iterations):
    m = np.zeros(X.shape[1])
    c = 0
    for i in range(0, num_iterations, 100):
        m, c = step_gradient(X, Y, learning_rate, m, c)
        print(i, "Cost: ", cost(X, Y, m, c))
    return m, c

def cost(X, Y, m, c):
    total_cost = 0
    N = len(X)
    for i in range(N):
        x = X.iloc[i]
        y = Y.iloc[i]
        pred = np.dot(m, x) + c
        total_cost += (1 / N) * ((y - pred) ** 2)
    return total_cost

def score(Y_true, Y_pred):
    u = ((Y_true - Y_pred) ** 2).sum()
    v = ((Y_true - (Y_true.mean())) ** 2).sum()
    return 1 - (u / v)

def feature_scaling(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    return (X - mean) / std



In [26]:
def run():
    data = pd.read_csv("0000000000002419_training_ccpp_x_y_train.csv")
    df = pd.DataFrame(data)
    X = df.iloc[:, :4]
    Y = df.iloc[:, -1]

    X = feature_scaling(X)
    
    x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=2)

    learning_rate = 0.05
    num_iterations = 15000
    m, c = gradient_descent(x_train, y_train, learning_rate, num_iterations)
    print("Trained parameters:", m, c)

    # Make predictions on the training set
    train_predictions = np.dot(x_train, m) + c
    train_score = score(y_train, train_predictions)
    print("Score on training set:", train_score)

    # Make predictions on the test set
    test_predictions = np.dot(x_test, m) + c
    test_score = score(y_test, test_predictions)
    print("Score on test set:", test_score)

    test_data = np.loadtxt("0000000000002419_test_ccpp_x_test.csv", delimiter=",")
    final_predictions = np.dot(test_data, m) + c

    print("Final Predictions:", final_predictions)
    np.savetxt("pred.csv", final_predictions)

    return final_predictions

In [27]:
predictions = run()
np.savetxt("pred.csv", predictions)

0 Cost:  167479.8285625605
100 Cost:  135618.5973422638
200 Cost:  109829.51869026976
300 Cost:  88951.22495204741
400 Cost:  72046.22759993022
500 Cost:  58357.01334562388
600 Cost:  47271.06536454129
700 Cost:  38292.864235975234
800 Cost:  31021.39977489769
900 Cost:  25132.065108365958
1000 Cost:  20362.05370580179
1100 Cost:  16498.567091734116
1200 Cost:  13369.283909029038
1300 Cost:  10834.65189840819
1400 Cost:  8781.651396562134
1500 Cost:  7118.747859419582
1600 Cost:  5771.805817251775
1700 Cost:  4680.780615366426
1800 Cost:  3797.039592350289
1900 Cost:  3081.1927680144304
2000 Cost:  2501.3360351820656
2100 Cost:  2031.6283596805274
2200 Cost:  1651.139453535013
2300 Cost:  1342.9164855722458
2400 Cost:  1093.2281829988192
2500 Cost:  890.9526004462331
2600 Cost:  727.0812467283615
2700 Cost:  594.31745241183
2800 Cost:  486.7510661217246
2900 Cost:  399.5949725495253
3000 Cost:  328.9716826709977
3100 Cost:  271.7404799351925
3200 Cost:  225.35741488513202
3300 Cost:  1