In [98]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# np.random.seed(0)
np.random.seed(69)
# Load training and testing data
X_train = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
X_test = np.loadtxt('X_test.csv', delimiter=',', skiprows=1)
y_train = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)[:,1]

X, Xt, y, yt = train_test_split(X_train, y_train, test_size=0.2) # random_state=0

def rmse(y_true, y_pred, *args, **kwargs):
    return np.sqrt(mean_squared_error(y_true, y_pred, *args, **kwargs))

# Fit model and predict test values
y_pred = np.random.randint(y_train.min(), y_train.max(), X_test.shape[0])

X_test.shape

(41, 6)

In [99]:
# random forest reg
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(n_estimators=30, max_depth=None).fit(X, y)

yp = reg.predict(X)
print("train")
print(rmse(y, yp))
ytp = reg.predict(Xt)
print("test")
print(rmse(yt, ytp))

train
23.806444742901586
test
29.208163641299535


In [45]:
# simple lin reg
from sklearn.linear_model import LinearRegression

reg = LinearRegression(fit_intercept=True, normalize=True).fit(X, y)
print(reg.intercept_)
print(reg.coef_)
yp = reg.predict(X)
print("train")
print(reg.score(X, y))
print(rmse(y, yp))
ytp = reg.predict(Xt)
print("test")
print(reg.score(Xt, yt))
print(rmse(yt, ytp))

-46.02048895300834
[0.04258486 0.01720842 0.00372654 0.69482286 1.70354487 1.37634173]
train
0.8800740843262105
54.538244801464835
test
0.9099016192024215
36.15897266598111


In [192]:
# SVR
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

scale = MinMaxScaler((-1, 1))

Xs = scale.fit_transform(X)
Xts = scale.fit_transform(Xt)

reg = SVR(kernel='poly', degree=5, tol=1e-6).fit(Xs, y)
# print(reg.coef_)
print(reg.intercept_)
yp = reg.predict(Xs)
print("train")
print(rmse(y, yp))
ytp = reg.predict(Xts)
print("test")
print(rmse(yt, ytp))

[58.51264669]
train
163.25917066676377
test
124.53793340374547


In [254]:
# ridge regression
from sklearn import linear_model


# X, Xt, y, yt = train_test_split(X_train, y_train, test_size=0.2)

# solver : {‘auto’, ‘svd’, ‘cholesky’, ‘lsqr’, ‘sparse_cg’, ‘sag’, ‘saga’}
reg = linear_model.Ridge(alpha=1, normalize=True, solver='auto').fit(X, y)

print(reg.coef_)
print(reg.intercept_)
yp = reg.predict(X)
print("train")
print(rmse(y, yp))
ytp = reg.predict(Xt)
print("test")
print(rmse(yt, ytp))

[-0.00742821  0.0093737   0.00289606  0.73241582  2.88636839  0.84145589]
0.7307008766147476
train
67.41697548743909
test
38.636265243684456


In [100]:
# Arrange answer in two columns. First column (with header "Id") is an
# enumeration from 0 to n-1, where n is the number of test points. Second
# column (with header "EpiOrStroma" is the predictions.
y_pred = reg.predict(X_test)

test_header = "Id,PRP"
n_points = X_test.shape[0]
y_pred_pp = np.ones((n_points, 2))
y_pred_pp[:, 0] = range(n_points)
y_pred_pp[:, 1] = y_pred
np.savetxt('reg1_sub.csv', y_pred_pp, fmt='%d,%f', delimiter=",",
           header=test_header, comments="")

# Note: fmt='%d' denotes that all values should be formatted as integers which
# is appropriate for classification. For regression, where the second column
# should be floating point, use fmt='%d,%f'.