In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<h2>Simple Quadratic Relationship Dataset</h2>
<h4>Hands-on: Linear Regression with AWS Machine Learning Service</h4>
Input Features: x<br>
Target Feature: y_noisy<br>
Objective: Predict y_noisy for a given x<br>

In [None]:
def quad_func(x, a=5, b=-23, c=47):
    """Evaluate the quadratic ``a*x**2 + b*x + c``.

    With the default coefficients this is ``5*x**2 - 23*x + 47``, the
    noise-free curve this notebook generates its dataset from.

    Parameters
    ----------
    x : number (or any object supporting ``**``, ``*`` and ``+``)
        Point(s) at which to evaluate the polynomial.
    a, b, c : number, optional
        Quadratic, linear and constant coefficients.

    Returns
    -------
    The value of the polynomial at ``x``.
    """
    return a * x**2 + b * x + c

In [None]:
# Spot-check with an integer input: 5*25**2 - 23*25 + 47 = 2597
quad_func(25)

In [None]:
# Spot-check with a non-integer input; the result is a float (about 26.02)
quad_func(1.254)

In [None]:
# Fix the global NumPy seed so the generated dataset is reproducible.
np.random.seed(5)
samples = 300

# Inputs: `samples` draws from a uniform distribution between -20 and 20.
x_vals = pd.Series(np.random.uniform(low=-20, high=20, size=samples))

# Ideal targets: the quadratic applied to each input value.
y_vals = x_vals.map(quad_func)

# Observed targets: ideal values plus standard-normal noise scaled by 3.
noise = 3 * np.random.randn(samples)
y_noisy_vals = y_vals + noise

In [None]:
# Collect the inputs, the ideal targets and the noisy targets in one frame;
# columns appear in the order x, y, y_noisy.
df = pd.DataFrame(dict(x=x_vals, y=y_vals, y_noisy=y_noisy_vals))

In [None]:
# Preview the first five rows (DataFrame.head defaults to n=5)
df.head()

In [None]:
# Pairwise correlation between the columns.
# DataFrame.corr defaults to Pearson correlation, which measures only *linear*
# association, so it can understate how strongly x determines y when the
# relationship is non-linear (here it is quadratic).
df.corr()

In [None]:
# Overlay the noise-free curve and the noisy targets on a single set of axes.
fig, ax = plt.subplots()
ax.scatter(df['x'], df['y'], label='ideal fit')
ax.scatter(df['x'], df['y_noisy'], color='r', marker='+', label='Target')
ax.grid(True)
ax.set_xlabel('Input Feature')
ax.set_ylabel('Target')
ax.legend()

In [None]:
# Save all data: every row, columns ordered x, y, y_noisy,
# with a header row and without the index.
all_columns = ['x', 'y', 'y_noisy']
df[all_columns].to_csv('quadratic_all.csv', index=False)

## Training and Validation Set
### Target Variable as first column followed by input features: y_noisy, x
### Training and validation files are written without a column header row

In [None]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset
np.random.seed(5)
# DataFrame.iloc selects rows by position, so shuffle the positions 0..n-1
# rather than the index labels; labels and positions only coincide while the
# frame still has its default RangeIndex.
# NOTE: df is rebound here, so re-running this cell permutes the already
# shuffled frame again.
l = list(range(len(df)))
np.random.shuffle(l)
df = df.iloc[l]

In [None]:
# Split sizes: the first `train` rows are the training set and every
# remaining row is the validation set.
rows = len(df)
# 70% of the rows, rounded down. Integer arithmetic is exact, whereas
# int(.7 * rows) can come out one short because of binary floating-point
# rounding (0.7 * 90 evaluates to 62.99999999999999).
train = rows * 7 // 10
# The validation file is written from df[train:], so its size is whatever is
# left over. Computing int(.3 * rows) independently can disagree with that
# slice (rows=5 would report 3 + 1 rows).
test = rows - train

In [None]:
# Display the total row count and the sizes computed for the two splits
rows, train, test

In [None]:
# Write Training Set: the first `train` rows, target column first (y_noisy, x),
# with neither a header row nor the index.
# index_label is not passed: it only names the index column, which is not
# written when index=False.
df.iloc[:train].to_csv('quadratic_train.csv',
                       index=False, header=False,
                       columns=['y_noisy', 'x'])

In [None]:
# Write Validation Set: every row from position `train` onward, target column
# first (y_noisy, x), with neither a header row nor the index.
# index_label is not passed: it only names the index column, which is not
# written when index=False.
df.iloc[train:].to_csv('quadratic_validation.csv',
                       index=False, header=False,
                       columns=['y_noisy', 'x'])