# TRAINING AND REGRESSION

### Reading X and y

In [15]:
import pandas as pd  
import numpy as np
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model, metrics
import matplotlib.pyplot as plt


X = np.load("X.npy")
y = np.load("y.npy")

### Splitting and scaling data

In [34]:
scaler = StandardScaler()
X_train,X_val,y_train,y_val = model_selection.train_test_split(X,y,train_size=0.7,random_state=42)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)





### Training the model

#### Linear Regression

In [61]:
regression = linear_model.LinearRegression()
regression.fit(X_train_scaled, y_train)
y_pred = regression.predict(X_val_scaled)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_val, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

# ShuffleSplit Doesn't guarantee that all the test set will be different
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=3)
scores = model_selection.cross_val_score(regression, scaler.transform(X), y, cv=cv, scoring='mean_squared_error') 
print('Cross Validation Root Mean Squared Error:', np.sqrt(-scores).mean())


Mean Absolute Error: 25.775075696334596
Mean Squared Error: 1174.448806373062
Root Mean Squared Error: 34.27023207352209
Cross Validation Root Mean Squared Error: 34.16606184422998


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


#### Ridge Regression

In [63]:
ridge_reg = linear_model.RidgeCV(alphas=[1,1,1,0.1,0.1,0.1,0.01,0.01])
ridge_reg.fit(X_train_scaled, y_train)
y_pred = ridge_reg.predict(X_val_scaled)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_val, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred)))

# ShuffleSplit Doesn't guarantee that all the test set will be different
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=3)
scores = model_selection.cross_val_score(ridge_reg, scaler.transform(X), y, cv=cv, scoring='mean_squared_error') 
print('Cross Validation Root Mean Squared Error:', np.sqrt(-scores).mean())


Mean Absolute Error: 25.76839427546108
Mean Squared Error: 1173.360333720997
Root Mean Squared Error: 34.254347661588845
Cross Validation Root Mean Squared Error: 34.1694767961941


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


### Visual analysis

In [23]:
y_visual = pd.DataFrame(y_val,columns=["Real Price"])
y_visual["Predicted Price"] = y_pred
y_visual


Unnamed: 0,Real Price,Predicted Price
0,100.0,90.923610
1,85.0,98.581029
2,120.0,89.863419
3,100.0,94.794347
4,25.0,85.275835
5,85.0,89.456533
6,90.0,85.334005
7,120.0,103.991083
8,100.0,87.741117
9,25.0,81.365245
