In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import r2_score

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
df = pd.read_csv('./data/Galaxy.csv')

In [5]:
df.head()

Unnamed: 0,row.names,east.west,north.south,angle,radial.position,velocity
0,3,8.462789,-38.173172,102.5,39.099998,1769
1,4,7.964978,-35.927692,102.5,36.799999,1749
2,5,7.467167,-33.682213,102.5,34.500001,1749
3,6,6.969356,-31.436731,102.5,32.2,1758
4,7,6.471544,-29.19125,102.5,29.899999,1750


In [6]:
df.columns

Index(['row.names', 'east.west', 'north.south', 'angle', 'radial.position',
       'velocity'],
      dtype='object')

In [7]:
X = df[['east.west', 'north.south', 'angle', 'radial.position']]
y = df[['velocity']]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Linear Regression Model

In [9]:
reg = LinearRegression()

In [10]:
reg.fit(X_train, y_train)

LinearRegression()

In [11]:
pred_linReg = reg.predict(X_test)

In [12]:
r2_linReg = r2_score(y_test, pred_linReg)
r2_linReg

0.8948611191528733

In [13]:
linReg = {'r2_linReg': r2_linReg}

## Lasso Models L1 Regularization

In [14]:
reg_lso = Lasso()

In [15]:
reg_lso.fit(X_train, y_train)

Lasso()

In [16]:
pred_lso = reg_lso.predict(X_test)

In [17]:
r2_lasso = r2_score(y_test, pred_lso)
r2_lasso

0.8949810801127545

In [18]:
lasso = {'r2_lasso': r2_lasso}

### Reduce the amount of L1 Regularization

In [19]:
reg_lso_2 = Lasso(alpha=0.5)

In [20]:
reg_lso_2.fit(X_train, y_train)

Lasso(alpha=0.5)

In [21]:
pred_lso_2 = reg_lso_2.predict(X_test)

In [22]:
r2_lasso_2 = r2_score(y_test, pred_lso_2)
r2_lasso_2

0.894916215173488

In [23]:
lasso_2 = {'r2_lasso_2': r2_lasso_2}

## Ridge Model L2 Regularization

In [24]:
reg_ridge = Ridge()

In [25]:
reg_ridge.fit(X_train, y_train)

Ridge()

In [26]:
pred_ridge = reg_ridge.predict(X_test)

In [27]:
r2_ridge = r2_score(y_test, pred_ridge)
r2_ridge

0.8948629493551072

In [28]:
ridge = {'r2_ridge': r2_ridge}

### Reduced Ridge Regression

In [29]:
reg_ridge_2 = Ridge(alpha=0.5)

In [30]:
reg_ridge_2.fit(X_train, y_train)

Ridge(alpha=0.5)

In [31]:
pred_ridge_2 = reg_ridge_2.predict(X_test)

In [32]:
r2_ridge_2 = r2_score(y_test, pred_ridge_2)
r2_ridge_2

0.8948620343593974

In [33]:
ridge_2 = {'r2_ridge_2': r2_ridge_2}

## Compare non-linear model: Random Forest

In [34]:
reg_rf = RandomForestRegressor()

In [35]:
reg_rf.fit(X_train, y_train)
pred_reg_rf = reg_rf.predict(X_test)
r2_reg_rf = r2_score(y_test, pred_reg_rf)
r2_reg_rf

  reg_rf.fit(X_train, y_train)


0.9634351978503376

In [36]:
r2_rf = {'r2_reg_rf': r2_reg_rf}

## Compare the various models

In [37]:
obj = [linReg, lasso, lasso_2, ridge, ridge_2, r2_rf]

In [38]:
o2 = {}
names = []
for d in obj:
    for k,v in d.items():
        names.append(k)
        o2[k] = v

In [39]:
o2

{'r2_linReg': 0.8948611191528733,
 'r2_lasso': 0.8949810801127545,
 'r2_lasso_2': 0.894916215173488,
 'r2_ridge': 0.8948629493551072,
 'r2_ridge_2': 0.8948620343593974,
 'r2_reg_rf': 0.9634351978503376}

In [43]:
d2 = pd.DataFrame(o2, index=['R2 Galaxies'])

In [44]:
d2 = d2.T
d2

Unnamed: 0,R2 Galaxies
r2_linReg,0.894861
r2_lasso,0.894981
r2_lasso_2,0.894916
r2_ridge,0.894863
r2_ridge_2,0.894862
r2_reg_rf,0.963435


In [45]:
d3 = d2.sort_values(by='R2 Galaxies')
d3

Unnamed: 0,R2 Galaxies
r2_linReg,0.894861
r2_ridge_2,0.894862
r2_ridge,0.894863
r2_lasso_2,0.894916
r2_lasso,0.894981
r2_reg_rf,0.963435
