In [29]:
import pandas as pd
import numpy as np
import csv
import random as rnd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import metrics

%matplotlib inline

## Read file into dataframe

In [30]:
df = pd.read_csv("regLinPoli2.csv") ##insert your own path

In [31]:
#df = pd.concat([rand_list, df], axis=1)

## Divide in train and test

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(df[df.columns[0:-1]],df[[df.columns[-1]]], train_size=0.75)
#print X_train.columns.values

### I recommend that after manipulating data using pandas and before modelling to convert dataframes into arrays. This may avoid some headaches

In [33]:
X_train=np.asarray(X_train)
X_test=np.asarray(X_test)
Y_train=np.asarray(Y_train)
Y_test=np.asarray(Y_test)

## Procedure for data standardizing

In [34]:
# This procedure is useful for classroom examples. For a real implementation you have to have a separete method 
# for transforming the production data so you can transform it as you get it with the fitted scaler
## The procedure returns a standardized copy of the input data
def normalize(X_train,X_test,Y_train,Y_test,do=True):

    scale_X=preprocessing.StandardScaler()
    scale_y=preprocessing.StandardScaler()
    
    train_X=np.copy(X_train)
    train_y=np.copy(Y_train)
    test_X=np.copy(X_test)
    test_y=np.copy(Y_test)
    if do:
        scale_X.fit(train_X)
        scale_y.fit(train_y)
        train_X=scale_X.transform(train_X)
        train_y=scale_y.transform(train_y)
        test_X=scale_X.transform(test_X)
        test_y=scale_y.transform(test_y)
    return train_X,test_X, train_y, test_y

## Incremental regularized regression procedures

In [35]:
## Transfer function
def salida(w,X):
    return X.dot(w[1:]) +w[0]


In [36]:
## Training function
def entrena(X,y,w,la=0.0,eta=0.01):
    
    for i in range(len(X)):
        error=y[i]-salida(w,X[i])
        w[0]=w[0]+eta*(error)
        w[1:]=w[1:]+eta*(error*X[i])-la*w[1:]
    return w


## Error function

In [37]:
def calcError(X,y,w,w0):
    return np.mean((X.dot(w)+w0-y)**2)

## Use

In [38]:
train_X,test_X,train_y,test_y=normalize(X_train,X_test,Y_train,Y_test)

In [39]:
w=np.asarray([rnd.random() for i in range(1+len(train_X[0]))])
for i in range(100):
    w=entrena(train_X,train_y,w,la=0.00)

In [40]:
w

array([  5.32866731e-01,   8.01385175e-01,  -1.59955472e-01,
         4.54698212e-01,   4.96854188e-01,  -1.22208008e-02,
         3.18332916e-01,   8.50688918e-01,   4.65150391e-01,
         5.42568684e-01,   1.36960029e-01,   1.17324321e-01,
        -4.80001362e-02,   7.70212422e-03,   8.48405852e-02,
        -2.54769310e-02,   6.20170116e-02,  -2.47577420e-02,
         3.96171879e-02,  -6.13357566e-02,  -1.23355201e-01,
         1.42851892e-02,  -7.81925902e-02,  -1.45863330e-01,
        -2.86996850e-02,  -2.79304437e-02,   7.25170911e-02,
        -1.99799595e-02,  -1.08183575e-02,  -3.29038252e-02,
         1.51879769e-02,   4.62537502e-02,  -3.30947725e-02,
         7.00845625e-04,  -1.00666164e-01,   3.07621964e-02,
        -7.33265416e-02,  -1.35586400e-01,   3.29201547e-02])

In [41]:
## flatten here to convert y from a matrix to a vector. Only 1 response variable
print calcError(train_X,train_y.flatten(),w[1:],w[0])
print calcError(test_X,test_y.flatten(),w[1:],w[0])

0.545525704555
0.612329117539


In [42]:
train_X = pd.DataFrame(train_X)
train_Y = pd.DataFrame(train_y)
#X_train

In [43]:
concat = pd.concat([train_X, train_Y], axis = 1)
concat_names = range(39)
concat.columns = [concat_names]
#concat_names

In [44]:
rand_list = pd.DataFrame([[rnd.randint(1, 6)] for iter_num in range(len(concat))])
cross_table = pd.concat([rand_list, concat], axis=1)

In [45]:
cross_names = range(40)
cross_table.columns = [cross_names]


In [71]:
from sklearn.metrics import mean_squared_error
train_X_1 = []
train_Y_1 = []
val_X_1 = []
val_Y_1 = []
err1_2 = 0
err2_2 = 0
lambdas = np.array([-0.002,0.0015,0.01])
w=np.asarray([rnd.random() for i in range(38)])
for j in lambdas:
    for i in range(1, 7):
        train_1 = cross_table[cross_table[0] != i]
        val_1 = cross_table[cross_table[0] == i]
        train_1 = (train_1.drop([0], axis=1))
        val_1 = (val_1.drop([0], axis=1))
        train_X_1 = np.asarray(train_1[train_1.columns[1:-1]])
        train_Y_1 = np.asarray(train_1[[train_1.columns[-1]]])
        val_X_1 = np.asarray(val_1[val_1.columns[1:-1]])
        val_Y_1 = np.asarray(val_1[[val_1.columns[-1]]])
        entrena(train_X_1,train_Y_1,w,j,eta=0.01)
        err1 = calcError(train_X_1,train_Y_1.flatten(),w[1:],w[0])
        err2 = calcError(val_X_1,val_Y_1.flatten(),w[1:],w[0])
    err1_2 = err1 / 6
    err2_2 = err2 / 6
    print err1_2

0.204044614381
0.182751623799
0.173378329918
