Ridge Regularization using cuda

# Regularization – Ridge Regression using LSE Fit

1. Generating Dataset

In [1]:
import cudf as cd
import cupy as cp
import math
import sys
from cuml.model_selection import train_test_split
from cuml.preprocessing import StandardScaler, PolynomialFeatures
from cuml import metrics

In [2]:
angles=[]
sin_val=[]
for i in range(60,360,4):
    angles.append(i)
    sin_val.append(math.sin(math.radians(i)))

df=cd.DataFrame({'x': angles, 'y': sin_val})
df

Unnamed: 0,x,y
0,60,0.866025
1,64,0.898794
2,68,0.927184
3,72,0.951057
4,76,0.970296
...,...,...
70,340,-0.342020
71,344,-0.275637
72,348,-0.207912
73,352,-0.139173


2. Split the dataset in train and test; fit a ridge regression using least square error fit on the
train set using a particular value of regularization parameter (λ).

In [3]:
x=df.iloc[:,0]
y=df.iloc[:,1]

x=StandardScaler().fit_transform(cd.DataFrame(x))
x=cd.DataFrame(x)
x.insert(loc=0,value=1.0,name='ones')

# Dataframe to cupy array
x=cp.fromDlpack(x.to_dlpack())
y=cp.fromDlpack(y.to_dlpack())

In [4]:
x_train1,x_test1,y_train1,y_test1=train_test_split(x,y,test_size=0.15,random_state=42)

In [5]:
#  function to calulate score for different values of lambda
def cal_score(lam,x_train,x_test,y_train,y_test):
    xt=x_train.T
    Inv=cp.linalg.inv(xt.dot(x_train)+lam*(cp.identity(2)))
    beta=Inv.dot(xt).dot(y_train)

    y_predict=x_test.dot(beta)+lam*(beta.T.dot(beta))
    score=metrics.r2_score(y_test,y_predict)
    return beta,score

In [14]:
beta,score=cal_score(0.001,x_train1,x_test1,y_train1,y_test1)
print(beta)
print(score)

[-0.11660104 -0.63937615]
0.6652279681042539


3. Tune the value of λ using concept of validation sets i.e., Divide the train set further into
train and validation sets and consider different values of λ. For each value of λ, fit a ridge
regressor (using LSE) on train set and test its performance on validation set. Choose the
value of λ which gave best performance on validation set.

In [15]:
x_train2,x_test2,y_train2,y_test2=train_test_split(x_train1,y_train1,test_size=0.15,random_state=42)

In [16]:
max_val= -sys.maxsize - 1
for i in [0.00001,0.0001,0.001,0.01]:
    beta,score=cal_score(i,x_train2,x_test2,y_train2,y_test2)
    if(max_val<score):
        max_val=score
        best_lam=i
    print("Accuracy for lamda",i," :",score)
print("Best Lambda:",best_lam)

Accuracy for lamda 1e-05  : 0.7622087354063717
Accuracy for lamda 0.0001  : 0.762208397715225
Accuracy for lamda 0.001  : 0.7622050206874972
Accuracy for lamda 0.01  : 0.762171238789747
Best Lambda: 1e-05


4. For the best value of λ, train the ridge regressor on the entire training set (train + validation
set) and test the performance on test set.

In [17]:
beta,score=cal_score(best_lam,x_train1,x_test1,y_train1,y_test1)
score

0.6652203473476308

5. Inbuilt function for Ridge Regression in cuml to fit and predict the values of output variable using the best value of λ.

In [19]:
from cuml.linear_model import  Ridge

regressor=Ridge()
regressor.fit(x_train1,y_train1)
regressor.score(x_test1,y_test1)
regressor.predict(x_test1)

array([ 5.51499686e-01, -8.42896020e-01, -4.48614486e-04, -1.07529530e+00,
        2.02900759e-01,  8.12948881e-01,  9.00098612e-01, -8.71945931e-01,
        5.80549596e-01, -2.61897809e-01, -1.10434522e+00])

In [20]:
cd.Series(regressor.coef_)

0    0.000000
1   -0.628893
dtype: float64

# Based on PPMI matrix

In [2]:
import cudf as cd
import cupy as cp

movie_df=cd.read_csv('IMDB Dataset.csv',nrows=1000)
movie_df.shape

(1000, 2)

1. Creating corpus of first 1000 reviews

In [3]:
corp=[]
for i in range(len(movie_df)):
    corp.append(movie_df['review'][i])  
corpus=[]
for i in range(len(corp)):
    corpus.append(" ".join([word.lower() for word in corp[i].split() if word.isalpha()]))

2. Convert the corpus into binary BOW vector of size mXn, where m=1000 (number of reviews documents) and n is the number of unique terms obtained from the 1000 documents. Each ijth entry of the vector is a binary value which is 1 if the jth term is present in ith review else 0. (Without using in-built function)

In [4]:
doc_freq={}
for i in range(len(corpus)):
    word_freq={}
    for word in corpus[i].split():
        word_freq[word]=1
    doc_freq[i]=word_freq

import pandas as pd
bow=pd.DataFrame(doc_freq)
bow.fillna(0,inplace=True)
bow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
one,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
of,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
the,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
other,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
reviewers,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
offices,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
intuition,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
stubborn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
masking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


3. Compute the co-occurrence matrix of order nXn where each ijth entry of matrix is number of documents in which both ith and jth terms co-occur. (Use binary BOW vector to compute it).

In [5]:
cooccurence_mat=bow.values.dot(bow.values.T)
cp.fill_diagonal(cooccurence_mat,0)
cooccurence_mat

array([[  0., 496., 500., ...,   1.,   1.,   1.],
       [496.,   0., 950., ...,   1.,   1.,   1.],
       [500., 950.,   0., ...,   1.,   1.,   1.],
       ...,
       [  1.,   1.,   1., ...,   0.,   1.,   1.],
       [  1.,   1.,   1., ...,   1.,   0.,   1.],
       [  1.,   1.,   1., ...,   1.,   1.,   0.]])

4. Compute PPMI matrix where PPMI between two words a and b:  where n(a,b) is the number of documents in which both words a and b co-occur (from co-occurrence matrix), n(a) and n(b) is number of documents in which terms a and b occur respectively (from BOW vector); |D| is total number of documents (=1000 in this case).

In [None]:
ppmi_mat = np.zeros(cooccurence_mat.shape)
for i in  range(14719):
    for j in range(14719):
        if(i!=j):
            x  = cooccurence_mat[i][j]/(total_word_freq[i]*total_word_freq[j])
            y = x*1000   
            if(y==0):
                ppmi_mat[i][j] = 0
            else: 
                z = math.log(y)
                ppmi = max(z,0)
                ppmi_mat[i][j] = ppmi