# Cross validation with chosen alpha

### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

### Function to add a bias term to dataset

In [2]:
def append_bias_term(a):
    
    b = np.ones((a.shape[0],1))
    a = np.hstack((b, a))
    
    return a

### Function to calculate normalization parameters

In [3]:
def n_params(a):
    
    mean = np.mean(a, axis=0)
    standard_dev = np.std(a, axis=0)
    
    return mean, standard_dev

### Function to perform normalization on dataset

In [4]:
def n_feat(a, mean, standard_dev):
    
    a = (a - mean) / (standard_dev + 1e-8)
    
    return a

### Logistic Regression model

In [5]:
def h(a, Theta):
    
    b = 1.0/(1.0 + np.exp(-np.dot(a, Theta)))
    
    return b

In [6]:
def prediction_func(a, Theta, h, thresh=0.50):
    
    b = h(a, Theta)
    b = (b>thresh) * 1
    
    return b

### Batch gradient descent learning

In [7]:
def param_learn(a, b, theta, h, alpha, iter_max=300):
    iteration = 1
    J_store = []
    num = a.shape[0]
    
    while True:
        
        error = (h(a, theta) - b)
        J_store.append(1.0/(2*num) * np.sum(error**2))
        a_error = (a * error)
        theta = theta - alpha * np.mean(a_error, axis=0)[:,np.newaxis]
        
        if iteration > iter_max:
            break
        iteration += 1
        
    return theta, J_store

### Read in preprocessed dataset

In [8]:
dataset = pd.read_csv('Pre-processed_Sarcasm_Headlines_Dataset.csv')

### Apply vectorizer to train and test data

In [9]:
vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, analyzer='word',smooth_idf=True,use_idf=True)
X_1 = vectorizer.fit_transform(dataset['headline'])

### Convert split data to dataframes and 2-D arrays

In [10]:
X_1 = pd.DataFrame(X_1.toarray())
print(X_1.shape)

y_1 = dataset['is_sarcastic']
y_1 = y_1[:, np.newaxis]

(26709, 5534)


### Normalize training and testing data

In [11]:
mean, standard_dev = n_params(X_1)
x_1_norm = n_feat(X_1, mean, standard_dev)

### Appending bias term to data

In [12]:
x_1_norm_aug = append_bias_term(x_1_norm)

### Create storage for error rates

In [13]:
store_error_train = []
store_error_test = []

### Perform K-fold cross validation with chosen alpha

In [14]:
fold=1
alpha=0.01
cv = KFold(n_splits=5, random_state=100, shuffle=False)
for train_index, test_index in cv.split(x_1_norm_aug):
    
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, y_train, y_test = x_1_norm_aug[train_index], x_1_norm_aug[test_index], y_1[train_index], y_1[test_index]
    
    initial_thetas = np.zeros((X_train.shape[1],1))
    learnt_thetas, J = param_learn(X_train, y_train, initial_thetas, h, alpha=alpha)
    
    train_predict = prediction_func(X_train, learnt_thetas, h)
    test_predict = prediction_func(X_test, learnt_thetas, h)
    
    error_train = np.sum(y_train != train_predict) / train_predict.shape[0]
    error_test = np.sum(y_test != test_predict) / test_predict.shape[0]
    
    store_error_train.append(error_train)
    store_error_test.append(error_test)
    
    print('Fold: {}'.format(fold))
    print('emprical error training(in %) = {:.3f}'.format(error_train*100))
    print('emprical error testing(in %) = {:.3f}'.format(error_test*100))
    fold+=1



Train Index:  [ 5342  5343  5344 ... 26706 26707 26708] 

Test Index:  [   0    1    2 ... 5339 5340 5341]
Fold: 1
emprical error training(in %) = 10.072
emprical error testing(in %) = 17.372
Train Index:  [    0     1     2 ... 26706 26707 26708] 

Test Index:  [ 5342  5343  5344 ... 10681 10682 10683]
Fold: 2
emprical error training(in %) = 9.945
emprical error testing(in %) = 16.979
Train Index:  [    0     1     2 ... 26706 26707 26708] 

Test Index:  [10684 10685 10686 ... 16023 16024 16025]
Fold: 3
emprical error training(in %) = 10.268
emprical error testing(in %) = 16.305
Train Index:  [    0     1     2 ... 26706 26707 26708] 

Test Index:  [16026 16027 16028 ... 21365 21366 21367]
Fold: 4
emprical error training(in %) = 10.226
emprical error testing(in %) = 16.698
Train Index:  [    0     1     2 ... 21365 21366 21367] 

Test Index:  [21368 21369 21370 ... 26706 26707 26708]
Fold: 5
emprical error training(in %) = 9.832
emprical error testing(in %) = 17.562


### Print average of error percentages

In [15]:
print('Cross validated training error: ')
print(np.array(store_error_train).mean()*100)

print('Cross validated testing error: ')
print(np.array(store_error_test).mean()*100)

Cross validated training error: 
10.068705644594441
Cross validated testing error: 
16.983061110230604
