In [1]:
import gzip
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# load the labelled reviews from the gzip-compressed CSV (opened in text mode)
with gzip.open("data/labelled_movie_reviews.csv.gz", mode="rt") as f:
    df = pd.read_csv(f)

In [3]:
# Draw a reproducible random 30% sample of the rows (which also shuffles them)
# and renumber the index from 0.
# NOTE: only a fraction of the dataset is used in this lab so that the model
#       does not take too long to train.
# NOTE: this overwrites `df`, so re-running this cell without re-reading the
#       data samples 30% of the already-reduced frame.
df = (
    df
    .sample(frac=0.3, random_state=123)
    .reset_index(drop=True)
)

In [4]:
# pull the text and label columns out of the frame as plain Python lists
Xr, Yr = df["text"].tolist(), df["label"].tolist()

# split boundaries: the first 70% is train, the next 10% is validation,
# and whatever remains (about 20%) is test
train_frac, val_frac, test_frac = 0.7, 0.1, 0.2
n_docs = len(Xr)
train_end = int(train_frac * n_docs)
val_end = int((train_frac + val_frac) * n_docs)

# slice documents and labels with the same boundaries so they stay aligned
X_train, Y_train = Xr[:train_end], Yr[:train_end]
X_val, Y_val = Xr[train_end:val_end], Yr[train_end:val_end]
X_test, Y_test = Xr[val_end:], Yr[val_end:]

In [29]:
# fit a linear model
# Xtr - A list of training documents provided as text
# Ytr - A list of training class labels
# C - the regularization parameter
print(set(Yr))  # sanity check: show the distinct class labels present in Yr
def fit_model(Xtr, Ytr, C):
    """Fit a bag-of-words logistic regression classifier.

    Args:
        Xtr: list of training documents provided as raw text.
        Ytr: list of training class labels, one per document.
        C: the regularization parameter handed to LogisticRegression
            (scikit-learn documents it as the inverse of the regularization
            strength). May be a plain number or a one-element list/array,
            which is the form scipy.optimize.minimize passes to its objective.

    Returns:
        (model, count_vec): the fitted LogisticRegression and the fitted
        CountVectorizer. The vectorizer is returned because it stores the
        word -> feature-index mapping that is needed again at test time.

    References:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    """
    # Accept both a scalar and a one-element sequence: take the first element
    # of the flattened value (np.ravel turns a scalar into a length-1 array).
    C_value = float(np.ravel(C)[0])

    # convert the text into sparse term-count vectors, learning the vocabulary
    count_vec = CountVectorizer()
    X = count_vec.fit_transform(Xtr)

    # Train directly on the sparse matrix. Calling .toarray() would allocate a
    # dense documents x vocabulary array, which is unnecessary because
    # LogisticRegression accepts sparse input.
    model = LogisticRegression(C=C_value)
    model.fit(X, Ytr)

    return model, count_vec


{'pos', 'neg'}


In [23]:
# test a fitted linear model
# Xtst - A list of test or validation documents
# Ytst - A list of test or validation class labels
def test_model(Xtst, Ytst, model, count_vec):
    """Evaluate a fitted linear model and return its accuracy.

    Args:
        Xtst: list of test or validation documents (raw text).
        Ytst: list of test or validation class labels, one per document.
        model: a fitted classifier exposing ``predict`` (as returned by
            fit_model).
        count_vec: the fitted CountVectorizer returned alongside ``model``.

    Returns:
        The accuracy computed by sklearn's accuracy_score on the given set.
        See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
    """
    # transform (not fit_transform): reuse the vocabulary learned at fit time
    Xt = count_vec.transform(Xtst)

    # Predict straight from the sparse matrix; densifying it with .toarray()
    # only costs memory and is not needed by predict.
    predictions = model.predict(Xt)

    return accuracy_score(Ytst, predictions)

In [41]:
# TODO: search for the best C parameter by 
#       training on the training set and testing on the validation set
#       you should use fit_model and test_model
def obj_func(C, Xtst, Ytst, Xtr, Ytr):
    """Objective for the C search: negative accuracy of a model fit with C.

    A model is trained on (Xtr, Ytr) via fit_model and scored on (Xtst, Ytst)
    via test_model. The accuracy is negated so that a minimizer ends up
    maximizing accuracy.
    """
    fitted_model, vectorizer = fit_model(Xtr, Ytr, C)
    accuracy = test_model(Xtst, Ytst, fitted_model, vectorizer)
    return -accuracy
#Special Imports for this lab

from scipy.optimize import minimize
import tqdm.notebook as tq

def Search(num_iters, Xtr, Ytr, Xtst, Ytst, Xval, Yval, solver="L-BFGS-B", terminate=0.1):
    """
    Search for the C value of the logistic regression that maximizes accuracy.

    Runs scipy's ``minimize`` on ``obj_func`` up to ``num_iters`` times, each
    time restarting from the best C found so far. ``obj_func`` trains on
    (Xtr, Ytr) and scores on (Xtst, Ytst), so C is tuned against (Xtst, Ytst).
    Afterwards a model at the best C and a model at the starting C are both
    scored on (Xval, Yval) and the results are printed. Nothing is returned.

    Args:
        1) num_iters: Maximum number of calls to minimize
        2) Xtr, Ytr: documents/labels the models are trained on
        3) Xtst, Ytst: documents/labels used to score candidates during the search
        4) Xval, Yval: documents/labels used only for the final printed comparison
        5) solver: optimization method name passed to minimize
        6) terminate: the loop stops once the mean absolute change between the
           last 4 candidate C values is <= terminate

    NOTE(review): accuracy changes in discrete jumps as C varies, so a
    gradient-based method such as the default "L-BFGS-B" may see a zero
    finite-difference gradient and stop at its starting point. A
    derivative-free method may be a better fit -- confirm before relying on
    the result.
    """
    # Starting point of the search. NOTE: scikit-learn's own default is C=1.0;
    # 2.0 is simply where this search begins and what the baseline uses.
    initial_x = [2.0]
    args = (Xtst, Ytst, Xtr, Ytr)  # extra arguments for the objective function
    Bounds = [(0.001, 20)]  # search interval for C

    # Best candidate so far, always kept as plain floats so that indexing and
    # formatting behave the same on every iteration.
    best_c = float(initial_x[0])
    best_value = obj_func(initial_x, *args)

    # Early termination: remember the last `window` candidates, which gives
    # `window - 1` consecutive step sizes to average.
    window = 4
    recent = []

    for _ in tq.tqdm(range(num_iters)):
        result = minimize(obj_func, x0=[best_c], args=args, method=solver, bounds=Bounds)
        candidate_c = float(np.ravel(result.x)[0])
        # result.fun already holds the objective at result.x, so no extra
        # model fit is needed to obtain it.
        candidate_value = float(np.ravel(result.fun)[0])

        # Keep the candidate only if it improves the objective (lower negative
        # accuracy means higher accuracy), not merely if C itself is smaller.
        if candidate_value < best_value:
            best_c, best_value = candidate_c, candidate_value
        print(candidate_c, best_c)

        # Build a new list rather than using list.append, whose return value
        # is None and must not be assigned back.
        recent = (recent + [candidate_c])[-window:]
        if len(recent) == window:
            steps = np.abs(np.diff(recent))
            if steps.mean() <= terminate:
                break

    # Print metrics; best_value is negated back to get an accuracy.
    print("Current Best C: %.2f\n Value at C: %.2f\n" % (best_c, -best_value))
    print("Get Validation Metrics\n")
    # C is passed as a one-element list, the same form minimize uses.
    modelO, count_vecO = fit_model(Xtr, Ytr, [best_c])  # optimized
    model0, count_vec0 = fit_model(Xtr, Ytr, initial_x)  # starting C for comparison
    OptimizedResult = test_model(Xval, Yval, modelO, count_vecO)
    BaseResult = test_model(Xval, Yval, model0, count_vec0)
    print("Optimized Accuracy: %.2f, Base (Initial C = %.2f) Accuracy: %.2f\n"
          % (OptimizedResult, initial_x[0], BaseResult))
    print("done")


# Search tunes C against its Xtst/Ytst arguments and prints the final
# comparison on Xval/Yval. Pass the validation split as the tuning set so the
# test split is not used to choose C, and keep the test split for the report.
Search(100, X_train, Y_train, Xtst=X_val, Ytst=Y_val, Xval=X_test, Yval=Y_test)

In [None]:
# TODO: fit the model to the concatenated training and validation set
#       test on the test set and print the result

In [None]:
# TODO: find the words corresponding to the 5 largest (most positive) and 
#       5 smallest (most negative) co-efficients of the linear model
# Hint: a fitted LogisticRegression model in sklearn has a coef_ attribute which stores the co-efficients
#       CountVectorizer has a vocabulary_ attribute that stores a mapping of terms to feature indices