## Creating a logistic regression model for sentiment analysis

Borrowed heavily (almost entirely) from https://kavita-ganesan.com/news-classifier-with-logistic-regression-in-python/#.Yui_xXbMKUl

Code also on git at https://github.com/kavgan/nlp-in-practice/blob/2d9e23c1d8ab56e9533be188c9ce7a0f6efc11e1/text-classification/notebooks/Text%20Classification%20with%20Logistic%20Regression.ipynb

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the preprocessed comments. The CSV's first column becomes the index and is
# then discarded, so rows are renumbered 0..n-1.
# To work on a subset, chain .sample(n=50000, random_state=0) after read_csv.
reviews = pd.read_csv("comments_preproc.csv", index_col=0).reset_index(drop=True)

reviews.head()

Unnamed: 0,firstName,lastName,comment,clarityRating,cleaned comment,sentiment
0,Marty,Beans,Very nice and understanding. A lot of homework...,4,nice understand lot homework grade problem num...,1
1,Marty,Beans,She is very helpful. Gives EC if you go to tut...,4,"helpful ec tutor poor office hour , willing he...",1
2,Marty,Beans,she was nice. good job her,4,nice good job,1
3,Marty,Beans,Professor Beans is one of the best Math teache...,5,professor bean good math teacher come absolute...,1
4,Marty,Beans,Big smile. Big goals. Nice lady. Bright and ch...,5,big smile big goal nice lady bright cheery swe...,1


#### Interestingly, some comments are entirely empty after being sampled

They must have consisted entirely of stop words that just got removed lmao

In [3]:
def _sentiment_from_rating(rating):
    """Map a clarity rating to 1 (above 3), 0 (exactly 3) or -1 (anything else)."""
    if rating > 3:
        return 1
    if rating == 3:
        return 0
    return -1


# Overwrite the sentiment column with labels derived from clarityRating.
reviews["sentiment"] = reviews["clarityRating"].apply(_sentiment_from_rating)

In [4]:
# Count the rows whose cleaned comment is missing.
cleaned_is_missing = reviews["cleaned comment"].isna()
cleaned_is_missing.sum()

242

In [5]:
# Keep only the rows that still have a cleaned comment.
reviews = reviews.dropna(subset=["cleaned comment"])

Next few cells are function definitions

In [6]:
# Extract features using different available methods

def extract_features(df, train_data, test_data, type="binary"):
    """Vectorize the "cleaned comment" column of the train and test splits.

    The vectorizer is fitted on the training split only and then applied to
    the test split, so no test vocabulary leaks into the features.

    Parameters
    ----------
    df : unused; kept so existing callers keep working.
    train_data, test_data : objects indexable by "cleaned comment" whose
        result exposes ``.values`` (e.g. pandas DataFrames).
    type : str
        Matched by substring, in this order:
        contains "binary" -> CountVectorizer(binary=True),
        contains "count"  -> CountVectorizer(binary=False)
                             (so both "count" and "counts" work),
        anything else     -> TfidfVectorizer (e.g. "tfidf").
        (The name shadows the builtin ``type``; it is kept because callers
        pass it by keyword.)

    Returns
    -------
    (train_features, test_features, fitted_vectorizer)
    """
    if "binary" in type:
        # binary feature representation (term present / absent)
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "count" in type:
        # count-based feature representation.
        # Previously this tested for "counts", so feature_rep="count" fell
        # through to the TF-IDF branch without any warning.
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF based feature representation
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    train_docs = train_data["cleaned comment"].values
    test_docs = test_data["cleaned comment"].values

    # fit_transform already returns the training matrix; there is no need to
    # fit first and then transform the same documents a second time.
    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)

    return train_features, test_features, vectorizer

In [7]:
def get_top_k_predictions(model, X_test, k):
    """Return, for every row of X_test, the k most probable classes.

    Each inner list is ordered from most to least probable. Labels are
    looked up in ``model.classes_`` using the column order of
    ``model.predict_proba``.
    """
    class_probs = model.predict_proba(X_test)

    # argsort is ascending, so the k best column indices are the last k;
    # flip them to get descending order of probability.
    ascending_idx = np.argsort(class_probs, axis=1)
    top_k_desc = ascending_idx[:, -k:][:, ::-1]

    return [[model.classes_[col] for col in row] for row in top_k_desc]

In [8]:
def collect_preds(Y_test, Y_preds):
    """Pair each prediction list with its gold label.

    Returns a list of ``[[gold_label], predictions]`` entries, one per
    element of Y_preds; the gold label is taken from Y_test at the same
    position.
    """
    paired = []
    for position, predicted in enumerate(Y_preds):
        gold = [Y_test[position]]
        paired.append([gold, predicted])
    return paired

In [9]:
def compute_accuracy(eval_items:list):
    """Fraction of items whose true label appears among the predictions.

    Parameters
    ----------
    eval_items : list of ``[true_labels, predicted_labels]`` pairs, as
        produced by ``collect_preds``.

    Returns
    -------
    float : hits divided by ``len(eval_items)``; 0.0 when the list is empty
        (previously this raised ZeroDivisionError).
    """
    if len(eval_items) == 0:
        return 0.0

    correct = 0
    for item in eval_items:
        machine_pred = set(item[1])
        # count every true label that the model included in its predictions
        correct += sum(1 for cat in item[0] if cat in machine_pred)

    return correct / float(len(eval_items))

In [10]:
def _reciprocal_rank(true_labels:list, machine_preds:list):
    """Return 1 / (1-based position of the first correct prediction).

    Returns 0 when none of ``machine_preds`` is in ``true_labels``.
    """
    for rank, predicted in enumerate(machine_preds, start=1):
        if predicted in true_labels:
            return 1 / float(rank)
    return 0

In [11]:
# Compute mean reciprocal rank (MRR), which I understand as follows:
""" This is admittedly much more useful when dealing with many categories.
For each example, take 1 / (rank of the first correct category within the top-k predictions), or 0 if it is not there, then average over all examples (often shown as a percentage).
If top_k equals the number of possible categories (e.g. two categories and top_k=2), the true label is always somewhere in the list, so accuracy comes out to 100% lol; and if top_k=1, accuracy and MRR are the same.
"""

def compute_mrr_at_k(items:list):
    """Mean reciprocal rank over ``items``.

    Parameters
    ----------
    items : list of ``[true_labels, predicted_labels]`` pairs; each pair is
        scored with ``_reciprocal_rank``.

    Returns
    -------
    float : the average reciprocal rank; 0.0 when ``items`` is empty
        (previously this raised UnboundLocalError because the result was
        only assigned inside the loop).
    """
    if len(items) == 0:
        return 0.0

    rr_total = 0
    for item in items:
        rr_total += _reciprocal_rank(item[0], item[1])

    # average once, after all items have been accumulated
    return rr_total / float(len(items))

In [12]:
def train_model(df, field="cleaned comments", feature_rep="binary", top_k=1):
    """Train a logistic-regression sentiment classifier and score it.

    Parameters
    ----------
    df : frame with a "sentiment" label column; it is split into train and
        test parts with ``train_test_split(df, random_state=0)``.
    field : currently unused. ``extract_features`` always reads the
        "cleaned comment" column. Kept so existing calls keep working.
    feature_rep : feature representation name forwarded to
        ``extract_features`` as ``type``.
    top_k : how many of the most probable classes to keep per test row
        when scoring.

    Returns
    -------
    (model, feature_transformer, accuracy, mrr_at_k)
    """
    # get train-test split
    train_data, test_data = train_test_split(df, random_state=0)

    # isolate labels in training and testing data
    y_train = train_data["sentiment"].values
    y_test = test_data["sentiment"].values

    # get features; pass the frame we were given rather than the notebook's
    # global `reviews`, so the function works for any input frame
    X_train, X_test, feature_transformer = extract_features(df, train_data, test_data, type=feature_rep)

    # create model and fit to training data
    log_reg = LogisticRegression(verbose=1, solver="liblinear", random_state=0, C=5, penalty="l2", max_iter=1000)
    model = log_reg.fit(X_train, y_train)

    # get k most relevant predictions
    preds = get_top_k_predictions(model, X_test, top_k)

    # pair predicted values with ground truth in a list of lists (for ease of evaluation)
    eval_items = collect_preds(y_test, preds)

    # get final stats on success rate of model
    accuracy = compute_accuracy(eval_items)
    mrr_at_k = compute_mrr_at_k(eval_items)

    return model, feature_transformer, accuracy, mrr_at_k

*Finally ready to start actually using the model*

In [13]:
# Binary feature representation, scoring only the single best prediction.
feature, top_k = "binary", 1

model, transformer, accuracy, mrr = train_model(
    reviews, "cleaned comments", feature_rep=feature, top_k=top_k
)

summary = "Accuracy={0}; MRR={1}".format(accuracy, mrr)
print("\n*** USING BINARY FEATURE REPRESENTATION ***")
print(summary)

[LibLinear]
*** USING BINARY FEATURE REPRESENTATION ***
Accuracy=0.7714593695854202; MRR=0.7714593695854202


In [14]:
# Request the "count" feature representation, scoring only the single best prediction.
feature, top_k = "count", 1

model, transformer, accuracy, mrr = train_model(
    reviews, "cleaned comments", feature_rep=feature, top_k=top_k
)

summary = "Accuracy={0}; MRR={1}".format(accuracy, mrr)
print("\n*** USING COUNT-BASED FEATURE REPRESENTATION ***")
print(summary)

[LibLinear]

In [None]:
# Request the "tfidf" feature representation, scoring only the single best prediction.
feature, top_k = "tfidf", 1

model, transformer, accuracy, mrr = train_model(
    reviews, "cleaned comments", feature_rep=feature, top_k=top_k
)

summary = "Accuracy={0}; MRR={1}".format(accuracy, mrr)
print("\n*** USING TF-IDF FEATURE REPRESENTATION ***")
print(summary)