In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import TweetTokenizer
from collections import defaultdict

In [2]:
dataset_path = 'sentiment_analysis.csv'
df = pd.read_csv(dataset_path, index_col = 'id')

In [3]:
def text_normalize(text):
    text = re.sub(r'^RT[\s]+', '', text)
    
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    
    text = re.sub(r'#', '', text)
    
    text = re.sub(r'[^\w\s]', '', text)
    
    tokenizer = TweetTokenizer(
        preserve_case=False,
        strip_handles=True,
        reduce_len=True)

    text_tokens = tokenizer.tokenize(text)
    return text_tokens

In [4]:
def get_freqs(df):
    freqs = defaultdict(lambda: 0)
    for idx, row in df.iterrows():
        tweet = row['tweet']
        label = row['label']
        
        tokens = text_normalize(tweet)
        for token in tokens:
            pair = (token, label)
            freqs[pair] += 1
            
    return freqs

In [5]:
def get_feature(text, freqs):
    tokens = text_normalize(text)
    
    X = np.zeros(3)
    X[0] = 1
    
    for token in tokens:
        X[1] += freqs[(token, 0)]
        X[2] += freqs[(token, 1)]
        
    return X

In [6]:
X = []
y = []

freqs = get_freqs(df)
for idx, row in df.iterrows():
    tweet = row['tweet']
    label = row['label']
    
    X_i = get_feature(tweet, freqs)
    X.append(X_i)
    y.append(label)
    
X = np.array(X)
y = np.array(y)

In [8]:
val_size = 0.2
test_size = 0.25
random_state = 2
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size = val_size,
    random_state = random_state,
    shuffle = is_shuffle)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size = test_size,
    random_state = random_state,
    shuffle = is_shuffle)

In [9]:
normalizer = StandardScaler()
X_train[:, 1:] = normalizer.fit_transform(X_train[:, 1:])
X_val[:, 1:] = normalizer.transform(X_val[:, 1:])
X_test[:, 1:] = normalizer.transform(X_test[:, 1:])

In [10]:
def sigmoid(z):
    return 1 / ( 1 + np.exp(-z))

def compute_loss(y_hat, y):
    y_hat = np.clip(y_hat, 1e-7, 1-1e-7)
    return (-y * np.log(y_hat) - (1-y)*np.log(1-y_hat)).mean()

def predict(X, theta):
    dot_product = np.dot(X, theta)
    y_hat = sigmoid(dot_product)
    
    return y_hat

def compute_gradient(X, y, y_hat):
    return np.dot(X.T, y_hat-y) / y.size

def update_theta(theta, gradient, lr):
    return theta - lr*gradient

def compute_accuracy(X, y, theta):
    y_hat = predict(X, theta).round()
    acc = (y_hat == y).mean()
    
    return acc

In [11]:
lr = 0.01
epochs = 200
batch_size = 128

np.random.seed(random_state)
theta = np.random.uniform(size = X_train.shape[1])

In [12]:
val_set_acc = compute_accuracy(X_val, y_val, theta)
test_set_acc = compute_accuracy(X_test, y_test, theta)
print('Evaluation on validation and test set:')
print(f'Accuracy: {val_set_acc}')
print(f'Accuracy: {test_set_acc}')

Evaluation on validation and test set:
Accuracy: 0.48737373737373735
Accuracy: 0.5088383838383839


In [13]:
def predict(X, theta):
    z = np.dot(X, theta)
    
    return 1 / (1 + np.exp(-z))

X = [[22.3, -1.5, 1.1, 1]]
theta = [0.1, -0.15, 0.3, -0.2]
print(predict(X, theta))

[0.92988994]


In [14]:
def compute_loss(y_hat, y):
    y_hat = np.clip(y_hat, 1e-7, 1-1e-7)
    return (-y * np.log(y_hat) - (1-y)*np.log(1-y_hat)).mean()

y = np.array([1, 0, 0, 1])
y_hat = np.array([0.8, 0.75, 0.3, 0.95])
print(compute_loss(y_hat, y))

0.5043515376900958


In [17]:
def compute_gradient(X, y, y_hat):
    return np.dot(X.T, y_hat - y) / y.size

X = np.array([[1, 2], [2, 1], [1, 1], [2, 2]])
y_true = np.array([0, 1, 0, 1])
y_pred = np.array([0.25, 0.75, 0.4, 0.8])

print(compute_gradient(X, y_true, y_pred))

[-0.0625  0.0625]


In [19]:
def compute_accuracy(y_true, y_pred):
    y_pred_rounded = np.round(y_pred)
    acc = np.mean(y_true == y_pred_rounded)
    
    return acc

y_true = [1, 0, 1, 1]
y_pred = [0.85, 0.35, 0.9, 0.75]
print(compute_accuracy(y_true, y_pred))

1.0


In [20]:
def compute_gradient(X, y, y_hat):
    return np.dot(X.T, y_hat - y) / y.size

X = np.array([[1, 3], [2, 1], [3, 2], [1, 2]])
y_true = np.array([1, 0, 1, 1])
y_pred = np.array([0.7, 0.4, 0.6, 0.85])
print(compute_gradient(X, y_true, y_pred))

[-0.2125 -0.4   ]
