In [1]:
import re
import numpy as np
import pandas as pd

import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

import torch
from torch.utils.data import TensorDataset, DataLoader

In [2]:
file_name = 'data/tweets.csv'
headers = ['target', 'ids', 'date', 'flag', 'user', 'text']

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stopwords_list = stopwords.words('english')

device =  torch.device('cpu' if not torch.cuda.is_available() else 'cuda:0') 
batch_size = 128
random_state = 123

print("Device : {}".format(device))

Device : cuda:0


In [3]:
def get_data(file_namet, down_sample=True):
    data = pd.read_csv(
                    file_name,
                    names=headers,
                    encoding='latin-1'
                    )
    
    data = shuffle(data)
    
    data = data[['target', 'text']]
    data['target'] = data['target'].astype(int)
    data = data.dropna()
    
    if down_sample:
        data = data.sample(
                        frac=0.01, 
                        replace=False, 
                        random_state=random_state
                        ) # Since I have Limited Resources I downsample the dataset heavily
    return data

def lemmatization(lemmatizer,sentence):
    lem = [lemmatizer.lemmatize(k) for k in sentence]
    return [k for k in lem if k]

def remove_stop_words(stopwords_list,sentence):
    return [k for k in sentence if k not in stopwords_list]

def preprocess_one(tweet):
    tweet = tweet.lower()
    remove_punc = tokenizer.tokenize(tweet) # Remove puntuations
    remove_num = [re.sub('[0-9]', '', i) for i in remove_punc] # Remove Numbers
    remove_num = [i for i in remove_num if len(i)>0] # Remove empty strings
    lemmatized = lemmatization(lemmatizer,remove_num) # Word Lemmatization
    remove_stop = remove_stop_words(stopwords_list,lemmatized) # remove stop words
    updated_tweet = ' '.join(remove_stop)
    return updated_tweet

def preprocessed_data(tweets):
    updated_tweets = []
    if isinstance(tweets, np.ndarray) or isinstance(tweets, list):
        for tweet in tweets:
            updated_tweet = preprocess_one(tweet)
            updated_tweets.append(updated_tweet)
    elif isinstance(tweets, np.str_)  or isinstance(tweets, str):
        updated_tweets = [preprocess_one(tweets)]

    return np.array(updated_tweets)

In [24]:
data = get_data(file_name)

X = data.text.values
Y = data.target.values

In [25]:
encoder = LabelEncoder()
encoder.fit(Y)

Y = encoder.transform(Y)
X = preprocessed_data(X)

In [26]:
X

array(['aunt watching wee cousin mum aunt looking one done bunk',
       'evan_wells well deserved live demo wa one highlight e point view keep great work',
       'rip david eddings', ..., 'hwy another part country',
       'bekakeb thank yaa bekkaa',
       'listening chemical romance dad bracelet got wet iranelection'],
      dtype='<U217')

In [27]:
def extract_vocabulary():
    vocabulary = {}
    for x in X:
        tokens = x.split(' ')
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = 1
            else:
                vocabulary[token] += 1
                
    vocabulary = {k: v for k, v in sorted(vocabulary.items(), key=lambda item: item[1], reverse=True)}
    vocabulary = {k : idx+1 for idx, k in enumerate(vocabulary.keys())}
    return vocabulary

In [28]:
vocabulary = extract_vocabulary()

In [29]:
def extract_all_tokens(label):
    Xc = X[Y==label]
    Xc_tokens = []

    for xc in Xc:
        xc_tokens = xc.split(' ') 
        Xc_tokens.extend(xc_tokens)
        
    return Xc_tokens
        
def extract_frequencies():
    frequency_dict = {}

    Xpos_tokens = extract_all_tokens(1)
    Xneg_tokens = extract_all_tokens(0)
    
    print('Class-wise Tokens Extracted')
    
    for token in vocabulary.keys():
        frequency_dict[(token, 1)] = Xpos_tokens.count(token)
        frequency_dict[(token, 0)] = Xneg_tokens.count(token)
        
    return frequency_dict

In [30]:
frequency_dict = extract_frequencies()

Class-wise Tokens Extracted


In [31]:
def extract_features():
    X_features = torch.empty(len(X), 3, dtype=torch.float32)
    for idx, x in enumerate(X):
        x_tokens = x.split(' ') 
        pos_freq_sum = sum([frequency_dict[(token, 1)] for token in x_tokens])
        neg_freq_sum = sum([frequency_dict[(token, 0)] for token in x_tokens])
        x_features = torch.tensor([1.0, pos_freq_sum, neg_freq_sum])
        
        X_features[idx, :] = x_features
        
    return X_features

In [32]:
Xm = extract_features()
Ym = torch.from_numpy(Y)

In [34]:
Xm.shape, Ym.shape

(torch.Size([16000, 3]), torch.Size([16000]))

In [45]:
'''

    Logistic Regression Equation:
            
        P = sig(Xm * Θ)
        
        Xm.shape = 16000, 3
        Ym.shape = 16000
        
        Θ.shape = (3)

'''

def sigmoid(Z):
    return 1 / (1 + torch.exp(-Z))

def logistic_regression(Xbatch, Θ):
    return sigmoid(torch.matmul(Xbatch, Θ))

def gradient(Xbatch, Ybatch, Pbatch):
    return torch.mean(torch.matmul(Xbatch.T, Pbatch - Ybatch))
    
def weight_update(Θ, dΘ, alpha=0.01):
    Θ = Θ - alpha * dΘ
    return Θ

def loss(Xbatch, Ybatch, Pbatch):
    return torch.mean(torch.matmul(Xbatch.T, Pbatch - Ybatch))

In [47]:
Θ = torch.rand(3)
P = logistic_regression(Xm, Θ)

tensor(3539218.2500)