In [1]:
#%config Completer.use_jedi = False
# Jupyter notebook autocomplete was not working.

# !python -m spacy download en_core_web_trf
# Code to download the specific spacy core model or trf or whatever

import pandas as pd
import numpy as np
    
import re
import string
import torch
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords

import spacy
import spacy_transformers
from spacy.util import minibatch, compounding
from spacy.training import Example
# from spacy.gold import GoldParse # For use in nlp.update
# from spacy.training.example import Example

In [2]:
train = pd.read_csv('Data/twitter_training.csv', header = None)
train.columns = ['Id', 'Entity', 'Sentiment', 'Content']
train.head()

Unnamed: 0,Id,Entity,Sentiment,Content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
validate = pd.read_csv('Data/twitter_validation.csv', header = None)
validate.columns = ['Id', 'Entity', 'Sentiment', 'Content']
validate.head()

Unnamed: 0,Id,Entity,Sentiment,Content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [4]:
train.Sentiment.value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: Sentiment, dtype: int64

In [5]:
train.Sentiment.value_counts(normalize = True)

Negative      0.301840
Positive      0.278943
Neutral       0.245280
Irrelevant    0.173937
Name: Sentiment, dtype: float64

In [6]:
train.Entity.value_counts()

Microsoft                            2400
TomClancysRainbowSix                 2400
MaddenNFL                            2400
LeagueOfLegends                      2394
CallOfDuty                           2394
Verizon                              2382
CallOfDutyBlackopsColdWar            2376
ApexLegends                          2376
Facebook                             2370
Dota2                                2364
WorldOfCraft                         2364
NBA2K                                2352
TomClancysGhostRecon                 2346
Battlefield                          2346
FIFA                                 2340
Xbox(Xseries)                        2334
Overwatch                            2334
johnson&johnson                      2328
Amazon                               2316
PlayStation5(PS5)                    2310
HomeDepot                            2310
Cyberpunk2077                        2304
CS-GO                                2304
GrandTheftAuto(GTA)               

In [7]:
# Should we create new models for each Entity?
# Lets see if there are any entities in the train and/or test that are not in the other set.
entity_values = list(set(train.Entity.values.tolist()))
validation_values = list(set(validate.Entity.values.tolist()))

def diff_of_two_lists(list1 : list, list2 : list) -> list:
    return list(set(list1) - set(list2))

in_train_not_validate = diff_of_two_lists(entity_values, validation_values)
in_validate_not_train = diff_of_two_lists(validation_values, entity_values)

print(f'Diff of training and validate: {in_train_not_validate}')
print(f'Diff of validate and training: {in_validate_not_train}')

Diff of training and validate: []
Diff of validate and training: []


# Define Broad Functions

In [8]:
def remove_special_chars(text : str) -> str:
    '''This will remove special characters from a string'''
    return re.sub('[^A-Za-z0-9]+', '', text)

def clean_text(text : str) -> str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower() # make everythng lowercase
    text = re.sub('\[.*?\]', '', text) # remove text in squre brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove website links
    text = re.sub('<.*?>+', '', text) # remove anything within <...>
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text) # remove new line 
    text = re.sub('\w*\d\w*', '', text) # remove words containing numbers
    
    return text

def text_preprocessing(text : str, remove_duplicate : bool = False, remove_stop_words : bool = False) -> str:
    '''This takes a string, implements clean_text, and tokenizes the workds into a list.
    This can also remove duplicated words, but also remove stop words.'''  
    
    ### TODO: Update this to be all in the Spacy framework, i.e. drop NLTK
    
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    
    # If remove_duplicates is used, order is lost
    if remove_duplicate: 
        tokenized_text = list(set(tokenized_text))
    
    ### Where (if at all) should we remove the stop words, i.e., before we clean the text?  Dont and don't are diff.
    if remove_stop_words:
        tokenized_text = [w for w in tokenized_text if w not in stopwords.words('english')]
    
    combined_text = ' '.join(tokenized_text)
    
    return combined_text

# Define Slightly More Specific Functions

In [9]:
def process_tweets(data : pd.DataFrame) -> pd.DataFrame:
    data['Content_Clean'] = data['Content'].apply(str).apply(lambda x : text_preprocessing(x))
    data['Entity_Clean'] = data['Entity'].apply(str).apply(lambda x : text_preprocessing(x))
    data['Sentiment'] = data['Sentiment'].apply(str).apply(lambda x : x.upper())
    
    data['cats'] = [{
            "POSITIVE" : sentiment == "POSITIVE",
            "NEUTRAL" : sentiment == "NEUTRAL",
            "NEGATIVE" : sentiment == "NEGATIVE",
            "IRRELEVANT" : sentiment == "IRRELEVANT",
        } 
        for sentiment in data.Sentiment
    ]
    
    return data

#  This needs to be vetted and updated to our scenario.
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0   # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 0.0   # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


# High Level Analysis

In [10]:
train_count = train.\
    groupby(['Entity', 'Sentiment']).count()['Content'].reset_index().\
    sort_values(by = ['Entity', 'Sentiment'], ascending = True)

train_count_df = pd.DataFrame(columns = ['Positive', 'Negative', 'Neutral', 'Irrelevant'])

for _, row in train_count.iterrows():
    train_count_df.loc[row.Entity, row.Sentiment] = row.Content

train_count_pct_df = train_count_df.div(train_count_df.sum(axis = 1), axis = 0)

train_count_pct_df.head()

Unnamed: 0,Positive,Negative,Neutral,Irrelevant
Amazon,0.135325,0.252636,0.530316,0.081722
ApexLegends,0.269443,0.251169,0.39779,0.081598
AssassinsCreed,0.644136,0.16786,0.06983,0.118174
Battlefield,0.253022,0.200345,0.151554,0.395078
Borderlands,0.446053,0.186842,0.261842,0.105263


In [11]:
train_count_pct_df.loc[train_count_pct_df.idxmax().values, ]

Unnamed: 0,Positive,Negative,Neutral,Irrelevant
AssassinsCreed,0.644136,0.16786,0.06983,0.118174
MaddenNFL,0.166597,0.712663,0.082878,0.037863
Amazon,0.135325,0.252636,0.530316,0.081722
PlayerUnknownsBattlegrounds(PUBG),0.178156,0.303491,0.117726,0.400627


In [12]:
train_avg = train_count_pct_df.apply(np.mean, axis = 0)
train_std = train_count_pct_df.apply(np.std, axis = 0)

for idx, value in train_avg.iteritems():
    print(f'Sentiment {idx} has a mean of {value:.3f} with a standard deviation of {train_std[idx]:.3f}')

Sentiment Positive has a mean of 0.280 with a standard deviation of 0.113
Sentiment Negative has a mean of 0.301 with a standard deviation of 0.135
Sentiment Neutral has a mean of 0.244 with a standard deviation of 0.125
Sentiment Irrelevant has a mean of 0.175 with a standard deviation of 0.113


At this point, we know that each Entity has approximately the same number of samples (2400), but the distribution between Positive, Negative, Neutral, and Irrelevant can be very different.  I suspect that we will need multiple models for each Entity, but I am going to try a few things.

I am going to start with simple, one model for all of them.  I am going to try and think of a way to use the Entity as an additionl parameter, but will probably end up moving to a model for each entity, which seems inefficient, so additional considerations might be needed.

# Cleaning and Preprocessing

Here we are going to simply apply the text cleaning logic that we established above.  We obviously need to do this for the Content text itself, but I also want to do it for the Entity.  Eventually, I expect that we will need to create separate models for each entity and want the model names to be simple and consistent.

In [13]:
train_clean = process_tweets(train)
validate_clean = process_tweets(validate)

train_clean.head()

Unnamed: 0,Id,Entity,Sentiment,Content,Content_Clean,Entity_Clean,cats
0,2401,Borderlands,POSITIVE,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...,borderlands,"{'POSITIVE': True, 'NEUTRAL': False, 'NEGATIVE..."
1,2401,Borderlands,POSITIVE,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...,borderlands,"{'POSITIVE': True, 'NEUTRAL': False, 'NEGATIVE..."
2,2401,Borderlands,POSITIVE,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all,borderlands,"{'POSITIVE': True, 'NEUTRAL': False, 'NEGATIVE..."
3,2401,Borderlands,POSITIVE,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...,borderlands,"{'POSITIVE': True, 'NEUTRAL': False, 'NEGATIVE..."
4,2401,Borderlands,POSITIVE,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder yo...,borderlands,"{'POSITIVE': True, 'NEUTRAL': False, 'NEGATIVE..."


In [14]:
train_F = list(zip(train_clean.Content_Clean, train_clean.cats))
validate_F = list(zip(validate_clean.Content_Clean, validate_clean.cats))

train_F[5:7]

[('im getting into borderlands and i can murder you all',
  {'POSITIVE': True,
   'NEUTRAL': False,
   'NEGATIVE': False,
   'IRRELEVANT': False}),
 ('so i spent a few hours making something for fun if you dont know i am a huge borderlands fan and maya is one of my favorite characters so i decided to make myself a wallpaper for my pc here is the original image versus the creation i made enjoy',
  {'POSITIVE': True,
   'NEUTRAL': False,
   'NEGATIVE': False,
   'IRRELEVANT': False})]

# Very Basic, Singular Model

This actually became a bit more complicated than expected.  Typically in sentiment analysis, we can do a model that has zero for negative and one for positive.  The range between them can be seen as the "confidence" in the model for that specific input.  In this case, however, we have FOUR possibilities, so we need to have something a bit different -- custom text classifications.

https://www.machinelearningplus.com/nlp/custom-text-classification-spacy/

In [17]:
# https://stackoverflow.com/questions/62486950/spacy-training-model
# I am going to train this on an existing model, next time we can try a blank model.

# nlp = spacy.load('en_core_web_trf')
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")

sentiments = train_clean.Sentiment.unique().tolist()
for sentiment in sentiments:
    textcat.add_label(sentiment)

n_iter = 5
examples = []

for text, labels in train_F:
    examples.append(Example.from_dict(nlp.make_doc(text), {"cats": {"sentiment": labels}}))
    
#nlp.initialize(lambda : examples)

#other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
#with nlp.disable_pipes(*other_pipes):    
with nlp.select_pipes(enable = "textcat"):
    for i in tqdm(range(n_iter)):
        for batch in minibatch(examples, size = 8):
            # Does not work, hitting: Cannot get dimension 'nO' for model 'sparse_linear': value unset
            # https://github.com/explosion/spaCy/discussions/6485
            # https://stackoverflow.com/questions/68500136/training-a-basic-spacy-text-classification-model
            nlp.update(batch)

  0%|          | 0/5 [00:00<?, ?it/s]


ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

In [17]:
nlp.pipe_names

['transformer',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'textcat']

In [None]:
# maybe try this:
# https://www.kaggle.com/cameronwatts/twitter-sentiment-classification
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

#https://medium.com/analytics-vidhya/building-a-text-classifier-with-spacy-3-0-dd16e9979a

# here we go