In [None]:
#@title Auxiliary Functions and Dependencies ⚠️
#@markdown ⚡ Run this cell to load the functions required for the exam, as well as all the dependencies and external libraries used in the process.



import nltk

# Tweet Sample Dataset
nltk.download('twitter_samples')

# POS Tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# Stop Words
nltk.download('stopwords')

# Numpy
import numpy as np

# Regular Expressions
import re

# DataFrames
import pandas as pd

# Math
import math

# Interactive Widgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, FloatSlider, Layout

#Model Selection and Validation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

def printTokensInVocabs(tokens):
  """Print, for each fitted vectorizer, which of `tokens` are in its vocabulary.

  Reads the module-level `tfCounter`, `tfNormalizedCounter` and `tfIdfCounter`
  objects; they must already exist and expose a `vocabulary_` mapping when
  this function is called.

  Args:
    tokens: iterable of token strings to look up.

  Returns:
    None. One line per vectorizer is written to stdout.
  """
  counters = {'CountVectorizer': tfCounter,'TF Normalized':tfNormalizedCounter,'TfIdfVectorizer':tfIdfCounter}
  for counterName, counter in counters.items():
    # Look the tokens up in *this* counter's vocabulary, and use the `tokens`
    # argument rather than a global, so each line reflects its own vectorizer.
    vocabulary = counter.vocabulary_
    newTokens = [token for token in tokens if token in vocabulary]
    print(f'Tokens in the Vocabulary of {counterName}: \t{newTokens}')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.corpus import twitter_samples 

# Raw tweet texts for each sentiment class, read from the NLTK sample corpus.
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

# Single list of texts: all positive tweets first, then all negative ones.
allTweets = [*positiveTweets, *negativeTweets]

nPositive, nNegative = len(positiveTweets), len(negativeTweets)

# Labels follow the same ordering as allTweets: 1.0 = positive, 0.0 = negative.
positiveLabels = np.ones(nPositive)
negativeLabels = np.zeros(nNegative)
allLabels = np.concatenate((positiveLabels, negativeLabels))

In [None]:
def preprocessTweet(tweet):
  """Strip URLs, e-mail addresses and @mentions from a tweet.

  The substitutions run in a fixed order: e-mails are removed before mentions
  so that the `@domain` part of an address is not treated as a mention.
  Runs of whitespace are then collapsed to a single space (leading/trailing
  whitespace is collapsed, not stripped).

  Args:
    tweet: the raw tweet text.

  Returns:
    The cleaned tweet text.
  """
  # Raw strings keep the regex escapes (\S, \w, \s) out of Python's own
  # string-escape processing; the compiled patterns are unchanged.
  tweet = re.sub(r'http[s]?://[\S]+', ' ', tweet)              # Remove URLs
  tweet = re.sub(r'[\w]+([._-]\w+)*@\w+([.]\w+)*', ' ', tweet) # Remove e-mails
  tweet = re.sub(r'@\S+','', tweet)                            # Remove mentions
  tweet = re.sub(r'\s+', ' ', tweet)                           # Collapse repeated whitespace into a single space
  return tweet

In [None]:
def cleanTokens(tokens):
  """Lower-case tokens and filter out punctuation and numeric tokens.

  For each token, in order:
    * it is lower-cased;
    * it is dropped if it is exactly one punctuation character;
    * it is dropped if it begins with a digit;
    * a leading '#' is removed from hashtags.

  Args:
    tokens: iterable of token strings.

  Returns:
    A new list with the surviving, normalised tokens.
  """
  newTokens = []
  for token in tokens:
    token = token.lower()
    # Raw strings keep the regex escapes out of Python's string-escape
    # processing; the pattern values are the same as before.
    if re.match(r'''^[_*#!$@<=^`>%&'"/()\[\]\-+,.:;?]$''', token): # Remove tokens that are 1 single punctuation
      continue
    # NOTE(review): re.match only anchors at the start, so this also drops
    # tokens that merely begin with a digit (e.g. '2day'), not just pure
    # numbers. Behaviour kept as-is; confirm whether that is intended.
    if re.match(r'\d+', token): # Remove Numbers
      continue
    if re.match(r'#[\w\d]+', token): # Strip the leading '#' from hashtags
      token = token[1:]
    newTokens.append(token)
  return newTokens

In [None]:
def splitTokens(tokens):
  """Break camelCase tokens apart at each lower-case -> upper-case boundary.

  Args:
    tokens: iterable of token strings.

  Returns:
    A flat list in which every token has been replaced by its pieces
    (tokens without such a boundary are kept whole).
  """
  # Zero-width match: positioned after a lower-case letter and before an upper-case one.
  camelBoundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
  return [piece for token in tokens for piece in camelBoundary.split(token)]

In [None]:
from nltk.tokenize import TweetTokenizer

def tokenizeTweet(tweet):
  """Turn a tweet into its list of cleaned tokens.

  Pipeline: NLTK's TweetTokenizer -> splitTokens -> cleanTokens.

  Args:
    tweet: the tweet text to tokenize.

  Returns:
    The list produced by cleanTokens for this tweet.
  """
  rawTokens = TweetTokenizer().tokenize(tweet)
  return cleanTokens(splitTokens(rawTokens))

In [None]:
from nltk.corpus import stopwords

# English stop-word list from the NLTK corpus; passed as `stop_words` to the
# vectorizers created in buildVectorizers.
englishStopWords = stopwords.words('english')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
def buildVectorizers(max_features):
  """Fit three bag-of-words vectorizers on the module-level `allTweets`.

  The three vectorizers share the same preprocessor, tokenizer, stop-word list
  and vocabulary size limit; they differ only in how term weights are computed:
    1. CountVectorizer                      -> raw term counts
    2. TfidfVectorizer(use_idf=False, l2)   -> L2-normalised term frequency
    3. TfidfVectorizer (defaults)           -> TF-IDF

  Args:
    max_features: value forwarded as `max_features` to every vectorizer.

  Returns:
    A 6-tuple: (tfRepresentation, tfNormalizedRepresentation,
    tfIdfRepresentation, tfCounter, tfNormalizedCounter, tfIdfCounter),
    i.e. the three fit_transform results followed by the three fitted
    vectorizers.
  """
  # Keyword arguments common to all three vectorizers.
  sharedArgs = dict(
    preprocessor = preprocessTweet,
    stop_words = englishStopWords,
    tokenizer = tokenizeTweet,
    max_features = max_features,
  )

  tfCounter = CountVectorizer(**sharedArgs)                                          # Term Frequency
  tfNormalizedCounter = TfidfVectorizer(use_idf = False, norm = 'l2', **sharedArgs)  # TF Normalized
  tfIdfCounter = TfidfVectorizer(**sharedArgs)                                       # TF-IDF Normalized

  tfRepresentation = tfCounter.fit_transform(allTweets)
  tfNormalizedRepresentation = tfNormalizedCounter.fit_transform(allTweets)
  tfIdfRepresentation = tfIdfCounter.fit_transform(allTweets)

  return (
    tfRepresentation, tfNormalizedRepresentation, tfIdfRepresentation,
    tfCounter, tfNormalizedCounter, tfIdfCounter,
  )

## ☑️ Pre-Task 1.4: Train Function (0 Points)

In [None]:
def trainAndEvaluate(tweets, labels):
  """Train a LogisticRegressionCV classifier and report its metrics.

  The data is split with train_test_split (shuffled, random_state=10), the
  model is fitted on the training part, and accuracy is printed for both
  parts. Precision, sensitivity and specificity are computed on the held-out
  test part only, so they are not inflated by rows the model was fitted on.

  Args:
    tweets: feature matrix, one row per tweet.
    labels: binary class labels aligned with the rows of `tweets`.

  Returns:
    dict with keys 'model', 'testAcc', 'trainAcc', 'precision',
    'sensitivity' and 'specificity'.
  """
  # Split Dataset in Train and Test
  X_train, X_test, y_train, y_test = train_test_split(tweets, labels, shuffle=True, random_state=10)

  # Build and Train the Model
  model = LogisticRegressionCV(max_iter=2000)
  model.fit(X_train,y_train)

  # Calculate Accuracy
  trainAcc = model.score(X_train, y_train)
  print(f'Train Accuracy: {trainAcc*100:.2f}%')
  testAcc = model.score(X_test, y_test)
  print(f'Test Accuracy: {testAcc*100:.2f}%\n')

  # Calculate other metrics on the held-out split only; evaluating on the
  # full dataset would mix in the training rows and overstate the scores.
  tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
  precision = tp / (tp + fp)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  print(f'Precision: {precision*100:.2f}%')
  print(f'Sensitivity: {sensitivity*100:.2f}%')
  print(f'Specificity: {specificity*100:.2f}%')

  # Return Variables
  results = {
      'model': model,
      'testAcc':testAcc,
      'trainAcc':trainAcc,
      'precision':precision,
      'sensitivity':sensitivity,
      'specificity':specificity
  }
  return results

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix

In [None]:

# Vocabulary size limit shared by the three vectorizers.
max_features = 2000

(
  tfRepresentation, tfNormalizedRepresentation, tfIdfRepresentation,
  tfCounter, tfNormalizedCounter, tfIdfCounter,
) = buildVectorizers(max_features)


def _trainOnRepresentation(title, representation):
  """Print a spacer and the model title, then train/evaluate on `representation`."""
  print('\n')
  print(title)
  return trainAndEvaluate(representation, allLabels)


tfResults = _trainOnRepresentation('Model (1) Absolute Term Frequency (Word Count)\n', tfRepresentation)
tfNormResults = _trainOnRepresentation('Model (2) Normalized Term Frequency (TF-IDF with IDF disabled)\n', tfNormalizedRepresentation)
tfIdfResults = _trainOnRepresentation('Model (3) TF-IDF Representation\n', tfIdfRepresentation)





Model (1) Absolute Term Frequency (Word Count)

Train Accuracy: 99.99%
Test Accuracy: 99.48%

Precision: 99.74%
Sensitivity: 99.98%
Specificity: 99.74%


Model (2) Normalized Term Frequency (TF-IDF with IDF disabled)

Train Accuracy: 99.99%
Test Accuracy: 99.64%

Precision: 99.82%
Sensitivity: 99.98%
Specificity: 99.82%


Model (3) TF-IDF Representation

Train Accuracy: 99.95%
Test Accuracy: 99.80%

Precision: 99.86%
Sensitivity: 99.96%
Specificity: 99.86%


In [None]:
def trainAndSeeErrors(model, tweets, labels, texts=None):
  """Print every tweet whose predicted label differs from its true label.

  Despite the name, no training happens here: `model` must already be fitted.
  All rows are predicted in one `model.predict` call instead of one call per
  row, and the feature matrix is passed as given (it is not densified).

  Args:
    model: fitted classifier exposing `predict`.
    tweets: feature matrix, one row per tweet, accepted by `model.predict`.
    labels: true labels aligned with the rows of `tweets`; 1 is shown as
      '(+)', anything else as '(-)'.
    texts: optional sequence of original tweet texts aligned with `tweets`.
      Defaults to the module-level `allTweets`.

  Returns:
    None. Misclassified tweets are written to stdout.
  """
  if texts is None:
    texts = allTweets
  predictions = np.asarray(model.predict(tweets)).ravel()
  realLabels = np.asarray(labels).ravel()
  # Indices where the prediction disagrees with the ground truth, in row order.
  for i in np.flatnonzero(predictions != realLabels):
    print('---------')
    label = '(+)' if realLabels[i] == 1 else '(-)'
    print(f'{label} -> {texts[i]}')

In [None]:
# List the tweets misclassified by Model (1), the word-count representation.
trainAndSeeErrors(tfResults['model'], tfRepresentation, allLabels)

---------
(+) -> @ellekagaoan @chinmarquez Catch up once in a while :( &gt;:D&lt; @aditriphosphate @ErinMonzon
---------
(-) -> all time looww(:(
---------
(-) -> hugs baek tight : (
---------
(-) -> @wtfxmbs AMBS please it's harry's jeans :)):):):(
---------
(-) -> laomma design; Kebaya &amp; Wedding Dress. Bandung - Indonesia
LINE: laomma, 
7DF89150
WHATSAPP : (+62) 089624641747
Instagram : Laomma_Couture
---------
(-) -> @c_tuilagi Anytime Lil Nigga!! (: (:
---------
(-) -> i pOPPED CONFETTI THOUGH ! ! : ( https://t.co/Y79gPDxTIE
---------
(-) -> Zehr khany ka time is coming soon.....: (
---------
(-) -> Annnd, now not going to Winchester {:-(
---------
(-) -> pats jay : (
---------
(-) -> @bae_ts WHATEVER STIL L YOUNG &gt;:-(
---------
(-) -> the internet is being a total bitch : (
---------
(-) -> my beloved grandmother : ( https://t.co/wt4oXq5xCf
---------
(-) -> @CHEDA_KHAN Thats life. I get calls from people I havent seen in 20 years and its always favours : (


In [None]:
# List the tweets misclassified by Model (2), the normalized term-frequency representation.
trainAndSeeErrors(tfNormResults['model'], tfNormalizedRepresentation, allLabels)

---------
(+) -> @ellekagaoan @chinmarquez Catch up once in a while :( &gt;:D&lt; @aditriphosphate @ErinMonzon
---------
(-) -> all time looww(:(
---------
(-) -> stu is mean, i just wanna sleep : (
---------
(-) -> @c_tuilagi Anytime Lil Nigga!! (: (:
---------
(-) -> i pOPPED CONFETTI THOUGH ! ! : ( https://t.co/Y79gPDxTIE
---------
(-) -> Zehr khany ka time is coming soon.....: (
---------
(-) -> Annnd, now not going to Winchester {:-(
---------
(-) -> my beloved grandmother : ( https://t.co/wt4oXq5xCf
---------
(-) -> @CHEDA_KHAN Thats life. I get calls from people I havent seen in 20 years and its always favours : (
---------
(-) -> Sr. Financial Analyst - Expedia, Inc.: (#Bellevue, WA) http://t.co/ktknMhvwCI #Finance #ExpediaJobs #Job #Jobs #Hiring


In [None]:
# List the tweets misclassified by Model (3), the TF-IDF representation.
trainAndSeeErrors(tfIdfResults['model'], tfIdfRepresentation, allLabels)

---------
(+) -> Remember that one time I didn't go to flume/kaytranada/alunageorge even though I had tickets? I still want to kms. : ) : )
---------
(+) -> @ellekagaoan @chinmarquez Catch up once in a while :( &gt;:D&lt; @aditriphosphate @ErinMonzon
---------
(-) -> @Israelgirly They sure do, esp now when ppl are talking crap about Millie!! &gt;:( I'll go straight to that FB page:)
---------
(-) -> @wtfxmbs AMBS please it's harry's jeans :)):):):(
---------
(-) -> @c_tuilagi Anytime Lil Nigga!! (: (:
---------
(-) -> i pOPPED CONFETTI THOUGH ! ! : ( https://t.co/Y79gPDxTIE
---------
(-) -> Annnd, now not going to Winchester {:-(
---------
(-) -> my beloved grandmother : ( https://t.co/wt4oXq5xCf
---------
(-) -> Sr. Financial Analyst - Expedia, Inc.: (#Bellevue, WA) http://t.co/ktknMhvwCI #Finance #ExpediaJobs #Job #Jobs #Hiring


In [None]:
# Walk one sample tweet through the pipeline and show every intermediate stage.
myTestTweet = '''Annnd, now not going to Winchester {:-('''.replace('\n',' ')
preprocessedTweet = preprocessTweet(myTestTweet)
tokenizedTweet = tokenizeTweet(preprocessedTweet)

for stageLine in (
  f'Original Tweet:\t\t{myTestTweet}',
  f'Pre-processed Tweet:\t{preprocessedTweet}',
  f'Tokenized Tweet:\t{tokenizedTweet}',
):
  print(stageLine)

# Show which of the tokens made it into each vectorizer's vocabulary.
printTokensInVocabs(tokenizedTweet)

Original Tweet:		Annnd, now not going to Winchester {:-(
Pre-processed Tweet:	Annnd, now not going to Winchester {:-(
Tokenized Tweet:	['annnd', 'now', 'not', 'going', 'to', 'winchester', '{:']
Tokens in the Vocabulary of CountVectorizer: 	['going']
Tokens in the Vocabulary of TF Normalized: 	['going']
Tokens in the Vocabulary of TfIdfVectorizer: 	['going']
