In [None]:
# Libraries used throughout the notebook
import re             # Regular Expressions

import nltk
import numpy as np    # Numpy
import pandas as pd   # DataFrames

# NLTK resources needed by the cells below, fetched in the original order
for nltkPackage in (
    'twitter_samples',             # Tweet Sample Dataset
    'averaged_perceptron_tagger',  # POS Tagging
    'universal_tagset',            # POS Tagging
    'wordnet',                     # Lemmatizer
    'stopwords',                   # Stop Words
    'omw-1.4',                     # downloaded alongside the stop words in the original cell
):
    nltk.download(nltkPackage)



[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
#@title Auxiliary Functions for Evaluation ⚠️
#@markdown ⚡ Run this cell to load the helper functions that check whether your tasks are done correctly


############    Validate equivalence    ############

def listsHaveSameValues(list1, list2):
    """Return True when both arguments are lists containing the same items.

    Order is ignored, but multiplicity is respected: each item must occur
    the same number of times in both lists (``['a', 'a']`` does not match
    ``['a', 'b']``).

    Args:
        list1: First list to compare.
        list2: Second list to compare.

    Returns:
        bool: False if either argument is not a list, if the lengths differ,
        or if some item of ``list1`` has no unmatched counterpart in ``list2``.
    """
    if not isinstance(list1, list) or not isinstance(list2, list):
        return False
    if len(list1) != len(list2):
        return False
    # Cross items off a copy of list2 so duplicates are matched one-to-one.
    # A linear scan is used (instead of hashing) so unhashable items work too.
    unmatched = list(list2)
    for item in list1:
        try:
            unmatched.remove(item)
        except ValueError:
            return False
    return True

def dictionariesHaveSameValues(dict1, dict2):
    """Tell whether two dicts hold exactly the same key/value pairs.

    Returns False when either argument is not exactly a ``dict``, when the
    sizes differ, or when any key of ``dict1`` is missing from ``dict2`` or
    maps to a different value.
    """
    bothAreDicts = type(dict1) == dict and type(dict2) == dict
    if not bothAreDicts or len(dict1) != len(dict2):
        return False
    return all(
        key in dict2 and not (dict1[key] != dict2[key])
        for key in dict1
    )

def stringsHaveSameValues(str1, str2):
    """Report whether ``str1`` and ``str2`` compare equal with ``==``."""
    areEqual = (str1 == str2)
    return areEqual

############    Answer is Correct    ############

def answerIsCorrectList(correctAnswer, input, yourFunction):
  """Run ``yourFunction(input)`` and compare the result with ``correctAnswer``.

  Returns False when ``yourFunction`` is not a plain Python function;
  otherwise returns whatever ``listsHaveSameValues`` reports.
  """
  from types import FunctionType
  if isinstance(yourFunction, FunctionType):
    return listsHaveSameValues(correctAnswer, yourFunction(input))
  return False

def answerIsCorrectDict(correctAnswer, input, yourFunction):
  """Run ``yourFunction(input)`` and compare the result with ``correctAnswer``.

  Returns False when ``yourFunction`` is not a plain Python function;
  otherwise returns whatever ``dictionariesHaveSameValues`` reports.
  """
  from types import FunctionType
  if isinstance(yourFunction, FunctionType):
    return dictionariesHaveSameValues(correctAnswer, yourFunction(input))
  return False

def answerIsCorrectString(correctAnswer, input, yourFunction):
  """Run ``yourFunction(input)`` and compare the result with ``correctAnswer``.

  Returns False when ``yourFunction`` is not a plain Python function;
  otherwise returns whatever ``stringsHaveSameValues`` reports.
  """
  from types import FunctionType
  if isinstance(yourFunction, FunctionType):
    return stringsHaveSameValues(correctAnswer, yourFunction(input))
  return False

############    Print Diffs    ############

def printDifferences(correctAnswer, yourAnswer, input):
    """Print the input, the expected answer and the received answer, then a blank line."""
    report = '\n'.join((
        f'Input:\t\t{input}',
        f'Correct Answer:\t{correctAnswer}',
        f'Your Answer: \t{yourAnswer}',
    ))
    print(report)
    print()
  

def printDifferencesBetweenDicts(correctDict, yourDict, input=None):
    """Print how ``yourDict`` differs from ``correctDict``.

    Three lists of keys are printed: keys missing from ``yourDict``, keys
    that appear only in ``yourDict``, and shared keys whose values differ.
    Keys are listed in dictionary insertion order, so the output is the same
    on every run.

    Args:
        correctDict: Dictionary holding the expected answer.
        yourDict: Answer under test. If it is not a dict, a short message is
            printed instead of the key comparison.
        input: Optional input that produced the answers; printed first when
            it is not None.
    """
    # Identity check: `!= None` can misbehave for objects overriding `!=`.
    if input is not None:
        print(f'Input:\n{input}\n')

    # An answer that is not a dict cannot be compared key by key.
    if not isinstance(yourDict, dict):
        print(f'Your answer is not a dictionary:\n{yourDict!r}')
        return

    keysOnlyInCorrect = [key for key in correctDict if key not in yourDict]
    keysOnlyInYours = [key for key in yourDict if key not in correctDict]
    keysWithDifferentValues = [
        key for key in correctDict
        if key in yourDict and correctDict[key] != yourDict[key]
    ]

    print(f'Keys that you are missing:\n{keysOnlyInCorrect}\n')
    print(f'Keys that should not be in your answer:\n{keysOnlyInYours}\n')
    print(f'Keys with wrong values:\n{keysWithDifferentValues}')

############    Test Answer    ############

def testAnswers(yourImplementation, answersAndInputs, answerType):
  if answerType == 'list':
    answerIsCorrect = answerIsCorrectList
    printDiffs = printDifferences
  elif answerType == 'dict':
    answerIsCorrect = answerIsCorrectDict
    printDiffs = printDifferencesBetweenDicts
  elif answerType == 'string':
    answerIsCorrect = answerIsCorrectString
    printDiffs = printDifferences
  else:
    raise Exception(f'Answer Type is not recognized: {answerType}')
  import types
  if not isinstance(yourImplementation, types.FunctionType):
    raise Exception('Your implementation is not a function')
  nTests = len(answersAndInputs)
  for i in range(nTests):
    correctAnswer, input = answersAndInputs[i]
    print(f'Test {i+1}/{nTests} ', end='')
    if answerIsCorrect(correctAnswer, input, yourFunction=yourImplementation):
      print('✅')
      print(f'Input: \t{input}')
      print(f'Answer:\t{correctAnswer}')
      print()
    else:
      yourAnswer = yourImplementation(input)
      print('❌')
      printDiffs(correctAnswer, yourAnswer, input)

############    Print    ############

def showError(message, functionName):
    """Print ``message`` on one line, prefixed with ``functionName``."""
    errorLine = f'Error at Function {functionName}: {message}'
    print(errorLine)

print('The auxiliar functions were loaded successfully')

The auxiliar functions were loaded successfully


In [None]:
def transformPosTag(posTag):
  """Map a universal POS tag name to the one-letter tag passed to the lemmatizer.

  'NOUN', 'VERB', 'ADJ' and 'ADV' map to 'n', 'v', 'a' and 'r'; any other
  tag yields None.
  """
  lemmatizerTagByPosTag = {'NOUN':'n','VERB':'v','ADJ':'a','ADV':'r'}
  if posTag in lemmatizerTagByPosTag:
    return lemmatizerTagByPosTag[posTag]
  return None

In [None]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
def lemmatizeTokens(tokens):
  """Lemmatize each token according to its part-of-speech tag.

  Tokens are tagged with the universal tagset. A token whose tag
  ``transformPosTag`` maps to a lemmatizer tag is replaced by its lemma;
  every other token is kept unchanged. The output has one entry per input
  token, in the same order.

  Args:
    tokens: List of token strings.

  Returns:
    list: The lemmatized tokens.
  """
  lemmatizer = WordNetLemmatizer()
  lemmatizedTokens = []
  tokensWithPosTags = pos_tag(tokens, tagset='universal')
  for token, posTag in tokensWithPosTags:
    lemmatizerTag = transformPosTag(posTag)
    if lemmatizerTag is None:
      # No usable tag for the lemmatizer: keep the token as it is.
      lemmatizedTokens.append(token)
    else:
      # The lemma must be appended; otherwise tagged tokens vanish from the result.
      lemmatizedTokens.append(lemmatizer.lemmatize(token, lemmatizerTag))
  return lemmatizedTokens

In [None]:
# Quick demo: irregular plurals, split on whitespace, before and after lemmatization
tokens = 'two women , two oxen , two oases , two mice'.split()
print('Tokens:',tokens)
print('Lemmatized Tokens:',lemmatizeTokens(tokens))

Tokens: ['two', 'women', ',', 'two', 'oxen', ',', 'two', 'oases', ',', 'two', 'mice']
Lemmatized Tokens: ['two', 'woman', ',', 'two', 'ox', ',', 'two', 'oasis', ',', 'two', 'mouse']


In [None]:
#@title Test your implementation of lemmatizeTokens() ⚠️

#@markdown ⚡ Run this cell to validate if you implemented the function correctly

def checkLemmatizeTokens():
  """Run ``lemmatizeTokens`` against the reference cases through ``testAnswers``."""
  # Each case is written as (input tokens, expected lemmas)
  inputsAndExpected = [
    # Nouns and Verbs
    (['i','am','googling','everything'],
     ['i','be','google','everything']),
    # Verbs
    (['playing','with','that','is','very','dangerous'],
     ['play', 'with', 'that', 'be', 'very', 'dangerous']),
    # Noun
    (['animals', 'and', 'pets', 'are', 'beautiful'],
     ['animal', 'and', 'pet', 'be', 'beautiful']),
    # Noun
    (['thanks', 'for', 'your', 'blessings'],
     ['thanks', 'for', 'your', 'blessing']),
    # Comparatives and Superlatives
    (['you', 'are', 'smarter', 'than', 'average,', 'but', 'not', 'the', 'smartest'],
     ['you', 'be', 'smart', 'than', 'average,', 'but', 'not', 'the', 'smart']),
    # Irregular conjugations
    (['two', 'women', ',', 'two', 'oxen', ',', 'two', 'oases', ',', 'two', 'mice'],
     ['two', 'woman', ',', 'two', 'ox', ',', 'two', 'oasis', ',', 'two', 'mouse']),
    # Empty Input should work too
    ([], []),
  ]
  # testAnswers expects (correct answer, input) pairs
  answersAndInputs = [(expected, tokens) for tokens, expected in inputsAndExpected]
  testAnswers(lemmatizeTokens, answersAndInputs, 'list')

checkLemmatizeTokens()

Test 1/7 ✅
Input: 	['i', 'am', 'googling', 'everything']
Answer:	['i', 'be', 'google', 'everything']

Test 2/7 ✅
Input: 	['playing', 'with', 'that', 'is', 'very', 'dangerous']
Answer:	['play', 'with', 'that', 'be', 'very', 'dangerous']

Test 3/7 ✅
Input: 	['animals', 'and', 'pets', 'are', 'beautiful']
Answer:	['animal', 'and', 'pet', 'be', 'beautiful']

Test 4/7 ✅
Input: 	['thanks', 'for', 'your', 'blessings']
Answer:	['thanks', 'for', 'your', 'blessing']

Test 5/7 ✅
Input: 	['you', 'are', 'smarter', 'than', 'average,', 'but', 'not', 'the', 'smartest']
Answer:	['you', 'be', 'smart', 'than', 'average,', 'but', 'not', 'the', 'smart']

Test 6/7 ✅
Input: 	['two', 'women', ',', 'two', 'oxen', ',', 'two', 'oases', ',', 'two', 'mice']
Answer:	['two', 'woman', ',', 'two', 'ox', ',', 'two', 'oasis', ',', 'two', 'mouse']

Test 7/7 ✅
Input: 	[]
Answer:	[]



In [None]:
def preprocessTweet(tweet):
  """Strip URLs, e-mail addresses and @mentions from a tweet.

  URLs and e-mails are replaced by a space and mentions are deleted; any run
  of whitespace is then collapsed to a single space. Leading and trailing
  whitespace is not stripped.

  Args:
    tweet: Raw tweet text.

  Returns:
    str: The cleaned text.
  """
  # Raw strings keep regex escapes such as \S and \w out of Python's own
  # string-escape processing (invalid escapes warn on Python 3.12+).
  tweet = re.sub(r'http[s]?://[\S]+', ' ', tweet)               # Remove URLs
  tweet = re.sub(r'[\w]+([._-]\w+)*@\w+([.]\w+)*', ' ', tweet)  # Remove e-mails (before mentions, which also start at '@')
  tweet = re.sub(r'@\S+', '', tweet)                            # Remove mentions
  tweet = re.sub(r'\s+', ' ', tweet)                            # Replace repeated spaces to 1 single space
  return tweet

In [None]:
def cleanTokens(tokens):
  """Lower-case tokens and drop punctuation-only and numeric ones.

  Args:
    tokens: Iterable of token strings.

  Returns:
    list: Lower-cased tokens, without single-punctuation tokens and without
    tokens that begin with a digit; a leading '#' is removed from hashtags.
  """
  # Raw strings: the patterns reach `re` unchanged and no invalid Python
  # escape sequences (\[, \-, \d, \w) are left in ordinary string literals.
  singlePunctuation = r'''^[_*#!$@<=^`>%&'"/()\[\]\-+,.:;?]$'''
  newTokens = []
  for token in tokens:
    token = token.lower()
    if re.match(singlePunctuation, token): # Remove tokens that are 1 single punctuation
      continue
    # NOTE(review): re.match only anchors at the start, so this also drops
    # tokens that merely begin with a digit (e.g. '2day') — confirm intent.
    if re.match(r'\d+', token): # Remove Numbers
      continue
    if re.match(r'#[\w\d]+', token): # Remove Hashtag
      token = token[1:]
    newTokens.append(token)
  return newTokens


In [None]:
def splitTokens(tokens):
  """Split every token wherever a lower-case letter is followed by an upper-case one.

  Returns a flat list of the resulting pieces, in the original order.
  """
  camelCaseBoundary = re.compile(r'(?<=[a-z])(?=[A-Z])')
  return [
    piece
    for token in tokens
    for piece in camelCaseBoundary.split(token)
  ]

In [None]:
from nltk.tokenize import TweetTokenizer

def tokenizeTweet(tweet):
  """Turn tweet text into tokens: tokenize, split, clean, then lemmatize.

  The steps run in that order: ``TweetTokenizer`` -> ``splitTokens`` ->
  ``cleanTokens`` -> ``lemmatizeTokens``.
  """
  rawTokens = TweetTokenizer().tokenize(tweet)
  return lemmatizeTokens(cleanTokens(splitTokens(rawTokens)))

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list; passed to CountVectorizer(stop_words=...) further down
englishStopWords = stopwords.words('english')

In [None]:
from nltk.corpus import twitter_samples 

# Tweet strings from the positive sample file; used to fit the first vectorizer below
sampleSet = twitter_samples.strings('positive_tweets.json')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that uses this notebook's own preprocessing and tokenization.
counter = CountVectorizer(
    preprocessor = preprocessTweet,
    stop_words = englishStopWords,
    tokenizer = tokenizeTweet,
    token_pattern = None,  # unused when `tokenizer` is given; None avoids sklearn's unused-parameter warning
    max_features = 900,
  )

In [None]:
# Fit the vectorizer on the sample tweets and keep its learned feature names
allBagsOfWord = counter.fit_transform(sampleSet)
vocab = list(counter.get_feature_names_out())

# One print call; the newline separator reproduces the three separate prints
print(
    '\nIMPORTANT !!!!\n\n',
    'This is the vocabulary that you will need to read in order to finish Task 3 succesfully:\n',
    vocab,
    sep='\n',
)




IMPORTANT !!!!


This is the vocabulary that you will need to read in order to finish Task 3 succesfully:

['):', '..', '...', ':)', ':-)', ':/', ':d', ':p', ';)', '<3', '\\', 'able', 'absolutely', 'account', 'act', 'actually', 'add', 'address', 'advice', 'af', 'afternoon', 'ago', 'agree', 'ah', 'air', 'al', 'album', 'almost', 'along', 'already', 'alright', 'also', 'always', 'amaze', 'amazing', 'android', 'another', 'answer', 'anyone', 'anything', 'anytime', 'anyway', 'apology', 'app', 'apparently', 'apply', 'appreciate', 'aqui', 'around', 'arrive', 'art', 'article', 'artist', 'ask', 'asleep', 'august', 'available', 'awake', 'away', 'awesome', 'aww', 'awww', 'awwww', 'b', 'babe', 'baby', 'back', 'bad', 'bae', 'bajrangi', 'ball', 'bam', 'bath', 'bc', 'bday', 'beat', 'beautiful', 'beauty', 'become', 'bed', 'believe', 'best', 'bestfriend', 'bet', 'bhaijaan', 'bi0', 'big', 'bill', 'birthday', 'bit', 'bless', 'blog', 'blue', 'body', 'book', 'bore', 'bot', 'boy', 'brain', 'brand', 'break', 

In [None]:
# Independent copy of the English stop words (a new list, so edits do not touch englishStopWords)
allStopWords = list(englishStopWords)

In [None]:
from nltk.corpus import twitter_samples 

# Tweet strings from the positive and negative sample files, kept as two separate lists
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

In [None]:
# All tweets in one list: positives first, then negatives
listAllTweets = [*positiveTweets, *negativeTweets]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer for the classifier; this rebinds `counter` with a different feature budget.
counter = CountVectorizer(
    preprocessor = preprocessTweet,
    stop_words = allStopWords,
    tokenizer = tokenizeTweet,
    token_pattern = None,  # unused when `tokenizer` is given; None avoids sklearn's unused-parameter warning
    max_features = 840,
  )

In [None]:
# Fit the vectorizer on every tweet and build the count matrix (rows follow listAllTweets order)
allTweets = counter.fit_transform(listAllTweets)



In [None]:
# One label per tweet: 1.0 for each positive tweet followed by 0.0 for each negative tweet
sizePositive, sizeNegative = len(positiveTweets), len(negativeTweets)

positiveLabels = np.ones(sizePositive)
negativeLabels = np.zeros(sizeNegative)
allLabels = np.hstack([positiveLabels, negativeLabels])

In [None]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and every metric reported below,
# reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    allTweets, allLabels, shuffle=True, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
# Build and train the classifier in one expression (fit returns the fitted estimator)
model = LogisticRegressionCV(max_iter=2000).fit(X_train, y_train)
print('The training of the Logistic Regression Model has finished :)')

The training of the Logistic Regression Model has finished :)


In [None]:
#@markdown Run this cell to get the accuracy in both datasets (Train and Test) using the threshold

threshold =   0.71#@param {type:"number"}

def getPrediction(probabilities, threshold):
  """Convert probabilities into 0/1 labels.

  A probability greater than or equal to ``threshold`` becomes 1, anything
  lower becomes 0. Returns a list with one label per probability.
  """
  return [1 if prob >= threshold else 0 for prob in probabilities]

# Probability of the positive class (column 1), thresholded into 0/1 labels
testProbabilities = model.predict_proba(X_test)[:,1]
testPredictions = getPrediction(testProbabilities, threshold)

trainProbabilities = model.predict_proba(X_train)[:,1]
trainPredictions = getPrediction(trainProbabilities, threshold)

# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` is loaded.
import sklearn.metrics

trainAccuracy = sklearn.metrics.accuracy_score(y_train, trainPredictions)
testAccuracy = sklearn.metrics.accuracy_score(y_test, testPredictions)

# Express as percentages for printing
trainAccuracy *= 100
testAccuracy *= 100

print(f'Train Accuracy: {trainAccuracy:.2f}%')
print(f'Test Accuracy: {testAccuracy:.2f}%')

Train Accuracy: 99.93%
Test Accuracy: 99.84%


In [None]:
from sklearn.metrics import classification_report

# Labels come from model.predict, so the custom `threshold` is not applied here
y_test_pred = model.predict(X_test)

print('\t((  Classification Report for Test Set  ))\n')
print(classification_report(y_true=y_test, y_pred=y_test_pred))

	((  Classification Report for Test Set  ))

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      1259
         1.0       0.99      1.00      1.00      1241

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500



In [None]:
# Show the vocabulary of the vectorizer the model was trained with. The
# earlier `vocab` variable came from a different vectorizer (fitted on the
# positive tweets only, with another max_features), so it is not the
# model's feature set.
print(list(counter.get_feature_names_out()))

['):', '..', '...', ':)', ':-)', ':/', ':d', ':p', ';)', '<3', '\\', 'able', 'absolutely', 'account', 'act', 'actually', 'add', 'address', 'advice', 'af', 'afternoon', 'ago', 'agree', 'ah', 'air', 'al', 'album', 'almost', 'along', 'already', 'alright', 'also', 'always', 'amaze', 'amazing', 'android', 'another', 'answer', 'anyone', 'anything', 'anytime', 'anyway', 'apology', 'app', 'apparently', 'apply', 'appreciate', 'aqui', 'around', 'arrive', 'art', 'article', 'artist', 'ask', 'asleep', 'august', 'available', 'awake', 'away', 'awesome', 'aww', 'awww', 'awwww', 'b', 'babe', 'baby', 'back', 'bad', 'bae', 'bajrangi', 'ball', 'bam', 'bath', 'bc', 'bday', 'beat', 'beautiful', 'beauty', 'become', 'bed', 'believe', 'best', 'bestfriend', 'bet', 'bhaijaan', 'bi0', 'big', 'bill', 'birthday', 'bit', 'bless', 'blog', 'blue', 'body', 'book', 'bore', 'bot', 'boy', 'brain', 'brand', 'break', 'brilliant', 'bring', 'bro', 'btw', 'buddy', 'build', 'bulb', 'business', 'busy', 'button', 'buy', 'bye', 'c

In [None]:
def predictTweet(tweet):
  """Vectorize a single tweet with ``counter`` and return ``model.predict`` for it.

  The tweet is wrapped in a one-element list, so the result holds one prediction.
  """
  bagOfWords = counter.transform([tweet])
  return model.predict(bagOfWords)

def seePrediction(tweet):
  """Print whether ``predictTweet`` labels the tweet as Positive (1) or Negative."""
  isPositive = predictTweet(tweet)[0] == 1
  message = (
    'Your tweet was classified as Positive'
    if isPositive
    else 'Your tweet was classified as Negative'
  )
  print(message)

In [None]:
# Example tweet with positive wording, a mention, an emoji and hashtags
positiveTweet1 = ''' 
        Just tried a new restaurant @bestrestauranthere and the food was absolutely amazing! Can't wait to go back again. 😊  #foodie #yum #delicious 
'''

seePrediction(positiveTweet1)

Your tweet was classified as Positive


#### False Negative

In [None]:
# Example intended as positive; the recorded run labelled it Negative (a false negative)
positiveTweet2 = ''' 
        Can't believe it's been a year already! Time flies...💜 #anniversary
'''

seePrediction(positiveTweet2)

Your tweet was classified as Negative


In [None]:
# Example tweet with negative wording and a '):' emoticon
negativeTweet1 = ''' 
        It's amazing how one moment can change everything. Feeling heartbroken right now. ): #needtime
'''

seePrediction(negativeTweet1)

Your tweet was classified as Negative


#### False Positive

In [None]:
# Example intended as negative; the recorded run labelled it Positive (a false positive)
negativeTweet2 = ''' 
  This video made me think a lot https://www.youtube.com/ :d Can somebody talk to me? #feelingnotgood #needtotalk
'''

seePrediction(negativeTweet2)

Your tweet was classified as Positive
