# Actividad Integradora 2

In [None]:
!pip install praw

## Investiguen e incorporen una base de datos que tenga texto etiquetado como positivo o negativo. (10 puntos)

In [5]:
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize

In [None]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


short_pos = open("short_reviews/positive.txt", "r").read()
short_neg = open("short_reviews/negative.txt", "r").read()


all_words = []
documents = []


allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append((p, "pos"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

for p in short_neg.split('\n'):
    documents.append((p, "neg"))
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
            
all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features


featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)
print(len(featuresets))

testing_set = featuresets[10000:]
training_set = featuresets[:10000]

10664


In [None]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle", "wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()

BernoulliNB_classifier accuracy percent: 73.94578313253012


In [None]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier accuracy percent: 69.87951807228916


In [None]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier accuracy percent: 69.27710843373494


In [None]:
SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:", nltk.classify.accuracy(SGDC_classifier, testing_set) * 100)

save_classifier = open("pickled_algos/SGDC_classifier5k.pickle", "wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()

SGDClassifier accuracy percent: 68.07228915662651


In [None]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

###############
save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

Original Naive Bayes Algo accuracy percent: 72.7409638554217
Most Informative Features
                  boring = True              neg : pos    =     20.7 : 1.0
              engrossing = True              pos : neg    =     18.9 : 1.0
                 generic = True              neg : pos    =     16.4 : 1.0
                mediocre = True              neg : pos    =     16.4 : 1.0
                    loud = True              neg : pos    =     15.8 : 1.0
                    flat = True              neg : pos    =     15.1 : 1.0
              delightful = True              pos : neg    =     14.9 : 1.0
                 routine = True              neg : pos    =     14.4 : 1.0
               inventive = True              pos : neg    =     14.2 : 1.0
                  unique = True              pos : neg    =     13.6 : 1.0
              refreshing = True              pos : neg    =     12.3 : 1.0
               wonderful = True              pos : neg    =     12.1 : 1.0
             

In [None]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

save_classifier = open("pickled_algos/MNB_classifier5k.pickle", "wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

MNB_classifier accuracy percent: 71.23493975903614


In [None]:
voted_classifier = VoteClassifier(
    classifier,
    LinearSVC_classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)


def sentiment(text):
    feats = find_features(text)

    return voted_classifier.classify(feats)


voted_classifier accuracy percent: 73.19277108433735


In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sentiment_mod as s

#consumer key, consumer secret, access token, access secret.
ckey="MkFddAOkMPMZYAcNE1IcHusmz"
csecret="fJlyCNbZt7hcrMTlynbZWmP4R7h9oenhCvyJF8KEea0w20n8KD"
atoken="1411954077039222790-6nQZdgAYqiw4YBnpmYCQDZjxOzj1Cx"
asecret="kMlJx2MX62ucG7izINcyMxmO8s0ZoYTbqsbVcaq71k9Iu"

class listener(StreamListener):

    def on_data(self, data):
          all_data = json.loads(data)

          tweet = all_data["text"]
          sentiment_value, confidence = s.sentiment(tweet)
        
          print('\n')
          print(tweet)
          print('\n')
          print("******************************")
          print('\n')
          print(sentiment_value, confidence)
          print('\n')
          print("=======================================================================")
          print('\n')
          
          if confidence*100 >= 80:
              output = open("twitter-out.txt","a")
              output.write(sentiment_value)
              output.write('\n')
              output.close()

          return True

    def on_error(self, status):
        print(status)


In [None]:
class listener(StreamListener):

    def on_data(self, data):
        
        all_data = json.loads(data)

        tweet = all_data["text"]
        sentiment_value, confidence = s.sentiment(tweet)
        print(tweet, sentiment_value, confidence)
        
        if confidence*100 >= 80:
            output = open("twitter-out.txt","a")
            output.write(sentiment_value)
            output.write('\n')
            output.close()

        return True

    def on_error(self, status):
        print(status)

In [8]:
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

In [9]:
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["covid"])



RT @javi_montoro: Lo de Miguel Bosé ha pasado ya de ser la opinión de un friki a un problema de salud pública. Si en vez de salir diciendo…


******************************


pos 0.6






RT @RonnieMotes8: @GOPLeader And this was months ago
https://t.co/HjGtzExesD
COVID-19 has hit people in Republican-led states hardest ... h…


******************************


neg 0.8






@Claudiashein @fDiIntelligence Cuánto más tiempo piensas esperar para actuar el 30 %de mis amigos ya tienen COVID.… https://t.co/zJtRHqw6Zk


******************************


pos 0.8






Roll on Saturday in Leeds 🤣🤣


******************************


neg 0.8






RT @delfinocrc: ÚLTIMA HORA: CCSS ordena apertura generalizada de vacunación contra COVID-19 a personas de 30 años en adelante, con o sin r…


******************************


pos 1.0






@RunninXC @ihatejohncronce @franke224 @SethWalder Without COVID those people would not have died. People live with… https://t.co/jRsSGfiubf


********************

KeyboardInterrupt: ignored

## Incorporar la funcionalidad de buscar Tweets por usuarios. (5puntos)

In [10]:
def tweets_by_user(user):
  userID = user
  auth = tweepy.OAuthHandler(ckey, csecret)
  auth.set_access_token(atoken, asecret)
  api = tweepy.API(auth)

  tweets = api.user_timeline(screen_name=userID, 
                            count=200,
                            include_rts = False,
                            tweet_mode = 'extended'
                            )
  return tweets

In [11]:
userID = "elonmusk"
print("USER ID: ", userID)
tweets = tweets_by_user(userID)
for info in tweets:
     print("ID: {}".format(info.id))
     print(info.created_at)
     print(info.full_text)
     print("\n")

USER ID:  elonmusk
ID: 1417207016519454724
2021-07-19 19:37:37
@jack @BitcoinMagazine @CathieDWood Sure, I have a ton


ID: 1417204066552205332
2021-07-19 19:25:54
@BitcoinMagazine @jack @CathieDWood During this talk, we will sing a cover of The Final Countdown by Europe https://t.co/7YUXiW8dhd


ID: 1416970848104173574
2021-07-19 03:59:10
@DragTimes @Tesla Nice


ID: 1416961748138033152
2021-07-19 03:23:01
@grimnut @Tesla @WholeMarsBlog @DirtyTesla Haha


ID: 1416951898049896450
2021-07-19 02:43:52
@WholeMarsBlog You don’t even need to touch the shifter in new S. Auto detect direction will come as an optional setting to all cars with FSD.


ID: 1416780739228602379
2021-07-18 15:23:45
@thePiggsBoson Problem 1st, theory 2nd is for sure way to go, as it establishes relevance, thus improving memory retention


ID: 1416663763151949824
2021-07-18 07:38:56
Cybrrrtruck https://t.co/rdiMFdYOS6


ID: 1416593609302945792
2021-07-18 03:00:10
@ArtifactsHub And all-time hodl champion


ID: 14165919

## Integren  una  funcionalidad  para  transcribir audio  a  texto  y  realizar  un  análisis  de  sentimiento sobre lo transcrito. (10 puntos)

In [13]:
!pip install speechbrain

Collecting speechbrain
  Downloading speechbrain-0.5.9-py3-none-any.whl (358 kB)
[K     |████████████████████████████████| 358 kB 5.0 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.0.14-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.5 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 41.1 MB/s 
Collecting hyperpyyaml
  Downloading HyperPyYAML-1.0.0-py3-none-any.whl (15 kB)
Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 49.0 MB/s 
Collecting ruamel.yaml>=0.15
  Downloading ruamel.yaml-0.17.10-py3-none-any.whl (108 kB)
[K     |████████████████████████████████| 108 kB 56.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████

In [14]:
import speechbrain as sb
from speechbrain.pretrained import EncoderDecoderASR

In [15]:
from speechbrain.dataio.dataio import read_audio
from IPython.display import Audio

In [16]:

asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", savedir="pretrained_models/asr-crdnn-rnnlm-librispeech")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4420.0, style=ProgressStyle(description…




  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=479555971.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=212420087.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=253217.0, style=ProgressStyle(descripti…




In [19]:
text = asr_model.transcribe_file("positive-audio.ogg")

In [21]:
text = str(text).lower() 
text = text.capitalize()
sentiment_value, confidence = s.sentiment(text)
print(text, sentiment_value, confidence)

The movement was amazing and had great music pos 1.0


## Incluirla funcionalidad de buscar posts en Reddit por usuarios. A las publicaciones recolectadas de  Reddit  se  les  debe  hacer  un  procesamiento  para  eliminar hyperlinks,  menciones a  otros subreddits (r/smashbros)y menciones aotros usuarios (/Username).(10 puntos)

In [None]:
!pip install praw

Collecting praw
  Downloading praw-7.3.0-py3-none-any.whl (165 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.1.0-py2.py3-none-any.whl (68 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.2.0-py3-none-any.whl (15 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.3.0 prawcore-2.2.0 update-checker-0.18.0 websocket-client-1.1.0


In [None]:
import praw

reddit = praw.Reddit(client_id='6cOKIV7_OKBd_-6kqOxYOg',
                     client_secret='3xEPxxGZPrtLBaRXPmIG1HBEZOARwQ', password='Ece321ad!',
                     user_agent='uwu', username='onesleepybird')

In [None]:
subreddit = reddit.subreddit('python')

In [None]:
hot_python = subreddit.hot()

In [None]:
hot_python = subreddit.hot(limit=3)
for submission in hot_python:
    if not submission.stickied:
        print('Title: {}, ups: {}, downs: {}, Have we visited?: {}'.format(submission.title,
                                                                           submission.ups,
                                                                           submission.downs,
                                                                           submission.visited))

Title: My first big project in python (Im really proud of it!!!), ups: 461, downs: 0, Have we visited?: False


In [None]:
### Implementación reddit
### Buscar un 
import re
from pprint import pprint
import praw

r = praw.Reddit(client_id='6cOKIV7_OKBd_-6kqOxYOg',
                     client_secret='3xEPxxGZPrtLBaRXPmIG1HBEZOARwQ', password='Ece321ad!',
                     user_agent='uwu', username='onesleepybird')
submissions =  r.redditor("hoosakiwi").submissions.hot()

self_texts = []
for link in submissions:
    ## aquí ya se limpia el post/reddit
    print("----", ' '.join(re.sub("(/[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(/+:\/\/\S+)"," ",link.title).split()), "----")
    print(' '.join(re.sub("(/[A-Za-z0-9]+)|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",link.selftext).split()), "\n")


---- Trump Organization Is Charged in 15 Year Tax Scheme ----
 

---- New York Suspends Giuliani s Law License ----
 

---- Trump Justice Department monitored Washington Post reporters phone calls in 2017 ----
 

---- r Lounge ----
removed 

---- Requesting r Think we can use it to host predictions for the league of legends subreddit ----
 

---- Australia floods Thousands to be evacuated as downpours worsen ----
 

---- r is looking for new moderators ----
Edit We are no longer accepting applications If you do not hear from us in 2 3 weeks then we have decided not to move forward with your application nbsp Hello Everyone We are looking to recruit new mods to our team You do not need previous moderation experience but you should be willing to spend at minimum 7 hours a week helping to moderate the subreddit So what does that entail As a mod you will Review community reported posts and approve or remove them as per our rules Respond to modmails from the community Warn and ban users who 

## Tres arquitecturas diferentes de RNN para la clasificación de texto como positivo o negativo. Los tres modelos deben ser integrados al modelo de ensamble original. (15 puntos)

## Funcionalidad Adicional:  Clasificar  comentarios  de  Youtube (10 puntos)

In [None]:
import argparse
import json
import os
import random
import sys
import textwrap
from itertools import chain
from urllib.request import urlopen
from urllib.parse import urlencode


def get_api_key():
    try:
        return os.environ['YOUTUBE_API_KEY']
    except KeyError:
        msg = '''You must create and export a YOUTUBE_API_KEY, instructions:
    1. Go-to https://console.developers.google.com/apis/credentials
    2. Create credentials with an "API Key" type
    3. export YOUTUBE_API_KEY=<your key goes here>'''
        print(msg)
        sys.exit(1)


def check_youtube_video_id(str):
    str = str.strip()

    # This is a bit naive but it should be good enough to protect against
    # accidentally pasting in the wrong value.
    if len(str) == 11:
        return str
    else:
        msg = f'not a valid YouTube video id: "{str}"'
        raise argparse.ArgumentTypeError(msg)


def check_positive_int(val):
    int_val = int(val)

    if int_val >= 0:
        return int_val
    else:
        msg = f'must be a positive integer: "{val}"'
        raise argparse.ArgumentTypeError(msg)


def check_omit_authors(omit_authors):
    if omit_authors == '':
        return []

    return list(map(str.strip, omit_authors.split(',')))


def display_names(results, is_verbose):
    authors = []

    for item in results['items']:
        comment = item['snippet']['topLevelComment']['snippet']
        author = comment['authorDisplayName']
        authors.append(author)

        if is_verbose:
            print(f'  {author}')

    return authors


def progress(page_count):
    if page_count > 1:
        print('')

    print(f'Getting comments for page {page_count}...')

    return page_count + 1


def get_comments(api_params):
    api_endpoint = 'https://www.googleapis.com/youtube/v3/commentThreads'
    encoded_params = urlencode(api_params)

    with urlopen(f'{api_endpoint}?{encoded_params}') as response:
        return json.load(response)


def get_comment_authors(api_token, video_id, is_verbose):
    authors = []
    page_count = 1

    api_params = {
        'key': api_token,
        'part': 'snippet',
        'videoId': video_id,
        'maxResults': 100,
    }

    results = get_comments(api_params)
    page_count = progress(page_count)
    authors.append(display_names(results, is_verbose))

    next_page_token = results.get('nextPageToken')

    while next_page_token:
        page_count = progress(page_count)

        api_params['pageToken'] = next_page_token
        results = get_comments(api_params)
        authors.append(display_names(results, is_verbose))

        next_page_token = results.get('nextPageToken')

    return authors


def flatten_list(items):
    return list(chain.from_iterable(items))


def sorted_unique_list(items):
    return sorted(list(set(items)))


def remove_authors(items, skip_items):
    #  This could be rewritten as a list comprehension such as:
    #   return [item.strip() for item in items if item not in skip_items]
    #
    # But IMO this approach is much more readable.
    for item in skip_items:
        if item.strip() in items:
            items.remove(item)

    return items


def pick_winners(authors, authors_count, winner_count):
    # We can't pick more winners than we have in total.
    if winner_count > authors_count:
        winner_count = authors_count

    return (random.sample(authors, winner_count), winner_count)


def parseargs():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent('''\
        Get a list of top level comments from a YouTube video and then
        pick N amount of unique comment authors by choosing them randomly.
        '''))

    parser.add_argument('video_id', default=None,
                        metavar='VIDEO_ID', type=check_youtube_video_id,
                        help='the 11 characters after ?v= in a YouTube URL')

    parser.add_argument('--winners', default=10, type=check_positive_int,
                        metavar='WINNERS',
                        help='number of winners to pick (defaults to 10)')

    parser.add_argument('--omit-authors', default='', type=check_omit_authors,
                        metavar='OMIT_AUTHORS',
                        help='comma separated list of author names to omit')

    parser.add_argument('--verbose', default=False, type=bool, nargs='?',
                        const=True, metavar='BOOL',
                        help='output author display names during the progress')

    return parser.parse_args()


def generate_report(winners, winner_count, authors_count,
                    duplicate_authors_count, omit_authors_count,
                    authors_final_count):
    winners = '\n  '.join(winners)

    print(f'''
{authors_count} top level comments were returned
- {duplicate_authors_count} comment(s) had duplicate authors
- {omit_authors_count} comment authors were explicitly omit
= {authors_final_count} comment authors have a chance to win
Winners ({winner_count}):
  {winners}''')

    return None


if __name__ == '__main__':
    args = parseargs()

    omit_authors_count = len(args.omit_authors)

    authors = get_comment_authors(get_api_key(), args.video_id, args.verbose)
    authors = flatten_list(authors)
    authors_count = len(authors)

    authors = sorted_unique_list(authors)
    authors_unique_count = len(authors)

    authors = remove_authors(authors, args.omit_authors)
    authors_final_count = len(authors)

    duplicate_authors_count = authors_count - authors_unique_count

    winners, winner_count = pick_winners(authors, authors_final_count,
                                         args.winners)

    generate_report(winners=winners,
                    winner_count=winner_count,
                    authors_count=authors_count,
                    duplicate_authors_count=duplicate_authors_count,
                    omit_authors_count=omit_authors_count,
                    authors_final_count=authors_final_count)

usage: ipykernel_launcher.py [-h] [--winners WINNERS]
                             [--omit-authors OMIT_AUTHORS] [--verbose [BOOL]]
                             VIDEO_ID
ipykernel_launcher.py: error: argument VIDEO_ID: not a valid YouTube video id: "C:\Users\donal\AppData\Roaming\jupyter\runtime\kernel-a7b534df-1a70-42e4-9643-8902d16b4e90.json"
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\donal\anaconda3\lib\argparse.py", line 2422, in _get_value
    result = type_func(arg_string)
  File "<ipython-input-78-f4cf9078cd6c>", line 34, in check_youtube_video_id
    raise argparse.ArgumentTypeError(msg)
argparse.ArgumentTypeError: not a valid YouTube video id: "C:\Users\donal\AppData\Roaming\jupyter\runtime\kernel-a7b534df-1a70-42e4-9643-8902d16b4e90.json"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\donal\anaconda3\lib\argparse.py", line 1800, in parse_known_args
    namespace, args = self._parse_known_args(args, namespace)
  File "C:\Users\donal\anaconda3\lib\argparse.py", line 2009, in _parse_known_args
    stop_index = consume_positionals(start_index)
  File "C:\Users\donal\anaconda3\lib\argparse.py", line 1965, in consume_positionals
    take_action(action, args)
  File "C:\Users\donal\anaconda3\lib\argparse.py", line 1858, in take_action
    a

TypeError: object of type 'NoneType' has no len()

## Funcionalidad Adicional: .....

## Funcionalidad Adicional: .....