In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from utils import clean_text, personalized_tokenizer

In [None]:
# Download the three files this notebook loads from Google Drive.
# Each command first requests the file page to store the session cookies and extract the
# `confirm` token, then downloads the file with that token and finally deletes the cookie file.
# NOTE(review): the inner wget runs with --no-check-certificate, so TLS certificates are not verified.
# 1) Tweets dataset, saved as tweets_with_authority.csv.zip
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1YhFADfXR26nTdR6inRu0WFfA-LTgNS7h' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1YhFADfXR26nTdR6inRu0WFfA-LTgNS7h" -O tweets_with_authority.csv.zip && rm -rf /tmp/cookies.txt
# 2) Inverted index, saved as inverted_index.json
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1B_RrXtR-GI1VHyRJs49Y8_eh492ZNtFw' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1B_RrXtR-GI1VHyRJs49Y8_eh492ZNtFw" -O inverted_index.json && rm -rf /tmp/cookies.txt
# 3) Pickled vectorizer, saved as vectorizer.pickle
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-14RBWKT1lR0QxFxFnkIbClgtJLpT5q9' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1-14RBWKT1lR0QxFxFnkIbClgtJLpT5q9" -O vectorizer.pickle && rm -rf /tmp/cookies.txt

In [7]:
# Fetch the NLTK stopwords package; nothing is re-downloaded when it is already up to date.
# NOTE(review): presumably needed by `clean_text` / `personalized_tokenizer` from utils — confirm.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/javi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# SEARCH ENGINE

The search engine must first be built with the `Search Engine Construction` module. Once that module has been executed, this notebook can be used independently.

In [24]:
# File names of the artefacts loaded by TwitterSearch._load_information
# (they match the output names used in the download cell above).
AUTHORITY_DATASET = "tweets_with_authority.csv.zip"  # tweets dataset, read with pd.read_csv
INVERTED_INDEX = "inverted_index.json"               # inverted index, read with json.load
VECTORIZER = "vectorizer.pickle"                     # pretrained vectorizer, read with pickle.load
# Prefix prepended (plain string concatenation) to every file name above.
INPUT_PATH = "./"
# Accepted answers to the ranking-mode prompt: '1' = TF-IDF, '2' = TF-IDF and authority.
MODES = ['1', '2']

In [53]:
class TwitterSearch:
    """
    Search engine over a tweets dataset, ranked by TF-IDF cosine similarity.

    Attributes (all produced by `_load_information`):
        data: dataframe read from the authority dataset CSV
        corpus: the `clean_text` column transformed by the vectorizer
        vectorizer: the unpickled, pretrained vectorizer
        inverted_index: dict loaded from JSON; each term maps to a list of postings
            whose first element is matched against the `id_str` column of `data`
    """

    def __init__(self):
        self.data, self.corpus, self.vectorizer, self.inverted_index = self._load_information()

    def _load_information(self):
        """
        Loads every artefact the search engine needs from INPUT_PATH

        returns the dataframe, the vectorized corpus, the vectorizer and the inverted index
        """
        # Load pretrained vectorizer
        # NOTE(review): pickle.load can execute arbitrary code stored in the file;
        # only load a vectorizer that comes from a trusted source.
        with open(INPUT_PATH + VECTORIZER, "rb") as f:
            vectorizer = pickle.load(f)

        # Load corpus (missing texts are vectorized as empty documents)
        df = pd.read_csv(INPUT_PATH + AUTHORITY_DATASET)
        corpus = vectorizer.transform(df['clean_text'].fillna(''))

        with open(INPUT_PATH + INVERTED_INDEX, 'r') as f:
            inverted_index = json.load(f)

        return df, corpus, vectorizer, inverted_index

    def _get_tweet_fields(self, i):
        """
        Returns the relevant fields for each tweet
        i: positional row (iloc) of the tweet we want to extract the information
        returns user name, text (without its first url), first url or 'No url',
        hashtags or 'No hashtags', favorite count, retweet count and followers count
        """
        row = self.data.iloc[i]

        # NOTE(review): eval executes whatever expression is stored in the CSV cell.
        # If these columns are plain Python literals, ast.literal_eval is the safe
        # replacement — confirm against the dataset before switching.
        user_name = eval(row['user'])['name']
        text = row['full_text']
        entities = eval(row['entities'])

        urls = entities['urls']
        if urls:
            url = urls[0]['url']
            text = text.replace(url, '')
        else:
            url = 'No url'

        hashtags = entities['hashtags'] or 'No hashtags'

        return (user_name, text, url, hashtags,
                row['favorite_count'], row['retweet_count'], row['followers_count'])

    def find_full_match_docs(self, query):
        """
        Return the indexes of the documents containing all terms in the query

        query: normalized query, split on whitespace into terms
        returns a list of dataframe index labels; empty when the query has no terms
        or when any term is missing from the inverted index
        """
        docs = None

        for word in query.split():
            postings = self.inverted_index.get(word)
            if not postings:
                # A term nobody used: no document can contain every term
                return []

            ids = {posting[0] for posting in postings}
            docs = ids if docs is None else docs & ids
            if not docs:
                return []

        if docs is None:
            return []

        # NOTE(review): this relies on the ids stored in the index having the same
        # type as the `id_str` column parsed by read_csv — confirm.
        return list(self.data[self.data['id_str'].isin(docs)].index)

    @staticmethod
    def _build_print_mask(ranked_docs, full_matches, n):
        """
        Positions of `ranked_docs` to show: full matches first, then the rest

        ranked_docs: document positions sorted from most to least relevant
        full_matches: documents containing every term of the query
        n: maximum number of positions to return

        Both groups keep their relevance order, no position appears twice and the
        result never has more entries than `ranked_docs`.
        """
        is_full = np.isin(ranked_docs, full_matches)
        positions = np.concatenate([np.flatnonzero(is_full), np.flatnonzero(~is_full)])
        return [int(p) for p in positions[:n]]

    def return_top_n_doc(self,query,n,show = True,authority = None):
        """
        query: Query that the user writes.
        n: number of doc to return to the user
        show: if you want to visualize the results
        authority: optional factor multiplied into the similarity scores. Pass one
                   value per document to re-rank; a scalar leaves the ranking unchanged.

        returns a list with the positions (rows of self.data) of the top n relevant
        tweets; fewer than n when the corpus is smaller than n
        """
        assert n>0, "n should be a positive integer"
        query = clean_text(query) # normalize the query
        query_vec = self.vectorizer.transform([query]) # calculate tfidf
        scores = cosine_similarity(self.corpus, query_vec).flatten()

        if authority is not None:
            scores = 3*scores*0.5*authority

        # Document positions from highest to lowest score
        ranked_docs = scores.argsort()[::-1]

        # Documents that contain all the terms in the query are shown first
        full_matches = np.array(self.find_full_match_docs(query))
        mask = self._build_print_mask(ranked_docs, full_matches, n)

        documents_retrieved = []

        # Print following the order determined by the mask
        for rank, position in enumerate(mask, start=1):
            doc = ranked_docs[position]
            if show:
                user_name, text, url, hashtags, favorites, retweets, followers = self._get_tweet_fields(doc)
                print("-->",rank)
                print(text," | ",user_name," | ",self.data.iloc[doc]['created_at']," | ", hashtags[:] ," | ",favorites," | ", retweets," | ",url, " | ", followers)

            documents_retrieved.append(doc)

        return documents_retrieved

    def query(self, query, n=20, authority=None):
        """
        Prints the top n results for the query and returns their positions

        query: Query that the user writes.
        n: number of results
        authority: forwarded to return_top_n_doc
        """
        # Keyword arguments on purpose: the third positional parameter of
        # return_top_n_doc is `show`, not `authority`.
        return self.return_top_n_doc(query, n, show=True, authority=authority)

    def interface(self):
        """
        Endless prompt loop: asks for number of results, ranking mode and query,
        then prints the results. Invalid answers are asked again.
        """
        while True:
            while True:
                try:
                    n = int(input("Enter the desired number of results: "))
                except ValueError:
                    n = 0
                if n > 0:
                    break
                print("The number of results must be a positive integer number")

            while True:
                mode = input("""Which mode would you like to use (insert number for the desired option)\n1: TF-IDF\n2: TF-IDF and authority\n""")

                if mode in MODES:
                    break
                print("Please insert some of these options: {}".format(', '.join(MODES)))

            query = input("Enter your query: ")
            if mode == "1":
                self.query(query, n)
            elif mode == "2":
                # NOTE(review): True is a scalar factor, so this mode currently ranks
                # exactly like mode 1. Pass the per-document authority values of the
                # dataset here once the column holding them is confirmed.
                self.query(query, n, authority=True)


In [54]:
SE = TwitterSearch()

In [55]:
SE.interface()

Enter the desired number of results: 10
Which mode would you like to use (insert number for the desired option)
1: TF-IDF
2: TF-IDF and authority
2
Enter your query: edu tonto
--> 1
@victoriambb Tonto covid :(  |  Luisa  |  2020-11-19 17:04:30+00:00  |  No hashtags  |  1  |  0  |  No url  |  244
--> 2
Scottish Government going tonto on the "Change Expectations" button on Fitba Manager here.   |  Jay H  |  2020-11-19 16:00:26+00:00  |  No hashtags  |  1  |  1  |  https://t.co/QxUzJn83Ud  |  2770
--> 3
This is the greatest news!

Download the app and login with you  or .edu email address and free yoga practice 🧘‍♀️ 

Highly recommend ✨ https://t.co/GKdGEeHJaC  |  Bethany Kate  |  2020-11-19 17:28:11+00:00  |  No hashtags  |  0  |  1  |  https://t.co/Bh8ZAk31mD  |  520
--> 4
❗️My fave🧘🏻‍♀️💪app @downdogapp has reopened free usage for students, teachers &amp; healthcare workers due to covid surge. 
Please take care of yourselves. ❤️
•Anyone with a .edu email
•Students/teachers 
•Healthcare 

KeyboardInterrupt: Interrupted by user