<a href="https://colab.research.google.com/github/DMonsia/scholarship/blob/main/l4scholarship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Startup weekend: Création d'une application de recherche de bourses d'études

Ceci est un tutoriel pour la construction d'une application permettant de retrouver rapidement une offre de bourse d'études au Cameroun en utilisant la technologie no-code, l'intelligence artificielle et l'API de Twitter.

## Push on gsheet

[En savoir plus : enregistrer des données collectées dans Google Sheets](https://www.worthwebscraping.com/how-to-save-scraped-data-in-to-googlesheet/)

In [None]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [None]:
# Service-account key file, expected under /content in the notebook runtime.
gsread_credentials = '/content/serious-hall-335114-13cc64608fae.json'

# Key of the Google Spreadsheet that receives the collected rows.
spreadsheet_key = '1qbj9euQ9rObKW3sa3luU5aFwxtchdLe_-ORSwZEt5qY'

# OAuth scopes requested for the service account (Sheets and Drive).
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/drive.file',
    'https://www.googleapis.com/auth/spreadsheets',
]

# Build the credentials, authorise a gspread client with them, then open
# the target spreadsheet by its key.
credentials = ServiceAccountCredentials.from_json_keyfile_name(gsread_credentials, scope)
gc = gspread.authorize(credentials)
spreadsheet = gc.open_by_key(spreadsheet_key)

In [None]:
def add_row(row, sheet_name='Tweets'):
    """Append one or several rows to a sheet of the module-level ``spreadsheet``.

    Args:
        row (list): either a flat list of cell values (a single row) or a
            list of such lists (several rows). A flat list is wrapped
            automatically, because the append request body expects a list
            of rows.
        sheet_name (str): the name of the sheet in the Google Spreadsheet;
            it is passed as the range to append to.

    Returns:
        bool: `True` once the append call has returned without raising.
    """
    # Accept the documented "list of values" form as well as the
    # list-of-rows form used by existing callers. An empty input is passed
    # through unchanged.
    if row and not isinstance(row[0], (list, tuple)):
        rows = [row]
    else:
        rows = row

    spreadsheet.values_append(
        sheet_name,
        # USER_ENTERED: values are parsed as if typed into the sheet UI.
        params={'valueInputOption': 'USER_ENTERED'},
        body={'values': rows},
    )
    return True

## Data enrichment

In [None]:
%%capture
!pip install transformers
!pip install sentencepiece

In [None]:
import sentencepiece
from transformers import pipeline

In [None]:
# Zero-shot classification pipeline backed by a model pulled from the
# Hugging Face repository named below.
repo_name = "BaptisteDoyen/camembert-base-xnli"
classifier = pipeline("zero-shot-classification", model=repo_name)

Downloading:   0%|          | 0.00/882 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/299 [00:00<?, ?B/s]

In [None]:
def pred_category(text):
    """Predict whether a text talks about Education or Finance.

    The module-level zero-shot ``classifier`` scores the text against two
    French candidate labels; the top label is mapped to its display name.

    Args:
        text (str): the text whose category should be predicted.

    Returns:
        tuple: ``(category, prob)`` where ``category`` is ``'Education'`` or
        ``'Finance'`` and ``prob`` is the top score rounded to 4 decimals.
    """
    # Candidate label -> display name. The candidate labels can be tuned
    # for better predictions; keeping them in one mapping avoids the label
    # list and the lookup table drifting apart.
    label_map = {
        'etude education': 'Education',
        'finance economie': "Finance"
    }
    # The pipeline's default hypothesis template is an English sentence;
    # the classifier is a French model, so phrase the hypothesis in French.
    hypothesis_template = "Ce texte parle de {}."

    pred = classifier(
        text,
        list(label_map),
        hypothesis_template=hypothesis_template,
    )
    # Index 0 holds the best-scoring label and its score.
    category = label_map[pred['labels'][0]]
    prob = round(pred['scores'][0], 4)
    return (category, prob)

## Get data with tweepy streaming api


[En savoir plus : documentation de `StreamingClient` (tweepy)](https://docs.tweepy.org/en/stable/streamingclient.html#)

In [None]:
%%capture
!pip install tweepy==4.8.0

In [None]:
import ast
import json
import tweepy

In [None]:
# Read the Twitter developer API credentials from their JSON file.
with open('/content/twitter_cred.json', 'r') as f:
    twitter_cred = json.load(f)

# Expose each credential under its own constant. The key spellings below
# must match the JSON file exactly.
(
    TWITTER_API_KEY,
    TWITTER_API_S_KEY,
    BEARER_TOKEN,
    TWITTER_ACC_TOKEN,
    TWITTER_ACC_TOKEN_SECRET,
) = (
    twitter_cred["API_KEY"],
    twitter_cred["API_SECRET_KEY"],
    twitter_cred["BEARER_TOKEN"],
    twitter_cred["ACCES_TOKEN"],
    twitter_cred["ACCES_SECRET_TOKEN"],
)

In [None]:
class Listener(tweepy.StreamingClient):
    """Filter and sample realtime Tweets with Twitter API v2.

    Each Tweet payload received from the stream is classified with
    ``pred_category`` and appended as one row to the 'Tweets' sheet
    through ``add_row``.
    """

    # Placeholder images used when a Tweet carries no usable media URL.
    DEFAULT_MEDIA_URL = "https://drive.google.com/file/d/1IuSVo7OTx6VLcLsTm0y673JQJV-PtmkR/view?usp=sharing"
    FINANCE_MEDIA_URL = "https://drive.google.com/file/d/15-zQfhbTZyZ6waluFQEr7NgQSzIPv9Ac/view?usp=sharing"

    @staticmethod
    def _first_media_url(includes):
        """Return the URL (or preview image URL) of the first media item, else None."""
        media_items = includes.get('media') or []
        if not media_items:
            return None
        first = media_items[0] or {}
        return first.get('url') or first.get('preview_image_url')

    def on_data(self, tweet):
        """Handle one raw payload received from the stream.

        Tweet payloads are parsed, enriched with a predicted category and
        pushed to Google Sheets. Payloads that carry no ``data`` object are
        handed to the parent implementation instead.

        Args:
            tweet (bytes or str): the raw JSON payload.

        Returns:
            None
        """
        # The payload is JSON: json.loads understands true/false/null and
        # JSON string escapes, which ast.literal_eval does not.
        payload = json.loads(tweet)

        # Without a 'data' object there is no Tweet to store (and the
        # lookups below would raise KeyError); defer to the default handler.
        if not isinstance(payload, dict) or 'data' not in payload:
            return super().on_data(tweet)

        data = payload['data']
        includes = payload.get('includes') or {}

        text = data['text']
        category, prob = pred_category(text)

        # Use the Tweet's own media when it has a URL; otherwise fall back
        # to a category-specific default image.
        media_url = self._first_media_url(includes)
        if not media_url:
            if category == "Finance":
                media_url = self.FINANCE_MEDIA_URL
            else:
                media_url = self.DEFAULT_MEDIA_URL

        # Author details come from the 'author_id' expansion; tolerate
        # their absence by leaving the cells blank.
        author = (includes.get('users') or [{}])[0]

        row = [
            data['id'],
            data.get('created_at'),
            f"https://twitter.com/twitter/statuses/{data['id']}",
            author.get('id'),
            author.get('name'),
            author.get('username'),
            media_url,
            text,
            category,
            prob,
        ]
        # Missing values become empty cells; add_row receives a list of rows.
        row = [["" if val is None else val for val in row]]
        add_row(row, sheet_name='Tweets')
        return None

In [None]:
# Create the streaming client and register the rule that selects Tweets.
# A second rule such as "scholarship cameroun" can be added the same way.
listner = Listener(bearer_token=BEARER_TOKEN, return_type=dict)
listner.add_rules(add=tweepy.StreamRule(value="bourse cameroun"))

In [None]:
# Start consuming the filtered stream (threaded=False: not started in a
# background thread), requesting the expansions and fields that the
# Listener turns into sheet columns.
listner.filter(
    backfill_minutes=None,
    expansions=["attachments.media_keys", "author_id"],
    media_fields=["url", "preview_image_url"],
    place_fields=["country"],
    poll_fields=None,
    tweet_fields=["created_at", "text", "id"],
    user_fields=["name", "id", "username"],
    threaded=False,
)

## END