# Tweet pipeline

In [1]:
import sys

# Make the packages installed in the project's virtualenv importable from this kernel.
# NOTE(review): this path is machine-specific; starting the kernel from that
# virtualenv would make this cell unnecessary.
VENV_SITE_PACKAGES = '/Users/efraflores/Desktop/hub/diplo/venv/lib/python3.9/site-packages'
if VENV_SITE_PACKAGES not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(VENV_SITE_PACKAGES)

## Environment

In [2]:
from os import environ, path
from pandas import read_csv

'''!pip install bs4'''
from re import sub, UNICODE, findall
from bs4 import BeautifulSoup
from unicodedata import normalize

'''
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
'''
from nltk.corpus import stopwords

import pickle

from numpy import append,zeros,array

from datetime import datetime

## Import

### Parameters

In [3]:
# Directory holding the input CSV and the pickled artifacts. Defaults to the
# original local folder; set the TWEET_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
BASE_DIR = environ.get('TWEET_DATA_DIR', '/Users/efraflores/Desktop/EF/Diplo/data/05')
# CSV file with the tweets, located inside BASE_DIR.
FILE_NAME = '0505_tuit.csv'
# Pickled objects located inside BASE_DIR: tokenizer, supervised model and
# unsupervised model, in that order.
TOKENIZER = 'tuit_tokenizer.pickle'
MODEL_SUP = 'tuit_model_supervised.pickle'
MODEL_UNSUP = 'tuit_model_unsupervised.pickle'

### Data

In [4]:
# Read the tweets CSV (latin encoding) and index the frame by tweet id.
df = (
    read_csv(path.join(BASE_DIR, FILE_NAME), encoding='latin')
    .set_index('tweet_id')
)
# Show one random row as a quick sanity check of the columns.
df.sample()

Unnamed: 0_level_0,tweet_date_created,tweet_text,language,sentiment,sentiment_score
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
993077604482736131,2018-05-06T10:38:46,La #PremierLeague ha anunciado a los seis cand...,es,NEUTRAL,"{""Neutral"":0.8101093769073486328125,""Negative""..."


## Functions

### Clean tweet

In [5]:
def clean_tweet(text):
  """Normalize a raw tweet into lowercase ASCII text.

  Steps, in order: strip accents and any other non-ASCII characters, strip
  HTML markup, remove URLs, replace every character that is not a letter or
  one of ``. ! ? '`` with a space, collapse repeated spaces and lowercase.

  Mentions are not removed as a whole: the ``@`` sign is replaced by the
  character filter but the letters of the handle are kept.

  Args:
    text: tweet text as ``str``.

  Returns:
    The cleaned, lowercased ``str``.
  """
  # Decompose accented characters (NFD) and drop every non-ASCII byte, then let
  # BeautifulSoup parse the result and return only its text content.
  text = BeautifulSoup(normalize('NFD', text).encode('ascii', 'ignore'), "lxml").get_text()
  # Remove links starting with http:// or https:// (the "s" is optional).
  text = sub(r"https?://[A-Za-z0-9./]+", ' ', text)
  # Remove links starting with "www.". The dot is escaped so that only a literal
  # "www." prefix matches; unescaped it matched "www" followed by ANY character
  # and deleted ordinary text.
  text = sub(r"www\.[A-Za-z0-9./]+", ' ', text)
  # Replace everything except letters and the punctuation . ! ? ' with a space.
  text = sub(r"[^a-zA-Z.!?']", ' ', text)
  # Collapse runs of spaces into a single space.
  text = sub(r" +", ' ', text)
  return text.lower()

### Clean text

In [6]:
def clean_text(text,
               language='english',pattern=r"[^a-zA-Z'\s]",
               lower=False,lemma=False,rem_stopw=False,unique=False,
               add_stopw=None):
    """Clean a text and optionally lowercase, lemmatize, filter and de-duplicate its words.

    Args:
        text: value to clean; it is converted with ``str()`` first.
        language: language name passed to ``stopwords.words`` when ``rem_stopw`` is set.
        pattern: regular expression; every match is replaced by a space.
        lower: lowercase the text before splitting it into words.
        lemma: lemmatize every word as a verb with NLTK's ``WordNetLemmatizer``.
        rem_stopw: drop the words found in the stop-word list of ``language``
            or in ``add_stopw``.
        unique: keep only the first occurrence of each word.
        add_stopw: optional iterable of extra stop words.

    Returns:
        The remaining words joined by single spaces.
    """
    # Decompose accented characters (NFD) and drop every non-ASCII character.
    ascii_text = (normalize('NFD', str(text).replace('\n', ' \n '))
                  .encode('ascii', 'ignore')
                  .decode('utf-8'))
    filtered = sub(pattern, ' ', ascii_text, flags=UNICODE)
    if lower:
        filtered = filtered.lower()
    words = filtered.split()

    if lemma:
        # Imported here so that NLTK's WordNet data is only required when
        # lemmatization is requested.
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word, pos='v') for word in words]

    if rem_stopw:
        # Build the exclusion set once instead of reloading the stop-word list
        # for every word.
        excluded = set(stopwords.words(language)).union(add_stopw or ())
        words = [word for word in words if word not in excluded]

    if unique:
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the output is deterministic (a plain set has arbitrary order).
        words = list(dict.fromkeys(words))

    return ' '.join(words)

### Pad sequence

In [7]:
def manual_pad(array,n=170):
    """Right-pad a sequence with zeros until it holds at least ``n`` items.

    Args:
        array: sequence of values (anything supporting ``len``).
        n: minimum length of the result.

    Returns:
        The result of appending the zero padding to ``array`` with numpy.
        Inputs already longer than ``n`` are returned without truncation.
    """
    # Note: the parameter name shadows numpy's ``array`` inside this function.
    missing = n - len(array)
    if missing < 0:
        missing = 0
    padding = zeros(missing)
    return append(array, padding)

### Read objects

In [8]:
def read_obj():
    """Load the pickled tokenizer, supervised model and unsupervised model.

    The three files are read from BASE_DIR using the TOKENIZER, MODEL_SUP and
    MODEL_UNSUP file names.

    Returns:
        Tuple ``(tokenizer, model_sup, model_unsup)``.
    """
    def _unpickle(file_name):
        # pickle.load can execute arbitrary code: only use trusted files here.
        with open(path.join(BASE_DIR, file_name), 'rb') as handle:
            return pickle.load(handle)

    return _unpickle(TOKENIZER), _unpickle(MODEL_SUP), _unpickle(MODEL_UNSUP)

### Full pipeline

In [16]:
def full_pipeline(text,fitted_tokenizer,supervised_model,unsupervised_model,
                  text_col='tweet_text'):
    """Score a single tweet: class probabilities, predicted class and cluster.

    Args:
        text: raw tweet text.
        fitted_tokenizer: object exposing ``encode(text, add_special_tokens=True)``
            that returns a sequence of token ids.
        supervised_model: classifier exposing ``classes_``, ``predict_proba`` and
            ``predict``; its classes must include 'POSITIVE', 'NEGATIVE',
            'NEUTRAL' and 'MIXED'.
        unsupervised_model: clustering model exposing ``predict``.
        text_col: unused; kept so existing calls keep working.

    Returns:
        dict with the keys ``time_stamp``, ``team_name``, ``proba_positive``,
        ``proba_negative``, ``proba_neutral``, ``proba_mixed``, ``class`` and
        ``cluster``.

    Raises:
        KeyError: if one of the four expected classes is missing from
            ``supervised_model.classes_`` or the predicted cluster label is not
            one of 1, 2, 3, 4.
    """
    max_len = 170  # length of the vector fed to the supervised model

    # --- Supervised model: clean -> cap words -> encode -> fixed-length vector.
    cleaned = clean_tweet(text)
    cleaned = ' '.join(cleaned.split()[:max_len])
    token_ids = fitted_tokenizer.encode(cleaned, add_special_tokens=True)
    # Capping the number of words does not cap the number of tokens (special
    # tokens are added, and the tokenizer may split words further), so the
    # encoded sequence is truncated as well to guarantee exactly max_len entries.
    sequence = manual_pad(token_ids[:max_len], n=max_len)

    output = {'time_stamp': datetime.now().strftime("%d/%m/%YT%H:%M"),
              'team_name': 'Untitled'}

    probabilities = supervised_model.predict_proba([sequence])[0]
    class_proba = {label: round(proba, 3)
                   for label, proba in zip(supervised_model.classes_, probabilities)}

    # Output key -> class label of the supervised model.
    output_keys = {'proba_positive': 'POSITIVE',
                   'proba_negative': 'NEGATIVE',
                   'proba_neutral': 'NEUTRAL',
                   'proba_mixed': 'MIXED'}
    for key, label in output_keys.items():
        output[key] = class_proba[label]

    output['class'] = supervised_model.predict([sequence])[0]

    # --- Unsupervised model: features are computed from the ORIGINAL text.
    clean = clean_text(text,language='spanish',lower=True,pattern=r"[^a-zA-Z\s]")
    min_text = clean_text(text,language='spanish',lower=True,pattern=r"[^a-zA-Z\s]",
                          unique=True)

    # max(..., 1) avoids a ZeroDivisionError on empty or whitespace-only text
    # and leaves the value untouched for any text with at least one word.
    n_words = max(len(text.split()), 1)

    # The feature order must stay as is: it is the order the model receives.
    var_unsup = [class_proba['NEUTRAL'],class_proba['NEGATIVE'],
                 class_proba['POSITIVE'],class_proba['MIXED'],
                 len(text),                          # characters in the tweet
                 len(min_text)/(len(clean)+1e-10),   # unique-words text vs. cleaned text length
                 len(findall('@',text)),             # number of "@"
                 len(findall('#',text)),             # number of "#"
                 len(findall('[A-Z]',text)),         # number of uppercase ASCII letters
                 len(text)/n_words]                  # characters per word

    # NOTE(review): labels are assumed to be 1..4; many clustering estimators
    # number clusters from 0 - confirm against the pickled model.
    cluster_dict = {1:'Amorosos',2:'Adictos al #',3:'Faroles',4:'Haters'}

    output['cluster'] = cluster_dict[unsupervised_model.predict(array((var_unsup,)))[0]]
    return output

## Predict

In [17]:
# Load the pickled objects once, then score one randomly sampled tweet.
tokenizer, model_sup, model_unsup = read_obj()
text = df.sample()['tweet_text'].iloc[0]
full_pipeline(
    text,
    fitted_tokenizer=tokenizer,
    supervised_model=model_sup,
    unsupervised_model=model_unsup,
)

{'time_stamp': '05/06/2021T19:58',
 'team_name': 'Untitled',
 'proba_positive': 0.091,
 'proba_negative': 0.099,
 'proba_neutral': 0.803,
 'proba_mixed': 0.007,
 'class': 'NEUTRAL',
 'cluster': 'Faroles'}