# Tweet pipeline

In [1]:
import sys

# Extra site-packages directory (a local virtualenv) made importable for this kernel.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
VENV_SITE_PACKAGES = '/Users/efraflores/Desktop/hub/diplo/venv/lib/python3.9/site-packages'

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
if VENV_SITE_PACKAGES not in sys.path:
    sys.path.append(VENV_SITE_PACKAGES)

## Environment

In [2]:
from datetime import datetime
from os import environ
from os import path
from pickle import load
from re import sub, UNICODE, findall
from unicodedata import normalize

from bs4 import BeautifulSoup
from numpy import append,zeros,array
from pandas import read_csv,DataFrame

## Import

### Parameters

In [3]:
# Folder that holds the input CSV and the pickled models.
# Set the TWEET_DATA_DIR environment variable to point somewhere else;
# when it is not set, the original location is used.
BASE_DIR = environ.get('TWEET_DATA_DIR', '/Users/efraflores/Desktop/EF/Diplo/data/05')
# CSV with the tweets (read with 'tweet_id' as index).
FILE_NAME = '0505_tuit.csv'
# Pickle files, relative to BASE_DIR, for the supervised and unsupervised models.
MODEL_SUP = 'tuit_model_supervised.pickle'
MODEL_UNSUP = 'tuit_model_unsupervised.pickle'

### Data

In [4]:
# Read the tweets file (latin encoding) and key every row by its tweet id.
df = (
    read_csv(path.join(BASE_DIR, FILE_NAME), encoding='latin')
    .set_index('tweet_id')
)
# Peek at one random row as a sanity check.
df.sample()

Unnamed: 0_level_0,tweet_date_created,tweet_text,language,sentiment,sentiment_score
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1033768043766312960,2018-08-26T17:28:02.960000,@edwikk @ChelseaFC @marcosalonso03 Sot ka qen ...,es,NEUTRAL,"{""Neutral"":0.88814127445220947265625,""Negative..."


## Functions

### Clean tweet

In [5]:
def clean_tweet(text):
    """Reduce a raw tweet to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Decompose accented characters (NFD) and drop every non-ASCII character.
      2. Remove @mentions, #hashtags and links (``http(s)://`` and ``www.``).
      3. Replace every remaining non-letter character with a space.
      4. Collapse each run of a repeated character into one character
         ("gooool" -> "gol", and also "hello" -> "helo").
      5. Collapse runs of spaces and lowercase the result.

    Step 2 has to run before step 3: once punctuation is stripped, the '@',
    '#', ':' and '/' markers are gone and mentions, hashtags and links can no
    longer be recognised, so their letters would leak into the output.

    NOTE(review): any model fitted on text cleaned without the entity removal
    taking effect should be re-validated against this output.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned text (str). It is not stripped, so it may start and/or end
        with a single space.
    """
    # Drop accents and any other non-ASCII character.
    ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('ascii')

    # Remove mentions: '@' followed by letters and digits.
    ascii_text = sub(r"@[A-Za-z0-9]+", ' ', ascii_text)
    # Remove hashtags: '#' followed by letters and digits.
    ascii_text = sub(r"#[A-Za-z0-9]+", ' ', ascii_text)
    # Remove links starting with http or https (the "s" is optional).
    ascii_text = sub(r"https?://[A-Za-z0-9./]+", ' ', ascii_text)
    # Remove links starting with "www." (the dot is escaped so that ordinary
    # words beginning with "www" are left alone).
    ascii_text = sub(r"www\.[A-Za-z0-9./]+", ' ', ascii_text)

    # Keep letters only; everything else becomes a space.
    letters = sub(r"[^a-zA-Z]", ' ', ascii_text)
    # Collapse runs of the same non-space character into a single one.
    letters = sub(r'(\S)\1*', r'\1', letters)
    # Collapse runs of spaces.
    letters = sub(r" +", ' ', letters)
    return letters.lower()

### Read models

In [6]:
def read_models():
    """Load the two pickled models stored under BASE_DIR.

    Returns:
        tuple: ``(model_sup, model_unsup)``, unpickled from MODEL_SUP and
        MODEL_UNSUP respectively, in that order.
    """
    # Unpickling executes code embedded in the file: only load trusted pickles.
    loaded = []
    for file_name in (MODEL_SUP, MODEL_UNSUP):
        with open(path.join(BASE_DIR, file_name), 'rb') as handle:
            loaded.append(load(handle))
    return tuple(loaded)

### Full pipeline

In [7]:
def full_pipeline(text,supervised_model,unsupervised_model):
    """Build the features of one tweet and score it with both models.

    Args:
        text: Raw tweet text (str).
        supervised_model: Object exposing ``classes_``, ``predict_proba`` and
            ``predict``. It is given a one-row DataFrame with the column
            'tot_text' followed by the 13 numeric feature columns. Its classes
            must include 'POSITIVE', 'NEGATIVE', 'NEUTRAL' and 'MIXED'.
        unsupervised_model: Object exposing ``predict``. It is given a (1, 17)
            array: the 13 numeric features followed by the rounded class
            probabilities in the order NEUTRAL, NEGATIVE, POSITIVE, MIXED.

    Returns:
        dict with the keys 'time_stamp', 'team_name', 'proba_positive',
        'proba_negative', 'proba_neutral', 'proba_mixed' (probabilities rounded
        to 3 decimals), 'class' (the supervised prediction) and 'cluster'
        (the name mapped from the unsupervised prediction).

    Raises:
        KeyError: if one of the four sentiment classes is missing from
            ``supervised_model.classes_`` or the predicted cluster label is
            not one of 1, 2, 3, 4.
    """
    # One-row frame; each feature below is a column on it. The column names are
    # what the models receive, so they are kept exactly as they are.
    feats = DataFrame(text,index=[0],columns=['tweet_text'])
    feats['len'] = feats['tweet_text'].str.split().apply(len)  # number of words
    feats['clean_tweet'] = feats['tweet_text'].apply(clean_tweet)

    # All hashtags or mentions: whatever follows an '@' or a '#' up to the next
    # whitespace. (Inside a character class '|' is a literal, so it is left out.)
    feats['#_or_@'] = feats['tweet_text'].apply(lambda x: ' '.join(findall(r'[@#](\S+)',x)))
    # Split them: an uppercase letter that is not followed by another uppercase
    # letter is lowercased and prefixed with a space, e.g. "VamosAmerica" ->
    # "vamos america".
    # NOTE(review): the last letter of an all-caps run matches too, so "UFC"
    # becomes "UF c" instead of staying intact; confirm whether that is intended.
    feats['#_or_@'] = feats['#_or_@'].apply(lambda x:sub(r'([A-Z])(?![A-Z])',lambda a:' '+a.group(1).lower(),x).strip())

    feats['length'] = feats['tweet_text'].apply(len)  # number of characters
    clean_length = feats['clean_tweet'].apply(len)
    feats['relevance'] = clean_length/(feats['length']+1e-10)

    feats['n_mentions'] = feats['tweet_text'].apply(lambda x: len(findall('@',x)))
    feats['n_hashtags'] = feats['tweet_text'].apply(lambda x: len(findall('#',x)))
    feats['n_links'] = feats['tweet_text'].apply(lambda x: len(findall('http',x)))
    feats['n_uppercase'] = feats['tweet_text'].apply(lambda x: len(findall('[A-Z]',x)))

    # Denominators floored at 1 so an empty or whitespace-only text yields 0
    # ratios instead of NaN/inf; for any text with at least one word the values
    # are unchanged.
    word_count = feats['len'].clip(lower=1)
    char_count = feats['length'].clip(lower=1)

    feats['p_mentions'] = feats['n_mentions'] / word_count
    feats['p_hashtags'] = feats['n_hashtags'] / word_count
    feats['p_links'] = feats['n_links'] / word_count
    feats['p_uppercase'] = feats['n_uppercase'] / char_count

    feats['n_len_p_word'] = feats['length'] / word_count
    feats['lpw_clean'] = clean_length / word_count

    feats['tot_text'] = feats['#_or_@']+" "+feats['clean_tweet']

    # Numeric features shared by both models, in the order the models receive them.
    numeric_cols = ['len', 'length', 'relevance', 'n_mentions', 'n_hashtags',
                    'n_links', 'n_uppercase', 'p_mentions', 'p_hashtags', 'p_links',
                    'p_uppercase', 'n_len_p_word', 'lpw_clean']

    X = feats[['tot_text'] + numeric_cols].copy()

    output = {'time_stamp':datetime.now().strftime("%d/%m/%YT%H:%M"),
              'team_name':'Untitled'}

    # Class label -> probability of the single row, rounded to 3 decimals.
    proba_by_class = {label: round(proba,3)
                      for label,proba in zip(supervised_model.classes_,
                                             supervised_model.predict_proba(X)[0])}
    rename_dict = {'proba_positive': 'POSITIVE',
                   'proba_negative': 'NEGATIVE',
                   'proba_neutral': 'NEUTRAL',
                   'proba_mixed': 'MIXED'}
    for out_key,label in rename_dict.items():
        output[out_key] = proba_by_class[label]

    output['class'] = supervised_model.predict(X)[0]

    # Flat vector for the clusterer: numeric features, then the rounded probabilities.
    unsup_row = append(feats[numeric_cols].values,
                       [proba_by_class['NEUTRAL'],proba_by_class['NEGATIVE'],
                        proba_by_class['POSITIVE'],proba_by_class['MIXED']])

    # NOTE(review): labels are looked up with keys 1-4; a clusterer that emits
    # 0-based labels would raise KeyError on label 0. Confirm against the
    # pickled model.
    cluster_dict = {1:'Indirectas',2:'Adictos al #',3:'Spam',4:'Haters'}

    # Wrap the vector in an outer tuple to get the (1, n_features) shape.
    output['cluster'] = cluster_dict[unsupervised_model.predict(array((unsup_row,)))[0]]
    return output

## Predict

In [8]:
# Load both pickled models once.
model_sup, model_unsup = read_models()
# Pick the text of one random tweet whose labelled sentiment is not NEUTRAL.
text = df.loc[df['sentiment'] != 'NEUTRAL', 'tweet_text'].sample().iloc[0]
# Run the whole pipeline on it; the resulting dict is the cell output.
full_pipeline(text, model_sup, model_unsup)

{'time_stamp': '07/06/2021T22:37',
 'team_name': 'Untitled',
 'proba_positive': 0.667,
 'proba_negative': 0.015,
 'proba_neutral': 0.297,
 'proba_mixed': 0.021,
 'class': 'POSITIVE',
 'cluster': 'Adictos al #'}