In [171]:
import pandas as pd
import numpy as np
import re
import string
from wordcloud import STOPWORDS
from flashtext import KeywordProcessor

In [172]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [173]:
# Function which plays a beep of given duration and frequency.
# Useful for when executing things that need a while to finish, to get notified.
import os
def beep(duration = 1, freq = 1500):
    """ play tone of duration in seconds and freq in Hz. """
    os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))

In [174]:
train = pd.read_csv('../data/train.csv',usecols=["id", "keyword", "text", "target"],dtype={'id':'int32','target':'int8'})
test = pd.read_csv('../data/test.csv',usecols=["id", "keyword", "text"],dtype={'id':'int32'})
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [175]:
train.keyword.fillna('other',inplace=True)

# quito el numero 20 que aparece en la keyword
train.loc[:,'keyword'] = train['keyword'].apply(lambda s: re.sub('20', ' ', s))

In [176]:
test.keyword.fillna('other',inplace=True)

# quito el numero 20 que aparece en la keyword
test.loc[:,'keyword'] = train['keyword'].apply(lambda s: re.sub('20', ' ', s))

In [177]:
train.text = train.text.astype(str)
train.keyword = train.keyword.astype(str)

In [178]:
test.text = test.text.astype(str)
test.keyword = test.keyword.astype(str)

In [179]:
#traduzco los pocos casos que no estaban en ingles
train.text = train.text.str.replace('Acesse nosso site para ouvir','Visit our website to listen')
train.text = train.text.str.replace('quem lembra','who remembers')
train.text = train.text.str.replace('Sismo DETECTADO','Earthquake DETECTED')

# Agregamos features que ya hemos creado para analisis de tp1

In [180]:
def get_list_length(x):
    return len(x)

In [181]:
def to_lowercase(x):
    return x.lower()

In [182]:
def extract_hashtags(x):
    return re.findall(r'#\w+', x)

In [183]:
def extract_tags(x):
    return re.findall(r'@\w+', x)

In [184]:
def extract_links(x):
    return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)

In [185]:
def clean_text(text):
    # Remove line breaks
    text = re.sub(r'\n',' ', text) 
    
    # Remove leading, trailing, and extra spaces
    text = re.sub('\s+', ' ', text).strip() 
    
    #abreviations
    text = re.sub(r"w/e", "whatever", text)
    text = re.sub(r"w/", "with", text)
    
    #HTML stuff
    text = re.sub(r"&gt;", ">", text)
    text = re.sub(r"&lt;", "<", text)
    text = re.sub(r"&amp;", "&", text)
    
    #annoying characters
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)
    text = re.sub(r"\x89ÛÏ", "", text)
    text = re.sub(r"\x89Û÷", "", text)
    text = re.sub(r"\x89Ûª", "", text)
    text = re.sub(r"\x89Û\x9d", "", text)
    text = re.sub(r"å_", "", text)
    text = re.sub(r"\x89Û¢", "", text)
    text = re.sub(r"\x89Û¢åÊ", "", text)
    text = re.sub(r"åÊ", "", text)
    text = re.sub(r"åÈ", "", text)
    text = re.sub(r"Ì©", "e", text)
    text = re.sub(r"å¨", "", text)
    text = re.sub(r"åÇ", "", text)
    text = re.sub(r"åÀ", "", text)
    
    #punctuactions
    text = text.replace('...', ' ... ')
    if '...' not in text:
        text = text.replace('..', ' ... ')
    
    return text

In [186]:
train.insert(loc=train.shape[1] - 1, column='text_clean', value = train['text'].apply(lambda x: clean_text(x)))
test.insert(loc=test.shape[1], column='text_clean', value = test['text'].apply(lambda x: clean_text(x)))

In [187]:
def create_peculiar_features(df, pos):
    df_temp = df.copy()
    
    #tags
    df_temp['tags'] = df_temp['text_clean'].apply(extract_tags)
    df.insert(loc=df.shape[1] - pos, column='tags_count', value = df_temp['tags'].apply(get_list_length))
    
    #links
    df_temp['links'] = df_temp['text_clean'].apply(extract_tags)
    df.insert(loc=df.shape[1] - pos, column='links_count', value = df_temp['links'].apply(get_list_length))
    
    #hashtags
    df_temp['text_clean'] = df_temp['text_clean'].apply(to_lowercase)
    df_temp['hashtags'] = df_temp['text_clean'].apply(extract_hashtags)
    df.insert(loc=df.shape[1] - pos, column='hashtags_count', value = df_temp['hashtags'].apply(get_list_length))
    
    del(df_temp)
    
    return df

In [188]:
def create_common_numerical_features(df,pos):
    # Tweet length
    df.insert(loc=df.shape[1] - pos, column='text_len', value = df['text'].apply(len))
    
    # Word count
    df.insert(loc=df.shape[1] - pos, column='word_count', value = df['text'].apply(lambda x: len(str(x).split())))
    
    # Stopword count
    df.insert(loc=df.shape[1] - pos, column='stop_word_count', value = \
              df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS])))
    
    # Punctuation count
    df.insert(loc=df.shape[1] - pos, column='punctuation_count', value = \
              df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation])))

    # Count of uppercase letters
    df.insert(loc=df.shape[1] - pos, column='caps_count', value=\
              df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper())))
    
    # Ratio of uppercase letters
    df.insert(loc=df.shape[1] - pos, column='caps_ratio', value = df['caps_count'] / df['text_len'])
    #df['caps_ratio'] = df['caps_count'] / df['text_len']
    return df

In [189]:
train = create_peculiar_features(train,1)
train = create_common_numerical_features(train,1)

In [190]:
test = create_peculiar_features(test,0)
test = create_common_numerical_features(test,0)

In [191]:
mi_lista =["building burning","buildings burning", "buildings on fire","buldings","flames","bush fire","wildfire","hellfire","fire","truck fire","wild fires","bush fires","forest fires","arson","arsonist","burning","ablaze","blazing","blaze","burned"\
,"accident","airplane accident","bridge collapse","oil spill","collapse","electrocuted","electrocute","cliff fall","traped","collapsed","crashed","crushed","crush","wrecked","wreckage","wreck","collide","collided","collision","crash"\
,"apocalypse","armageddon","annihilated","annihilation","catastrophic","famine","ruin","catastrophe","razed","devastation","disaster","heat wave","pandemonium","destruction","desolation","desolate","destroyed","destroy","blight","demolition"\
,"terrorism","bioterror","bioterrorism","terrorist","threat","hijacker","hijacking","mass murder","mass murderer","massacre","massac","hostage","attack","hijack","attacked","detonate","suicide bomber","suicide bomb","blown up","suicide bombing","blew up", "blow up","bombing","bomb","bombed","exploded","explosion","explode"\
,"fatality","fatal","fatalities","casualty","casualties","deaths","death","drown","drowned", "drowning","drownet","tragedy","trauma","traumatised","blood","bleeding","dead","bloody","body bagging","body bag","body bags"\
,"thunderstorm","storm","twister","typhoon","hurricane","tornado","windstorm","rainstorm","sandstorm","hailstorm","hail","earthquake","flooding","mudslide","seismic","floods","inundated","inundation","landslide","lava","lightning","flood","thunder","tsunami","cyclone","deluge","snowstorm","avalanche","blizzard","whirlwind","volcano","drought"\
,"survivors","survivor","survive","survived","refugees","rescue","rescued","rescuers","wounded","deluged","derailed","flattened","harm"\
,"army","battle","military","police","weapons","weapon","war zone","hostages","mayhem","detonation","devastated","displaced","first responders","eyewitness","engulfed","injured","injuries","injury","obliterate","obliterated","obliteration"\
,"danger","damage","curfew","debris","trouble","demolish","meltdown","aftershock","nuclear reactor","structural failure","smoke","rubble","sinking","sinkhole","hazardous","hazard","fear","demolished","derail","derailet","derailment","epicentre","loud bang","rioting","riot"\
,"emergency","services emergency","plan emergency","ambulance","chemical emergency","evacuate","upheaval","sirens","wounds","siren","stretcher","evacuated","evacuation","outbreak","panic","screamed","screaming","screams","panicking"\
,"quarantine","quarantined"]

def encuentra_key(tweet): 
    for s in tweet:
        if s in mi_lista:
            return s
    return 'other'    

key_aux = train.loc[train['keyword']=='other',['keyword','text_clean']]
key_aux['text_clean'] = key_aux['text_clean'].map(lambda x: x.lower())
key_aux['text_clean'] = key_aux['text_clean'].map(lambda x: x.split())
key_aux['keyword']= key_aux['text_clean'].map(encuentra_key)

In [192]:
train.loc[train['keyword']=='other','keyword'] = key_aux['keyword']

In [193]:
mis_kw = train['keyword']

In [194]:
keyword_processor = KeywordProcessor()
keyword_dict = {"Burning": ["building burning","buildings burning", "buildings on fire","buldings","flames","bush fire","wildfire","hellfire","fire","truck fire","wild fires","wild% fires","bush% fires","bush fires","forest% fires","forest fires","arson","arsonist","burning","ablaze","blazing","blaze","burned"]\
               ,"Accident": ["accident","airplane accident","bridge collapse","oil spill","oil% spill","collapse","electrocuted","electrocute","cliff fall","cliff% fall","traped","trapped","collapsed","crashed","crushed","crush","wrecked","wreckage","wreck","collide","collided","collision","crash"]\
               ,"Apocalypse": ["apocalypse","armageddon","annihilated","annihilation","catastrophic","famine","ruin","catastrophe","razed","devastation","disaster","heat% wave","heat wave","pandemonium","destruction","desolation","desolate","destroyed","destroy","blight","demolition"]\
               ,"Attack": ["terrorism","bioterror","bioterrorism","terrorist","threat","hijacker","hijacking","mass% murderer","mass% murder","mass murder","mass murderer","massacre","massac","hostage","attack","hijack","attacked","detonate","suicide% bomber","suicide bomber","suicide bomb","blown up","blown% up","suicide bombing","blew% up","blew up", "blow up","bombing","bomb","bombed","exploded","explosion","explode"]\
               ,"Fatality": ["fatality","fatal","fatalities","casualty","casualties","deaths","death","drown","drowned", "drowning","drownet","tragedy","trauma","traumatised","blood","bleeding","dead","bloody","body bagging","body bag","body% bag","body bags","body% bagging","body% bags","sunk"]\
               ,"Natural Phenomenon": ["thunderstorm","storm","twister","typhoon","hurricane","tornado","windstorm","rainstorm","sandstorm","hailstorm","hail","earthquake","flooding","mudslide","seismic","floods","inundated","inundation","landslide","lava","lightning","flood","thunder","tsunami","cyclone","deluge","snowstorm","avalanche","blizzard","whirlwind","volcano","drought"]\
               ,"Survivor": ["survivors","survivor","survive","survived","refugees","rescue","rescued","rescuers","wounded","deluged","derailed","flattened","harm"]\
               ,"Security": ["army","battle","military","police","weapons","weapon","war zone","war% zone","hostages","mayhem","detonation","devastated","displaced","first responders","first% responders","eyewitness","engulfed","injured","injuries","injury","obliterate","obliterated","obliteration"]\
               ,"Danger": ["danger","damage","curfew","debris","trouble","demolish","meltdown","aftershock","nuclear% reactor","nuclear reactor","structural% failure","structural failure","smoke","rubble","sinking","sinkhole","hazardous","hazard","fear","demolished","derail","derailet","derailment","epicentre","loud% bang","loud bang","rioting","riot"]\
               ,"Emergency": ["emergency","services emergency","plan emergency","ambulance","chemical emergency","evacuate","upheaval","sirens","wounds","siren","stretcher","evacuated","evacuation","outbreak","panic","screamed","screaming","screams","panicking","quarantine","quarantined"]\
               ,"Otro": ["other"]}
key_depuradas=[]
keyword_processor.add_keywords_from_dict(keyword_dict)
for w in mis_kw:
       if len(keyword_processor.extract_keywords(w))==1:
            key_depuradas.append(keyword_processor.extract_keywords(w)[0])
       
       else: 
            key_depuradas.append(w)

In [195]:
train.insert(loc=train.shape[1] - 1, column='key_global', value = key_depuradas)

In [196]:
key_aux = test.loc[test['keyword']=='other',['keyword','text_clean']]
key_aux['text_clean'] = key_aux['text_clean'].map(lambda x: x.lower())
key_aux['text_clean'] = key_aux['text_clean'].map(lambda x: x.split())
key_aux['keyword']= key_aux['text_clean'].map(encuentra_key)

In [197]:
test.loc[test['keyword']=='other','keyword'] = key_aux['keyword']

In [198]:
mis_kw = test['keyword']

In [199]:
keyword_processor = KeywordProcessor()
key_depuradas=[]
keyword_processor.add_keywords_from_dict(keyword_dict)
for w in mis_kw:
       if len(keyword_processor.extract_keywords(w))==1:
            key_depuradas.append(keyword_processor.extract_keywords(w)[0])
       
       else: 
            key_depuradas.append(w)

In [200]:
test.insert(loc=test.shape[1], column='key_global', value = key_depuradas)

In [201]:
mi_lista=[]
mi_lista = list(train['text_clean'])
def encuentra_RT(tweet): 
        count = 0
        for i in mi_lista:
            if i==tweet:
                count +=1
            if count >= 2:
                return 1
        else:
            return 0

train.insert(loc=train.shape[1] - 1, column='retweet_bool', value = 0)
train['retweet_bool'] = train['text_clean'].map(encuentra_RT)

In [202]:
mi_lista=[]
mi_lista = list(test['text_clean'])

test.insert(loc=test.shape[1], column='retweet_bool', value = 0)
test['retweet_bool'] = test['text_clean'].map(encuentra_RT)

# * Fin agregado de features de tp1 *

Hagamos la funcion para guardar submissions ahora, para evitar problemas a futuro y despreocuparnos.

In [203]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="fc", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [204]:
# Define a seed, so all algorithms that accept a seed, take the same, for consistency reasons,
# so everything can be replicated without problems random state
seed=42

In [205]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop('target', axis=1), train['target'], test_size=0.33, random_state=seed)

***

# Approach n....

## Entrenamiento local

In [208]:
cat_columns = ['key_global']

num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

bool_columns = ['retweet_bool']

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

transformers.append(("cat",
                     Pipeline(steps=[
                         ("category_imputer", SimpleImputer(strategy='constant', fill_value="")),
                         ("one_hot", OneHotEncoder(handle_unknown='ignore')),
                         ("svd", TruncatedSVD(n_components=4, n_iter=7, random_state=seed))
                     ]),
                     cat_columns))

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))


transformers.append(("bool",
                    Pipeline(steps=[
                        ("bool_imputer", SimpleImputer(strategy='most_frequent')),
                    ]),
                     bool_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
steps.append(("lr", LogisticRegression(solver='liblinear', random_state=seed)))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_scores))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  16.9s
[Pipeline] ................ (step 2 of 2) Processing lr, total=   0.1s
0.2793473935535217


In [209]:
beep()

## Entrenamiento con todos los datos para obtener predicciones a subir

In [210]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [211]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  14.9s
[Pipeline] ................ (step 2 of 2) Processing lr, total=   0.1s


In [281]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [284]:
description = "1st simple_approach. LinearRegressor"
save_submission(df_predictions, description=description)

In [285]:
beep()

## K folds en nuestro train set

In [287]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "username_approach_n"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  15.2s
[Pipeline] ................ (step 2 of 2) Processing lr, total=   0.0s
0.30102442868400314
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  14.8s
[Pipeline] ................ (step 2 of 2) Processing lr, total=   0.0s
0.28329393223010246
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  14.7s
[Pipeline] ................ (step 2 of 2) Processing lr, total=   0.0s
0.29168309026409145


***