In [2]:
import pandas as pd
import numpy as np
import beepy
import re
import string
from wordcloud import STOPWORDS

In [3]:
# This will allow you to see all column names & rows when you are doing .head(). None of the column name will be truncated.
# source: https://stackoverflow.com/questions/49188960/how-to-show-all-of-columns-name-on-pandas-dataframe

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [4]:
train = pd.read_csv("data/train.csv",usecols=["id", "keyword", "text", "target"],dtype={'id':'int32','target':'int8'})
test = pd.read_csv('data/test.csv',usecols=["id", "keyword", "text"],dtype={'id':'int32'})
sample_submission = pd.read_csv('data/sample_submission.csv')

In [5]:
train.text = train.text.astype(str)
train.keyword = train.keyword.astype(str)

In [6]:
test.text = test.text.astype(str)
test.keyword = test.keyword.astype(str)

In [7]:
#traduzco los pocos casos que no estaban en ingles
train.text = train.text.str.replace('Acesse nosso site para ouvir','Visit our website to listen')
train.text = train.text.str.replace('quem lembra','who remembers')
train.text = train.text.str.replace('Sismo DETECTADO','Earthquake DETECTED')

# Agregamos features que ya hemos creado para analisis de tp1

In [8]:
def get_list_length(x):
    return len(x)

In [9]:
def to_lowercase(x):
    return x.lower()

In [10]:
def extract_hashtags(x):
    return re.findall(r'#\w+', x)

In [11]:
def extract_tags(x):
    return re.findall(r'@\w+', x)

In [12]:
def extract_links(x):
    return re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x)

In [13]:
def clean_text(text):
    text = re.sub(r'\n',' ', text) # Remove line breaks
    text = re.sub('\s+', ' ', text).strip() # Remove leading, trailing, and extra spaces
    return text

In [14]:
train.insert(loc=train.shape[1] - 1, column='text_clean', value = train['text'].apply(lambda x: clean_text(x)))
test.insert(loc=test.shape[1], column='text_clean', value = test['text'].apply(lambda x: clean_text(x)))

In [15]:
#para que tengan el mismo orden repito esta funcion ya que el shape cambia segun tiene el target o no
#es la diferencia entre test y train
def create_peculiar_features(df, pos):
    df_temp = df.copy()
    
    #tags
    df_temp['tags'] = df_temp['text_clean'].apply(extract_tags)
    df.insert(loc=df.shape[1] - pos, column='tags_count', value = df_temp['tags'].apply(get_list_length))
    
    #links
    df_temp['links'] = df_temp['text_clean'].apply(extract_tags)
    df.insert(loc=df.shape[1] - pos, column='links_count', value = df_temp['links'].apply(get_list_length))
    
    #hashtags
    df_temp['text_clean'] = df_temp['text_clean'].apply(to_lowercase)
    df_temp['hashtags'] = df_temp['text_clean'].apply(extract_hashtags)
    df.insert(loc=df.shape[1] - pos, column='hashtags_count', value = df_temp['hashtags'].apply(get_list_length))
    
    del(df_temp)
    
    return df

In [16]:
def create_common_numerical_features(df,pos):
    # Tweet length
    df.insert(loc=df.shape[1] - pos, column='text_len', value = df['text'].apply(len))
    
    # Word count
    df.insert(loc=df.shape[1] - pos, column='word_count', value = df['text'].apply(lambda x: len(str(x).split())))
    
    # Stopword count
    df.insert(loc=df.shape[1] - pos, column='stop_word_count', value = \
              df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS])))
    
    # Punctuation count
    df.insert(loc=df.shape[1] - pos, column='punctuation_count', value = \
              df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation])))

    # Count of uppercase letters
    df.insert(loc=df.shape[1] - pos, column='caps_count', value=\
              df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper())))
    
    # Ratio of uppercase letters
    df.insert(loc=df.shape[1] - pos, column='caps_ratio', value = df['caps_count'] / df['text_len'])
    #df['caps_ratio'] = df['caps_count'] / df['text_len']
    return df

In [17]:
train = create_peculiar_features(train,1)
train = create_common_numerical_features(train,1)

In [18]:
test = create_peculiar_features(test,0)
test = create_common_numerical_features(test,0)

# * Fin agregado de features de tp1 *

Hagamos la funcion para guardar submissions ahora, para evitar problemas a futuro y despreocuparnos.

In [19]:
# To save predictions.
# There must be a directory ../predictions for this to work as expected.
import time
def _get_filename(my_name, timestamp):
    return "../predictions/" + timestamp + " by " + my_name + ".csv"

def _save_description(authors_name, timestamp, submission_description):
    f = open("../predictions/" + authors_name + ".txt","a")
    f.write(timestamp + ": " + submission_description + '\n')
    f.close()

def save_submission(submission_df, authors_name="LLL", description = "no description.", index=False, header=True):
    timestamp = time.strftime("%Y.%m.%d - %H:%M:%S")
    submission_df.to_csv(_get_filename(authors_name, timestamp), index=index, header=header)
    _save_description(authors_name, timestamp, description)

In [20]:
# Define a seed, so all algorithms that accept a seed, take the same, for consistency reasons,
# so everything can be replicated without problems random state
seed=42

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train.drop('target', axis=1), train['target'], test_size=0.33, random_state=seed)

***

# Approach nº1 - Label Propagation - kernel rbf

In [142]:
from sklearn.semi_supervised import LabelPropagation

## Entrenamiento local

In [143]:
num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
label_propagation_model = LabelPropagation(kernel='rbf')
steps.append(("LP", label_propagation_model))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,mean_absolute_error
print("F1 Score: " + str(f1_score(y_test, y_scores, average="macro")))
print("Precision: " + str(precision_score(y_test, y_scores, average="macro")))
print("Recall: " + str(recall_score(y_test, y_scores, average="macro")))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.7s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.6s
F1 Score: 0.6333432276006985
Precision: 0.633265080737879
Recall: 0.6334266003492166


In [144]:
os.system('say "Terminé gato."')

0

## Entrenamiento con todos los datos para obtener predicciones a subir

In [145]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [146]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.9s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   1.3s


In [147]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [148]:
description = "1st simple_approach. LabelPropagation - kernel rbf"
save_submission(df_predictions, description=description)

In [149]:
os.system('say "Terminé gato."')

0

## K folds en nuestro train set

In [150]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_1"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.5s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.6s
0.3565799842395587
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.8s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.6s
0.3459416863672183
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.0s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.6s
0.3586913677571935


  probabilities /= normalizer


# Approach nº2 - Label Propagation - kernel KNN

In [21]:
from sklearn.semi_supervised import LabelPropagation

## Entrenamiento local

In [36]:
num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
label_propagation_model = LabelPropagation(kernel='knn')
steps.append(("LP", label_propagation_model))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,mean_absolute_error
print("F1 Score: " + str(f1_score(y_test, y_scores, average="macro")))
print("Precision: " + str(precision_score(y_test, y_scores, average="macro")))
print("Recall: " + str(recall_score(y_test, y_scores, average="macro")))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  24.0s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.7s
F1 Score: 0.6591500943743005
Precision: 0.6648008592433072
Recall: 0.657527276875354


In [37]:
os.system('say "Terminé gato."')

0

## Entrenamiento con todos los datos para obtener predicciones a subir

In [38]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [39]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.8s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   1.4s


In [40]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [41]:
description = "2º simple_approach. LabelPropagation with kernel KNN"
save_submission(df_predictions, description=description)

In [42]:
os.system('say "Terminé gato."')

0

## K folds en nuestro train set

In [43]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_2"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.7s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.6s
0.3262411347517731
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.9s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.7s
0.314026792750197
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.9s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.8s
0.34213638155301535


***

# Hagamos un poco de busqueda de hiper parametros

In [160]:
from sklearn.model_selection import GridSearchCV

num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

lbp = LabelPropagation()

param_dist = {"kernel": ["rbf", "knn"],
              'gamma': [10,15,20,25,30,50],
              "n_neighbors": [2,3,5,7,9,11,13]
             }

grid_search = GridSearchCV(lbp, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)

steps.append(("grid-search", grid_search))

my_pipeline2 = Pipeline(steps, verbose=True)

my_pipeline2.fit(X_train, y_train)
grid_search.best_estimator_

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.8s
Fitting 3 folds for each of 84 candidates, totalling 252 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done 252 out of 252 | elapsed:   59.6s finished


[Pipeline] ....... (step 2 of 2) Processing grid-search, total= 1.0min


LabelPropagation(gamma=10, kernel='knn', max_iter=1000, n_jobs=None,
                 n_neighbors=9, tol=0.001)

In [161]:
os.system('say "Terminé gatoooooooo."')

0

# Approach nº3 - Label Propagation - luego del grid search de parametros

In [162]:
from sklearn.semi_supervised import LabelPropagation

## Entrenamiento local

In [163]:
num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
label_propagation_model = LabelPropagation(kernel='knn',n_neighbors=9,gamma=10)
steps.append(("LP", label_propagation_model))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,mean_absolute_error
print("F1 Score: " + str(f1_score(y_test, y_scores, average="macro")))
print("Precision: " + str(precision_score(y_test, y_scores, average="macro")))
print("Recall: " + str(recall_score(y_test, y_scores, average="macro")))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.1s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.7s
F1 Score: 0.6690880612737915
Precision: 0.6745199600879364
Recall: 0.6673225820250674


In [164]:
os.system('say "Terminé gato."')

0

## Entrenamiento con todos los datos para obtener predicciones a subir

In [165]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [166]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.8s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   1.5s


In [167]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [168]:
description = "3º simple_approach. LabelPropagation - best hiper parameters"
save_submission(df_predictions, description=description)

In [169]:
os.system('say "Terminé gato."')

0

## K folds en nuestro train set

In [170]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_3"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  25.3s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.7s
0.3191489361702128
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.8s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   0.8s
0.31008668242710796
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.0s
[Pipeline] ................ (step 2 of 2) Processing LP, total=   1.0s
0.3318880567599527


# Approach nº4 - Label Spreading - kernel rbf

In [1]:
from sklearn.semi_supervised import LabelSpreading

## Entrenamiento local

In [23]:
num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
label_spreading_model = LabelSpreading(kernel='rbf')
steps.append(("LSP", label_spreading_model))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,mean_absolute_error
print("F1 Score: " + str(f1_score(y_test, y_scores, average="macro")))
print("Precision: " + str(precision_score(y_test, y_scores, average="macro")))
print("Recall: " + str(recall_score(y_test, y_scores, average="macro")))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  24.8s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   1.0s
F1 Score: 0.6360590568018349
Precision: 0.6360262393465816
Recall: 0.6360927148025578


In [25]:
import os
os.system('say "Terminé gato."')

0

## Entrenamiento con todos los datos para obtener predicciones a subir

In [26]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [27]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.6s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   2.6s


In [28]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [29]:
description = "4º simple_approach. LabelSpreading - kernel rbf"
save_submission(df_predictions, description=description)

In [30]:
os.system('say "Terminé gato."')

0

## K folds en nuestro train set

In [31]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_4"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  21.4s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   1.0s
0.3538219070133964
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.0s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   1.1s
0.3459416863672183
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.3s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   1.0s
0.35672053606621995


  probabilities /= normalizer


# Approach nº5 - Label Spreading - kernel KNN

In [22]:
from sklearn.semi_supervised import LabelSpreading

## Entrenamiento local

In [32]:
num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
label_spreading_model = LabelSpreading(kernel='knn')
steps.append(("LSP", label_spreading_model))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,mean_absolute_error
print("F1 Score: " + str(f1_score(y_test, y_scores, average="macro")))
print("Precision: " + str(precision_score(y_test, y_scores, average="macro")))
print("Recall: " + str(recall_score(y_test, y_scores, average="macro")))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.1s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   0.7s
F1 Score: 0.6583197657369662
Precision: 0.6639488712260626
Recall: 0.6567128918478535


In [33]:
os.system('say "Terminé gato."')

0

## Entrenamiento con todos los datos para obtener predicciones a subir

In [34]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [35]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  22.4s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   1.4s


In [36]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [37]:
description = "5º simple_approach. LabelSpreading with kernel KNN"
save_submission(df_predictions, description=description)

In [38]:
os.system('say "Terminé gato."')

0

## K folds en nuestro train set

In [39]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_5"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  24.0s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   0.7s
0.3262411347517731
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.8s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   0.7s
0.3144208037825059
[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.9s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   0.8s
0.3417422152148207


***

# Hagamos un poco de busqueda de hiper parametros

In [42]:
from sklearn.model_selection import GridSearchCV

num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))

lbp = LabelSpreading()

param_dist = {"kernel": ["rbf", "knn"],
              'gamma': [10,15,20,25,30,50],
              "n_neighbors": [2,3,5,7,9,11,13],
              "alpha" : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]
             }

grid_search = GridSearchCV(lbp, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="neg_mean_absolute_error", verbose=5)

steps.append(("grid-search", grid_search))

my_pipeline2 = Pipeline(steps, verbose=True)

my_pipeline2.fit(X_train, y_train)
grid_search.best_estimator_

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  20.3s
Fitting 3 folds for each of 672 candidates, totalling 2016 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 2016 out of 2016 | elapsed: 11.9min finished


[Pipeline] ....... (step 2 of 2) Processing grid-search, total=11.9min


LabelSpreading(alpha=0.4, gamma=10, kernel='knn', max_iter=30, n_jobs=None,
               n_neighbors=13, tol=0.001)

In [43]:
os.system('say "Terminé gatoooooooo."')

0

# Approach nº6 - Label Spreading - luego del grid search de parametros

In [162]:
from sklearn.semi_supervised import LabelPropagation

## Entrenamiento local

In [45]:
num_columns = [
#     'id',
    'tags_count', 'links_count',
    'hashtags_count', 'text_len', 'word_count', 'stop_word_count',
    'punctuation_count', 'caps_count', 'caps_ratio'
    ]

text_columns = ['keyword',
                'text',
                'text_clean'
               ]

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD

transformers = []

#transformaciones para los numeros, simple imputer para los misssing values y standard scaler
#tal que la distribución tenga un valor medio 0 y una desviación estándar de 1.
transformers.append(("num",
                     Pipeline(steps=[
                         ("num_imputer", SimpleImputer(strategy='most_frequent',verbose=1)),
                         ("num_transformer", StandardScaler())
                     ]),
                   num_columns))

# The reason this for is necessary is because text transformers take an array-like parameter.
# If we pass a list of columns, then the transformer will receive a dataframe, and that will result in error.
# If you don't want to process all the text columns with the same pipeline, you'll have to define
# a different pipelines for each, and pass a different list for each of the pipelines.

#hashing vectorizer devuelve una matriz de un texto convertido y la SVD reduce su dimension para poder trabajarlo
for col in text_columns:
    # First, fill empty texts with an empty string.
    X_train[col] = X_train[col].fillna("")
    X_test[col] = X_test[col].fillna("")
    train[col] = train[col].fillna("")
    test[col] = test[col].fillna("")
    transformer_name = "text_" + col
    transformers.append((transformer_name,
                        Pipeline(steps=[
                            ("hashing_vectorizer", HashingVectorizer(decode_error='replace', strip_accents='ascii')),
                            ("svd", TruncatedSVD(n_components=20, n_iter=7, random_state=seed))
                        ]),
                         col))

my_col_transformer = ColumnTransformer(transformers, remainder='drop', sparse_threshold=0.3, 
                                       n_jobs=-1, 
                                       transformer_weights=None)

steps = []

steps.append(("col_trans", my_col_transformer))


#algoritmo a usar, pongo un regressor a modo de ejemplo pero hay que usar un classifier
from sklearn.linear_model import LogisticRegression
label_spreading_model = LabelSpreading(alpha=0.4, gamma=10, kernel='knn', max_iter=30, n_jobs=None,
               n_neighbors=13, tol=0.001)
steps.append(("LSP", label_spreading_model))

my_pipe = Pipeline(steps, verbose=True)

my_pipe.fit(X_train, y_train)

y_scores = my_pipe.predict(X_test)

#metrica a utilizar
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix,mean_absolute_error
print("F1 Score: " + str(f1_score(y_test, y_scores, average="macro")))
print("Precision: " + str(precision_score(y_test, y_scores, average="macro")))
print("Recall: " + str(recall_score(y_test, y_scores, average="macro")))

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  19.8s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   0.8s
F1 Score: 0.6593796871607807
Precision: 0.6647713811892917
Recall: 0.6577729210659014


In [46]:
os.system('say "Terminé gato."')

0

## Entrenamiento con todos los datos para obtener predicciones a subir

In [49]:
X = train.drop(['target'], axis=1) #set de datos
y = train['target'] #target

In [50]:
my_pipe.fit(X,y)

# prediciendo valores
predictions = my_pipe.predict(test)

[Pipeline] ......... (step 1 of 2) Processing col_trans, total=  23.3s
[Pipeline] ............... (step 2 of 2) Processing LSP, total=   1.4s


In [51]:
df_predictions = pd.DataFrame(data={'id':test['id'], 'target':predictions})

In [52]:
description = "6º simple_approach. LabelSpreading - best hiper parameters"
save_submission(df_predictions, description=description)

In [53]:
os.system('say "Terminé gato."')

0

## K folds en nuestro train set

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=seed)

df = pd.DataFrame([])

# UPDATE THIS VALUE
approach_numer = "lucioll_approach_6"

for train_index, test_index in kf.split(train):
    X = train.drop(['target'], axis=1) #set de datos
    y = train['target'] #target
    # for loop copied from docs: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
    X_train2, X_test2 = X.iloc[train_index], X.iloc[test_index]
    y_train2, y_test2 = y[train_index], y[test_index]
    
    my_pipe.fit(X_train2, y_train2)
    y_scores = my_pipe.predict(X_test2)
    
    print(mean_absolute_error(y_test2, y_scores))
    
    df = df.append(pd.DataFrame(data={'id':X_test2['id'], approach_numer:y_scores}))

df.to_csv("../predictions/on_train_data/" + approach_numer + ".csv", index=False, header=True)