## [75.06 / 95.58] Organización de Datos
## Trabajo Práctico 2: Competencia de Machine Learning
### Grupo 18: DATAVID-20

* 102732 - Bilbao, Manuel
* 101933 - Karagoz, Filyan
* 98684 - Markarian, Darío
* 100901 - Stroia, Lautaro

## 1. Importación general de librerias y set-up de datos.

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re as re
import os

#Para instalar gensim
#! pip3 install gensim
import gensim
from gensim.parsing.preprocessing import remove_stopwords

#Instalar tensorflow
#!pip3 install tensorflow
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer

#Para XGBoost
#! pip3 install xgboost
from xgboost import XGBClassifier

import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score



import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_rows = None #mostrar todas las filas del df
%matplotlib inline
plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid") # seteando tipo de grid en seaborn
pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs


### Limpieza de datos
**Definicion de funciones auxiliares**

In [2]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

### Carga de datos

In [3]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [4]:
for data in [test,train]:
    data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
    data['text'] = data['text'].apply(lambda x: minusculas(x))
    data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
    data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
    data['text'] = data['text'].apply(lambda x: remove_stopwords(x))
    data['text'] = data['text'].apply(lambda x: eliminar_url(x))


In [5]:
#Vectorizacion
X_train,X_valid,y_train,y_valid = train_test_split(train.text, train.target, test_size=0.33, 
                                                  random_state=42)
print(X_train.shape)
print(X_valid.shape)

(5100,)
(2513,)


In [6]:
#Concateno los tweets
data_text = pd.concat([train.text,test.text])

#Tokenizacion, ya que para entrenar xgboost, necesita vectors con valores numericos
vector = CountVectorizer()
vector.fit(data_text)
X_train_vec = vector.transform(X_train)
X_valid_vec = vector.transform(X_valid)

#Entrenamiento del XGB Classifier
model = XGBClassifier()
model.fit(X_train_vec, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [8]:
#Predicciones sobre set de training
pred1 = model.predict(X_valid_vec)
print("Accuracy: ", np.round(accuracy_score(y_valid,pred1),5))
print("F1 Score :", np.round(f1_score(y_valid, pred1),5))

pred2 = model.predict(X_train_vec)
print("Accuracy: ", np.round(accuracy_score(y_train,pred2),5))
print("F1 Score :", np.round(f1_score(y_train, pred2),5))

Accuracy:  0.78034
F1 Score : 0.70291
Accuracy:  0.83118
F1 Score : 0.77781


In [24]:
#Predicciones con el set de TEST y submit
test_vec = vector.transform(test.text)
y_pred = model.predict(test_vec)

submit = pd.DataFrame(test['id'])
submit['target'] = y_pred
submit.to_csv('submission-xgboost.csv',index=False)