# One-Hot-encoding

## Importación de librerias

In [478]:
import pandas as pd
import numpy as np
import re as re
import string
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

## Carga de datos

In [479]:
tweets = pd.read_csv('train.csv',usecols = ['id','text','target'])

## Limpieza de datos

### Definicion de funciones auxiliares

In [480]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

### Limpieza

In [481]:
for data in [tweets]:
    data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
    data['text'] = data['text'].apply(lambda x: minusculas(x))
    data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
    data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
    data['text'] = data['text'].apply(lambda x: eliminar_url(x))

## Particionando los datos

In [482]:
X_train,X_test,y_train,y_test= train_test_split(tweets.text, tweets.target, test_size=0.32,random_state=20)

## Función de tokenización

In [483]:
def convertir_en_tokens(texto):
    tokens = []
    for palabra in texto.split():
        if not palabra in tokens:
            tokens.append(palabra)
    return tokens

## One-Hot-encoding

In [484]:
vectorizer = CountVectorizer( tokenizer = convertir_en_tokens, binary = True)
X_train = vectorizer.fit_transform(X_train)

In [485]:
X_test = vectorizer.transform(X_test)

In [486]:
clasificador = LinearSVC()
clasificador.fit(X_train,y_train)

LinearSVC()

## Pruebas

### Predicciones sobre el set de training

In [487]:
predicciones = clasificador.predict(X_train)

In [488]:
accuracy = accuracy_score (y_train,predicciones)
print (f"Accuracy: {accuracy:.4%}")

Accuracy: 99.6136%


### Predicciones sobre el set de test

In [489]:
predicciones = clasificador.predict(X_test)

In [490]:
accuracy = accuracy_score (y_test,predicciones)
print (f"Accuracy: {accuracy:.4%}")

Accuracy: 78.9906%


## Preparando el submit de Kaggle

### Lectura de los datos

In [491]:
test = pd.read_csv('test.csv',usecols =['id','text'])

### Limpieza de datos

In [492]:
for data in [tweets]:
    data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
    data['text'] = data['text'].apply(lambda x: minusculas(x))
    data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
    data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
    data['text'] = data['text'].apply(lambda x: eliminar_url(x))

### Realizando Predicciones

In [493]:
submit = pd.DataFrame(test['id'],columns=['id'])
texts = vectorizer.transform(test['text'])
predicciones = clasificador.predict(texts)
submit['target'] = predicciones

### Resultado

In [494]:
submit.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
