# 1 - Preparación del dataset

## 0. Imports

In [80]:
# Manejo de datos
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Utilities
import os
import sys

# Make the directory two levels above this notebook importable (presumably so that
# project-local modules such as `package_utilities` resolve — confirm).
# The path is assembled with os.path.join so the separator is correct on every OS:
# a hard-coded backslash path only resolves on Windows, and "\." inside a normal
# string literal is an invalid escape sequence that newer Python versions warn about.
sys.path.append(os.path.join(os.pardir, os.pardir))


## 1. Lectura del conjunto de datos

In [81]:
# Load the original training split from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../../data/originals/train.csv')

# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## 2. Limpieza del conjunto de datos

In [82]:
from package_utilities import clean_text_field

# Run the project's text-cleaning routine over the raw training frame.
# Its implementation is not shown in this notebook; judging by the rendered output it
# appears to lower-case the tweets and strip punctuation/stop words — confirm in
# package_utilities.
df_filtered = df.pipe(clean_text_field)

# Show the cleaned frame as the cell output.
df_filtered

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearb...,1
7609,10870,,,ariaahrary thetawniest control wild fires cali...,1
7610,10871,,,m194 0104 utc5km volcano hawaii,1
7611,10872,,,police investigating ebike collided car little...,1


In [83]:
"""
from package_utilities import remove_url, remove_emoji, remove_punctuation, remove_stopwords, remove_html

df_filtered = df.copy()

df_filtered['text'] = df_filtered['text'].apply(remove_url)
df_filtered['text'] = df_filtered['text'].apply(remove_html)
df_filtered['text'] = df_filtered['text'].apply(remove_emoji)
df_filtered['text'] = df_filtered['text'].apply(remove_punctuation)
df_filtered['text'] = df_filtered['text'].apply(remove_stopwords)

df_filtered.drop(axis=1, columns=["id", "keyword", "location"], inplace=True)

df_filtered
"""

'\nfrom package_utilities import remove_url, remove_emoji, remove_punctuation, remove_stopwords, remove_html\n\ndf_filtered = df.copy()\n\ndf_filtered[\'text\'] = df_filtered[\'text\'].apply(remove_url)\ndf_filtered[\'text\'] = df_filtered[\'text\'].apply(remove_html)\ndf_filtered[\'text\'] = df_filtered[\'text\'].apply(remove_emoji)\ndf_filtered[\'text\'] = df_filtered[\'text\'].apply(remove_punctuation)\ndf_filtered[\'text\'] = df_filtered[\'text\'].apply(remove_stopwords)\n\ndf_filtered.drop(axis=1, columns=["id", "keyword", "location"], inplace=True)\n\ndf_filtered\n'

In [84]:
# Summarise the cleaned training frame: columns, non-null counts, dtypes and memory use.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


### Guardado

In [85]:
# Persist the cleaned training frame as Parquet using the pyarrow engine.
# NOTE(review): per the info() output above, this frame still carries
# id/keyword/location, whereas the test split prepared below has those columns
# dropped — confirm the asymmetry is intended.
df_filtered.to_parquet("../../data/modifications/train_filtered.parquet", engine='pyarrow')

### Test

In [86]:
# Clean the test tweets with the same routine applied to the training frame, then
# discard the identifier/metadata columns so only the cleaned `text` column remains.
# The frame returned by read_csv is fresh, so no defensive copy is needed, and the
# non-inplace drop keeps this cell re-runnable with the same result.
test_filtered = clean_text_field(pd.read_csv('../../data/originals/test.csv')).drop(
    columns=["id", "keyword", "location"]
)

# Persist the cleaned test split next to train_filtered.parquet. The only other
# to_parquet call for this file sits inside a disabled (string-literal) cell, so
# without this line the cleaned test split is never written to disk.
test_filtered.to_parquet("../../data/modifications/test_filtered.parquet", engine='pyarrow')

# Show the cleaned test frame as the cell output.
test_filtered


Unnamed: 0,text
0,happened terrible car crash
1,heard earthquake different cities stay safe ev...
2,forest fire spot pond geese fleeing across str...
3,apocalypse lighting spokane wildfires
4,typhoon soudelor kills 28 china taiwan
...,...
3258,earthquake safety los angeles ûò safety faste...
3259,storm ri worse last hurricane city3others hard...
3260,green line derailment chicago
3261,meg issues hazardous weather outlook hwo


In [87]:
"""
test_filtered['text'] = test_filtered['text'].apply(remove_url)
test_filtered['text'] = test_filtered['text'].apply(remove_html)
test_filtered['text'] = test_filtered['text'].apply(remove_emoji)
test_filtered['text'] = test_filtered['text'].apply(remove_punctuation)
test_filtered['text'] = test_filtered['text'].apply(remove_stopwords)

test_filtered.drop(axis=1, columns=["id","keyword","location"], inplace=True)
test_filtered.to_parquet("../../data/modifications/test_filtered.parquet", engine='pyarrow')
test_filtered
"""

'\ntest_filtered[\'text\'] = test_filtered[\'text\'].apply(remove_url)\ntest_filtered[\'text\'] = test_filtered[\'text\'].apply(remove_html)\ntest_filtered[\'text\'] = test_filtered[\'text\'].apply(remove_emoji)\ntest_filtered[\'text\'] = test_filtered[\'text\'].apply(remove_punctuation)\ntest_filtered[\'text\'] = test_filtered[\'text\'].apply(remove_stopwords)\n\ntest_filtered.drop(axis=1, columns=["id","keyword","location"], inplace=True)\ntest_filtered.to_parquet("../../data/modifications/test_filtered.parquet", engine=\'pyarrow\')\ntest_filtered\n'

## 3. Vectorización del conjunto de datos (TfidfVectorizer)

### Train

In [88]:
# Labels: take every row of the `target` column from the cleaned training frame.
Y_train = df_filtered.loc[:, 'target']

# Show the label Series as the cell output.
Y_train

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

# Data: fit the vectorizer on the cleaned training text and encode that text.
# NOTE(review): the fit uses every row of df_filtered, i.e. it runs before the
# train/validation split in section 4, so rows that later become validation data
# contribute to the fitted vocabulary/IDF statistics — confirm this is intended.
X_train = vectorizer.fit_transform(df_filtered['text'])

# Features: the fitted feature names, wrapped in a single-column DataFrame so they
# can be displayed and saved alongside the matrix.
features = vectorizer.get_feature_names_out()
df_features = pd.DataFrame(features)

In [90]:
# Display the table of fitted feature names (one row per TF-IDF column).
df_features

Unnamed: 0,0
0,0011
1,001116
2,0025
3,005225
4,010156
...,...
17677,ûóher
17678,ûókody
17679,ûónegligence
17680,ûótech


In [91]:
# Convert the TF-IDF training matrix to a dense NumPy array.
# The conversion is guarded because X_train is reassigned to its own dense form:
# on a second run of this cell X_train is already an ndarray (which has no
# `.toarray()`), so an unguarded call would raise AttributeError.
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()

# Show the dense matrix as the cell output.
X_train


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [92]:
# Number of rows (documents) in the dense training matrix.
X_train.shape[0]

7613

### Test

In [93]:
# Encode the cleaned test text with the already-fitted vectorizer (transform only,
# no refit) and densify the result in one chained expression.
X_test = vectorizer.transform(test_filtered['text']).toarray()

# Show the dense test matrix as the cell output.
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Guardado

In [94]:
# Quick look at the dense training matrix before it is written to disk.
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [95]:
def _as_parquet_frame(data):
    """Wrap `data` in a new DataFrame and convert its column labels to strings."""
    frame = pd.DataFrame(data)
    frame.columns = frame.columns.astype(str)
    return frame


# Training matrix as it stands at this point in the notebook (before the split in section 4).
X_train_df = _as_parquet_frame(X_train)
X_train_df.to_parquet("../../data/modifications/TfidfVectorizer/X_Train.parquet", engine='pyarrow')

# Training labels.
Y_train_df = _as_parquet_frame(Y_train)
Y_train_df.to_parquet("../../data/modifications/TfidfVectorizer/Y_train.parquet", engine='pyarrow')

# Feature-name table: its column labels are converted in place on the existing frame
# (no new wrapper frame is created for it).
df_features.columns = df_features.columns.astype(str)
df_features.to_parquet("../../data/modifications/TfidfVectorizer/X_Train_Features.parquet", engine='pyarrow')

# Test matrix.
X_test_df = _as_parquet_frame(X_test)
X_test_df.to_parquet("../../data/modifications/TfidfVectorizer/X_test.parquet", engine='pyarrow')


## 4. División del conjunto de datos

In [96]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as a validation set; random_state=42 fixes the shuffle.
# NOTE: X_train / Y_train are overwritten with the reduced training portion, so this
# cell is not idempotent — executing it a second time would split the already-split
# data again. Re-run the notebook from the vectorization section before re-executing it.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.15, random_state=42)

In [97]:
# Inspect the training portion produced by the split above.
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [98]:
# Report the number of rows in the training, validation and test subsets.
# The label strings are kept verbatim so the printed output is unchanged.
for label, subset in (
    ("Longitud subcojunto de entrenamiento: ", X_train),
    ("Longitud subconjunto de validación: ", X_val),
    ("Longitud subconjutno de pruebas: ", X_test),
):
    print(label, len(subset))

Longitud subcojunto de entrenamiento:  6471
Longitud subconjunto de validación:  1142
Longitud subconjutno de pruebas:  3263


In [102]:
# Wrap each piece of the train/validation split in its own DataFrame.
df_X_train, df_X_val, df_Y_train, df_Y_val = (
    pd.DataFrame(part) for part in (X_train, X_val, Y_train, Y_val)
)

# Convert the column labels of every frame to strings and write it to Parquet,
# in the order X_train, X_val, Y_train, Y_val.
# NOTE(review): the X_* frames are built from plain arrays and therefore get a fresh
# 0..n-1 index, while the Y_* frames presumably keep the row labels selected by
# train_test_split — confirm downstream code pairs X and Y rows by position, not by index.
for frame, destination in (
    (df_X_train, "../../data/train_set_test/TfidfVectorizer/X_train.parquet"),
    (df_X_val, "../../data/train_set_test/TfidfVectorizer/X_val.parquet"),
    (df_Y_train, "../../data/train_set_test/TfidfVectorizer/Y_train.parquet"),
    (df_Y_val, "../../data/train_set_test/TfidfVectorizer/Y_val.parquet"),
):
    frame.columns = frame.columns.astype(str)
    frame.to_parquet(destination, engine='pyarrow')
