In [10]:
import pandas as pd
import numpy as np

In [11]:
# Build the list of column names from the dataset's ".names" description file.
# Attribute declarations are matched as "<name>: ... continuous.", and the
# column name is the text before the first ':'.
column_names = []

with open('data/spambase.names', 'r') as f:
    # Iterate the file lazily instead of materializing every line with readlines()
    for line in f:
        entry = line.strip()
        # Skip blank lines and comment lines, and require the ':' separator, so
        # free text that merely ends in "continuous." is never taken as a column.
        # NOTE(review): assumes '|' introduces a comment, as in C4.5-style
        # .names files - confirm against the actual file.
        if not entry or entry.startswith('|') or ':' not in entry:
            continue
        if entry.endswith('continuous.'):
            # The column name is the part before the first ':', without padding
            column_names.append(entry.split(':', 1)[0].strip())

# The target column is not collected by the loop above; append it as the last name
column_names.append('spam')

In [12]:
# Path to the raw data file (comma-separated, no header row)
data_path = "data/spambase.data"

# Load the data and attach the column names parsed earlier in one chained step
df = pd.read_csv(data_path, header=None).set_axis(column_names, axis=1)

# Preview the first 5 rows
df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [13]:
# Summarize the DataFrame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [14]:
# Count how many rows fall into each value of the target column `spam`
df['spam'].value_counts()

spam
0    2788
1    1813
Name: count, dtype: int64

In [15]:
# Import the helper used to split the data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Show the number of rows in the train and test partitions
train.shape[0], test.shape[0]

(3680, 921)

In [17]:
# Import the classifier used to train the model
from sklearn.ensemble import RandomForestClassifier

# Create a random forest with 100 estimators; the fixed random_state keeps
# training reproducible across runs
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [18]:
# Split the training partition into the target vector and the feature matrix
y_train = train['spam']
X_train = train.drop(columns='spam')

In [19]:
# Fit the model on the training features and labels
model.fit(X_train, y_train)

In [20]:
# Split the test partition into the target vector and the feature matrix
y_test = test['spam']
X_test = test.drop(columns='spam')

In [21]:
# Predict the labels for the test features
y_pred = model.predict(X_test)

In [23]:
# Import the metric used to evaluate the model
from sklearn.metrics import accuracy_score

In [26]:
# Compute the accuracy of the predictions on the test set, scaled to a percentage
accuracy_score(y_test, y_pred) * 100

95.54831704668838

In [27]:
# Import the library used to save the model to disk
import joblib

In [29]:
# Save the trained model to "model.pkl" (path relative to the working directory)
joblib.dump(model, "model.pkl")

['model.pkl']