In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset from disk and show its column summary.
ruta_del_archivo = '../data/online_news_modified.csv'
path = ruta_del_archivo  # both names refer to the same relative path
news_df = pd.read_csv(path)
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40436 entries, 0 to 40435
Data columns (total 62 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   url                            40027 non-null  object
 1   timedelta                      39994 non-null  object
 2   n_tokens_title                 39976 non-null  object
 3   n_tokens_content               39963 non-null  object
 4   n_unique_tokens                39985 non-null  object
 5   n_non_stop_words               39977 non-null  object
 6   n_non_stop_unique_tokens       39956 non-null  object
 7   num_hrefs                      39986 non-null  object
 8   num_self_hrefs                 39988 non-null  object
 9   num_imgs                       39980 non-null  object
 10  num_videos                     39987 non-null  object
 11  average_token_length           40031 non-null  object
 12  num_keywords                   39981 non-null  object
 13  d

In [3]:
# Preview the first rows to eyeball the raw values (note placeholder tokens such as 'error' or 'bad').
news_df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,mixed_type_col
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594466988,0.999999992308,0.815384609112,4.0,2.0,1.0,...,0.7,-0.35,-0.6,-3.4000000000000004,0.5,-0.1875,0.0,0.1875,593.0,bad
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743080614,0.999999993289,0.79194630341,3.0,1.0,1.0,...,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711.0,639
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.575129530699,0.999999991597,0.66386554064,3.0,1.0,1.0,...,1.0,-0.466666666667,-0.8,-0.133333333333,0.0,0.0,0.5,0.0,1500.0,unknown
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503787877834,0.999999996904,0.665634672862,9.0,0.0,1.0,...,0.8,-0.369696969697,-0.6,-0.166666666667,0.0,0.0,0.5,,1200.0,688
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.41564561695,0.999999998565,0.540889525766,19.0,19.0,20.0,...,1.0,-0.220192307692,error,-0.05,0.454545454545,0.136363636364,0.0454545454545,0.136363636364,505.0,579


In [4]:
# Collect the non-numeric strings that appear inside the numeric columns so they can be replaced with NaN


def es_string_real(valor):
    """
    Tell whether a value is non-numeric, i.e. float() cannot parse it.

    Returns False when float(valor) succeeds (real numbers and numeric
    strings such as "3.5" or " 7 "). Returns True when the conversion
    raises ValueError (non-numeric text like "error") or TypeError
    (objects float() does not accept, such as None or a list).
    """
    try:
        float(valor)
    except (ValueError, TypeError):
        # Conversion rejected: treat the value as a genuine (non-numeric) string.
        return True
    else:
        # Conversion worked, so the value is a number or a numeric string.
        return False
    

# a. Flatten every column except the first one (url) into a single Series of cell values.
all_values = news_df.iloc[:, 1:].stack()

# b. Keep only the values that float() cannot parse, i.e. the non-numeric placeholder strings.
string_series_reales = all_values[all_values.apply(es_string_real)]

# c. De-duplicate and convert to a list to see which placeholder words occur in the data.
list_de_strings = list(set(string_series_reales))

print(list_de_strings)

['?', 'bad', ' INVALID ', 'invalid', 'unknown', ' ? ', 'error', ' n/a ', ' null ', ' ERROR ']


In [5]:
# Every word in list_de_strings stands for a missing value, so replace them all with NaN.
# NOTE: replace() is applied to the whole frame, the url column included.

news_df = news_df.replace(list_de_strings, np.nan)

# Columns to convert to numeric: everything except the first column (url).
cols_to_convert = news_df.columns[1:]

# errors='coerce' turns any value that still cannot be parsed as a number into NaN.
news_df[cols_to_convert] = news_df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40436 entries, 0 to 40435
Data columns (total 62 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            40027 non-null  object 
 1   timedelta                      39839 non-null  float64
 2   n_tokens_title                 39822 non-null  float64
 3   n_tokens_content               39805 non-null  float64
 4   n_unique_tokens                39834 non-null  float64
 5   n_non_stop_words               39855 non-null  float64
 6   n_non_stop_unique_tokens       39799 non-null  float64
 7   num_hrefs                      39839 non-null  float64
 8   num_self_hrefs                 39828 non-null  float64
 9   num_imgs                       39840 non-null  float64
 10  num_videos                     39849 non-null  float64
 11  average_token_length           39902 non-null  float64
 12  num_keywords                   39831 non-null 

In [6]:
# Rows whose URL is null or empty are dropped: a record that cannot be tied
# to a web page is not identifiable.

# Drop null / empty URLs. The explicit .copy() makes the filtered result an
# independent frame, so the column assignment below does not write into a
# slice of the previous frame (which triggers SettingWithCopyWarning).
news_df = news_df[news_df['url'].notna() & (news_df['url'] != '')].copy()

# Normalise the URL: cast to str, lower-case it, then trim the whitespace
# before and after it.
news_df['url'] = news_df['url'].astype(str).str.lower().str.strip()

# Keep only the rows whose URL starts with 'http'; copy again so later cells
# can assign columns on news_df safely.
news_df = news_df[news_df['url'].str.startswith('http', na=False)].copy()

news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40010 entries, 0 to 40435
Data columns (total 62 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            40010 non-null  object 
 1   timedelta                      39418 non-null  float64
 2   n_tokens_title                 39403 non-null  float64
 3   n_tokens_content               39386 non-null  float64
 4   n_unique_tokens                39410 non-null  float64
 5   n_non_stop_words               39436 non-null  float64
 6   n_non_stop_unique_tokens       39381 non-null  float64
 7   num_hrefs                      39420 non-null  float64
 8   num_self_hrefs                 39407 non-null  float64
 9   num_imgs                       39418 non-null  float64
 10  num_videos                     39429 non-null  float64
 11  average_token_length           39484 non-null  float64
 12  num_keywords                   39417 non-null  floa

In [7]:
# Percentage of missing values in each column.

conteo_nulos = news_df.isna().sum()          # missing cells per column
total_filas = len(news_df)                   # number of rows in the frame
porcentaje_nulos = conteo_nulos.div(total_filas).mul(100)
print(porcentaje_nulos)


url                              0.000000
timedelta                        1.479630
n_tokens_title                   1.517121
n_tokens_content                 1.559610
n_unique_tokens                  1.499625
                                  ...    
title_sentiment_polarity         1.544614
abs_title_subjectivity           1.534616
abs_title_sentiment_polarity     1.509623
shares                           1.559610
mixed_type_col                  29.837541
Length: 62, dtype: float64
