# Data Exploration for the Blog dataset

In [1]:
import numpy as np
import pandas as pd
import os

from torchvision.datasets.utils import download_url, download_and_extract_archive
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## `Blog` from `SSDKL`

In [2]:
def load_dataset_from_ssdkl(path_to_folder):
    """Read the ``X.npy`` and ``y.npy`` arrays stored in ``path_to_folder``.

    Args:
        path_to_folder: Directory that contains both ``X.npy`` and ``y.npy``.

    Returns:
        A tuple ``(X, y)`` with the two arrays exactly as ``np.load`` returns them.
    """
    # Load in a fixed order: X.npy first, then y.npy
    features, targets = (
        np.load(os.path.join(path_to_folder, file_name))
        for file_name in ("X.npy", "y.npy")
    )
    return features, targets

# Folder holding the SSDKL copy of the Blog dataset (X.npy and y.npy).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
PATH_TO_DATASET = "/home/flo/ssdgm/notebooks/datasets/SSDKL/blog"
X_ssdkl, y_ssdkl = load_dataset_from_ssdkl(PATH_TO_DATASET)
# Cell output: shapes of the feature array and the target array
X_ssdkl.shape, y_ssdkl.shape

((52397, 280), (52397,))

## `Blog` from `UCI`

In [3]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip"
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DOWNLOAD_FOLDER = "/home/flo/ssdgm/notebooks/datasets/UCI/Blog/raw"

# Only hit the network when the extracted training CSV is not on disk yet, so that
# re-running the notebook does not download the archive again.
if not os.path.exists(os.path.join(DOWNLOAD_FOLDER, "blogData_train.csv")):
    # remove_finished=True lets torchvision delete the zip after extraction,
    # which replaces the separate manual os.remove() call.
    download_and_extract_archive(url=URL, download_root=DOWNLOAD_FOLDER, remove_finished=True)


Downloading https://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip to /home/flo/ssdgm/notebooks/datasets/UCI/Blog/raw/BlogFeedback.zip


  0%|          | 0/2583605 [00:00<?, ?it/s]

Extracting /home/flo/ssdgm/notebooks/datasets/UCI/Blog/raw/BlogFeedback.zip to /home/flo/ssdgm/notebooks/datasets/UCI/Blog/raw


In [20]:
# header=None: every line of the CSV is read as data and the columns get integer labels
df_uci = pd.read_csv(os.path.join(DOWNLOAD_FOLDER, "blogData_train.csv"), header=None)
# Cell output: (rows, columns) of the UCI training file
df_uci.shape

(52397, 281)

In [21]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
df_uci.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,271,272,273,274,275,276,277,278,279,280
count,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,...,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0,52397.0
mean,39.444167,46.806717,0.358914,339.853102,24.681661,15.214611,27.959159,0.002748,258.66603,5.829151,...,0.171327,0.162242,0.154455,0.096151,0.088917,0.119167,0.0,1.242094,0.769505,6.764719
std,79.121821,62.359996,6.840717,441.430109,69.598976,32.251189,38.584013,0.131903,321.348052,23.768317,...,0.376798,0.368676,0.361388,0.2948,0.284627,1.438194,0.0,27.497979,20.338052,37.706565
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.285714,5.214318,0.0,29.0,0.0,0.891566,3.075076,0.0,22.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.63066,19.35312,0.0,162.0,4.0,4.150685,11.051215,0.0,121.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,40.30467,77.44283,0.0,478.0,15.0,15.998589,45.701206,0.0,387.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1122.6666,559.4326,726.0,2044.0,1314.0,442.66666,359.53006,14.0,1424.0,588.0,...,1.0,1.0,1.0,1.0,1.0,136.0,0.0,1778.0,1778.0,1424.0


In [22]:
# Flag every column whose entries all compare equal to zero, then report them
# in column order.
all_zero_mask = (df_uci == 0).all(axis=0)
for col in all_zero_mask.index[all_zero_mask.to_numpy()]:
    print(f"{col} is all zero")

12 is all zero
32 is all zero
37 is all zero
277 is all zero


In [23]:
# Drop every column that is zero in all rows. For blogData_train.csv these are the
# columns 12, 32, 37 and 277 reported by the check above.
# The labels are derived from the data and `df_uci` is rebound instead of being
# modified in place, so the cell can be re-run safely: a second run finds no
# all-zero columns and drops nothing, where the hard-coded in-place drop would
# raise a KeyError for the already-removed labels.
all_zero_columns = df_uci.columns[(df_uci == 0).all(axis=0).to_numpy()]
df_uci = df_uci.drop(columns=all_zero_columns)

In [24]:
# Show the frame as the cell output (pandas abbreviates large frames with "...")
df_uci

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,271,272,273,274,275,276,277,278,279,280
0,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33.0,33.0,33.0,11.00000,15.556349,0.0,33.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52393,33.00000,0.000000,33.0,33.0,33.0,11.00000,15.556349,0.0,33.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52394,0.00000,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52395,0.00000,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Conclusion

- Use only the UCI training set (`blogData_train.csv`) as the dataset, because the SSDKL authors did the same.
- Use the last column as the target variable.
- No cleaning steps are necessary.