Тетрадка для предобработки данных.

# Import

In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, LabelEncoder

import torch
from torch.utils.data import Dataset


load_dotenv()

True

# Config

In [2]:
# Data directories are taken from environment variables; an unset variable yields None.
INPUT_DATA_PATH = os.environ.get("INPUT_DATA_PATH")
OUTPUT_DATA_PATH = os.environ.get("OUTPUT_DATA_PATH")

# Classes and functions

Реализация класса для работы с данными ([документация](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)).

In [None]:
class CustomImageDataset(Dataset):
    """Tabular dataset read from a delimited text file and split into features and target.

    The file is loaded once in the constructor. Optional transformers are
    fitted on the whole table and applied eagerly, so ``__getitem__`` only
    performs positional indexing.

    Args:
        dataset_path: Path of the file handed to ``pandas.read_csv``.
        label_name: Name of the target column; every other column is a feature.
        transform: Optional object exposing ``fit_transform``; applied to the features.
        target_transform: Optional object exposing ``fit_transform``; applied to the target.
        sep: Field delimiter forwarded to ``pandas.read_csv``. Defaults to a
            comma; pass a tab character for TSV files.
    """

    # Annotations are quoted so they are not evaluated when the class is defined.
    def __init__(
        self,
        dataset_path: str,
        label_name: str,
        transform: "Pipeline | None" = None,
        target_transform: "Pipeline | None" = None,
        sep: str = ",",
    ):
        dataset = pd.read_csv(dataset_path, sep=sep)
        self.X = dataset.drop(columns=label_name)
        self.y = dataset[label_name]
        self.transform = transform
        self.target_transform = target_transform

        # Explicit None checks so the decision never depends on the
        # truthiness of the transformer object itself.
        if self.transform is not None:
            self.X = self.transform.fit_transform(self.X)

        if self.target_transform is not None:
            self.y = self.target_transform.fit_transform(self.y)

    def __len__(self):
        """Return the number of samples (length of the target container)."""
        return len(self.y)

    @staticmethod
    def _take(container, idx):
        """Return the element at position ``idx`` from a pandas object or an array.

        ``fit_transform`` may hand back a NumPy array instead of a pandas
        object, and arrays have no ``.iloc``; plain indexing is used for them.
        """
        if hasattr(container, "iloc"):
            return container.iloc[idx]
        return container[idx]

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        # NOTE(review): items are pandas/NumPy objects, not torch tensors —
        # confirm whether downstream DataLoader collation needs a conversion.
        return self._take(self.X, idx), self._take(self.y, idx)

# Data overview

In [10]:
# Every input table lives in the same sub-folder of INPUT_DATA_PATH; only the file stem differs.
user_ds_path, history_ds_path, validate_ds_path, validate_answers_ds_path = (
    os.path.join(INPUT_DATA_PATH, f"Предсказание количества просмотров рекламы/{stem}.tsv")
    for stem in ("users", "history", "validate", "validate_answers")
)

In [18]:
# Load the tab-separated tables; pd.read_table uses a tab delimiter by default.
user_ds = pd.read_table(user_ds_path)
history_ds = pd.read_table(history_ds_path)
validate_ds = pd.read_table(validate_ds_path)

# Ground-truth answers for the validation set.
validate_answers_ds = pd.read_table(validate_answers_ds_path)

## Missing values

In [35]:
# Print column dtypes and non-null counts of the users table.
user_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27769 entries, 0 to 27768
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user_id  27769 non-null  int64
 1   sex      27769 non-null  int64
 2   age      27769 non-null  int64
 3   city_id  27769 non-null  int64
dtypes: int64(4)
memory usage: 867.9 KB


In [36]:
# Print column dtypes and non-null counts of the history table.
history_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1147857 entries, 0 to 1147856
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   hour       1147857 non-null  int64  
 1   cpm        1147857 non-null  float64
 2   publisher  1147857 non-null  int64  
 3   user_id    1147857 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 35.0 MB


In [37]:
# Print column dtypes and non-null counts of the validation table.
validate_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cpm            1008 non-null   float64
 1   hour_start     1008 non-null   int64  
 2   hour_end       1008 non-null   int64  
 3   publishers     1008 non-null   object 
 4   audience_size  1008 non-null   int64  
 5   user_ids       1008 non-null   object 
dtypes: float64(1), int64(3), object(2)
memory usage: 47.4+ KB


In [38]:
# Print column dtypes and non-null counts of the validation answers table.
validate_answers_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   at_least_one    1008 non-null   float64
 1   at_least_two    1008 non-null   float64
 2   at_least_three  1008 non-null   float64
dtypes: float64(3)
memory usage: 23.8 KB


___
В данных нет пропущенных значений (во всех столбцах число Non-Null совпадает с числом строк), но нужно исследовать значения признаков.

## Value analysis

In [39]:
# Preview the first rows of the users table.
user_ds.head()

Unnamed: 0,user_id,sex,age,city_id
0,0,2,19,0
1,1,1,0,1
2,2,2,24,2
3,3,1,20,3
4,4,2,29,4


In [None]:
# Summary statistics of the user features, to check their value ranges
# (the preview of the first rows already shows an age of 0).
user_ds.describe()

# Preprocess

In [None]:
# Define the data path and target
# NOTE(review): the file name and label name look like placeholders — replace with real values.
dataset_path = os.path.join(INPUT_DATA_PATH, "name_of_file.csv")
label_name = "label_name"
# Define the transformers
feature_transform = Pipeline([
    ("normalizer", Normalizer()),
    ("scaler", StandardScaler())
])
# LabelEncoder is passed directly instead of being wrapped in a Pipeline:
# Pipeline.fit_transform calls its final step as fit_transform(X, y), while
# LabelEncoder.fit_transform accepts only the target, so the wrapped version
# raises TypeError. The dataset class only needs an object with fit_transform.
target_transform = LabelEncoder()
# Create the dataset
train_dataset = CustomImageDataset(
    dataset_path=dataset_path,
    label_name=label_name,
    transform=feature_transform,
    target_transform=target_transform
)