## Обязательно запустить код в ячейке ниже. Это генератор датасета.

In [555]:
import pandas as pd
import numpy as np
import random
from faker import Faker
import warnings
import re
warnings.filterwarnings('ignore')

random.seed(42)
np.random.seed(42)

Faker.seed(42)
fake = Faker()

def generate_random_data():
    passenger_id = random.randint(1, 1000)
    survived = random.choice([0, 1])
    pclass = random.choice([1,2,3,'I','II','III'])
    name = fake.name()
    sex = random.choice(['male', 'female', 'Female', 'Male', 'FEMALE', 'MALE'])
    age = random.uniform(-1, 80)
    sibsp = random.randint(0, 5)
    parch = random.randint(0, 5)
    ticket = fake.bban()
    fare = round(random.uniform(0, 500), 2)
    cabin = fake.building_number() if random.random() > 0.8 else np.nan
    embarked = random.choice(['S', 'C', 'Q'])

    return {
        'PassengerId': passenger_id,
        'Survived': survived,
        'Pclass': pclass,
        'Name': name,
        'Sex': sex,
        'Age': age,
        'SibSp': sibsp,
        'Parch': parch,
        'Ticket': ticket,
        'Fare': fare,
        'Cabin': cabin,
        'Embarked': embarked
    }

def add_missing_values(df, columns, missing_count):
    for col in columns:
        indices = df.sample(missing_count).index
        df.loc[indices, col] = np.nan
    return df

def add_noise_to_age_column(df, noise_percentage):
    rows_with_noise = df.sample(frac=noise_percentage)
    for index, row in rows_with_noise.iterrows():
        age_value = row['Age']
        suffixes = ["y.o.", "years", "y"]
        suffix = random.choice(suffixes)
        df.at[index, 'Age'] = f"{age_value:.0f}{suffix}"
    return df

def add_duplicates(df, full_dupes=5, partial_dupes=5):
    full_duplicates = df.sample(full_dupes, replace=True)
    partial_duplicates = df.sample(partial_dupes, replace=True).copy()
    partial_duplicates['PassengerId'] = [random.randint(1001, 2000) for _ in range(partial_dupes)]
    df = pd.concat([df, full_duplicates, partial_duplicates], ignore_index=True)
    return df

data = [generate_random_data() for _ in range(100)]
df = pd.DataFrame(data)
df = add_noise_to_age_column(df, 0.2)
missing_age_count = int(len(df) * 0.15)
age_indices = df.sample(missing_age_count).index
df.loc[age_indices, 'Age'] = np.nan
columns_to_modify = ['Fare', 'Embarked']
df = add_missing_values(df, columns_to_modify, 8)
df = add_duplicates(df, full_dupes=2, partial_dupes=2)


## Почистить датасет во всех колонках от "малых" проблем.

    1) отсутсвующие значения
    2) неподходящие типы данных у колонок
    3) дубликаты (полные и частичные)
    4) проблемы с категориальными переменными
    5) проблемы значений (грязь в данных)

In [556]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,655,0,1,Allison Hill,MALE,21y,1,1,HEXD81960013389083,368.24,,Q
1,90,1,1,Meredith Barnes,male,6.589314,1,4,OSIZ02654235116155,301.01,,Q
2,666,1,2,Renee Blair,Male,46.73052,0,1,RDMC84959310341316,349.07,,S
3,221,1,1,Mark Diaz,male,29.774111,2,2,LLGV41928327648350,301.86,5641.0,Q
4,471,0,I,Tommy Walter,male,44y,5,4,KGUP67242388496965,442.73,,


In [557]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  104 non-null    int64  
 1   Survived     104 non-null    int64  
 2   Pclass       104 non-null    object 
 3   Name         104 non-null    object 
 4   Sex          104 non-null    object 
 5   Age          88 non-null     object 
 6   SibSp        104 non-null    int64  
 7   Parch        104 non-null    int64  
 8   Ticket       104 non-null    object 
 9   Fare         96 non-null     float64
 10  Cabin        21 non-null     object 
 11  Embarked     95 non-null     object 
dtypes: float64(1), int64(4), object(7)
memory usage: 9.9+ KB


In [558]:
df.describe()

Unnamed: 0,PassengerId,Survived,SibSp,Parch,Fare
count,104.0,104.0,104.0,104.0,96.0
mean,490.903846,0.490385,2.557692,2.413462,266.845937
std,327.465511,0.502328,1.659494,1.669897,141.706277
min,3.0,0.0,0.0,0.0,0.85
25%,220.5,0.0,1.0,1.0,132.74
50%,472.0,0.0,3.0,2.0,291.805
75%,710.25,1.0,4.0,4.0,398.0425
max,1858.0,1.0,5.0,5.0,498.05


In [559]:
df.drop_duplicates(keep = 'first')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,655,0,1,Allison Hill,MALE,21y,1,1,HEXD81960013389083,368.24,,Q
1,90,1,1,Meredith Barnes,male,6.589314,1,4,OSIZ02654235116155,301.01,,Q
2,666,1,2,Renee Blair,Male,46.73052,0,1,RDMC84959310341316,349.07,,S
3,221,1,1,Mark Diaz,male,29.774111,2,2,LLGV41928327648350,301.86,05641,Q
4,471,0,I,Tommy Walter,male,44y,5,4,KGUP67242388496965,442.73,,
...,...,...,...,...,...,...,...,...,...,...,...,...
97,209,1,II,Amber Taylor,female,74.141494,3,5,KZPQ72982595269495,242.36,,Q
98,624,0,III,Katrina Burns,female,24.170761,0,4,SJPA55169409749930,204.00,,
99,473,0,III,Carolyn Fuller,female,,5,2,TYME30913075626368,254.44,,C
102,1306,1,1,Richard Camacho,female,,3,4,QJOU00766177115921,473.03,,Q


In [560]:
df = df.drop_duplicates(subset=["PassengerId"], keep = 'first')

In [561]:
df["Age"] = df["Age"].astype("str")
df["Age"] = df["Age"].apply(lambda x: re.sub(r'[a-zA-Z]+','',x))
df["Age"] = df["Age"].str.replace('..', '.')
df["Age"] = pd.to_numeric(df["Age"])
df["Age"] = round(df["Age"],2)
df.loc[df['Age'] < 0, 'Age'] = df['Age'].median()




In [562]:

df.loc[df['Pclass'] == 'I', 'Pclass'] = '1'
df.loc[df['Pclass'] == 'II', 'Pclass'] = '2'
df.loc[df['Pclass'] == 'III', 'Pclass'] = '3'
df["Pclass"] = df["Pclass"].astype(int)



In [563]:
sum_is_na = df.isna().sum()
sum_is_na_pc = sum_is_na[sum_is_na>0]/len(df)
sum_is_na_pc


Age         0.161616
Fare        0.070707
Cabin       0.787879
Embarked    0.090909
dtype: float64

In [564]:
df = df.drop(columns='Cabin')
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])


In [565]:
df["Sex"]= df["Sex"].str.lower()

In [566]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,655,0,1,Allison Hill,male,21.0,1,1,HEXD81960013389083,368.24,Q
1,90,1,1,Meredith Barnes,male,6.59,1,4,OSIZ02654235116155,301.01,Q
2,666,1,2,Renee Blair,male,46.73,0,1,RDMC84959310341316,349.07,S
3,221,1,1,Mark Diaz,male,29.77,2,2,LLGV41928327648350,301.86,Q
4,471,0,1,Tommy Walter,male,44.0,5,4,KGUP67242388496965,442.73,Q
