## Лаб 2
### Обработка пропусков в данных, кодирование категориальных признаков, масштабирование данных
### Кабанец РТ5-51Б

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.calibration import LabelEncoder

In [3]:
# Load the Coursera course catalogue from the local data directory.
# The comma is read_csv's default delimiter, so it is not spelled out.
data = pd.read_csv('./data/coursera_courses.csv')
data.head()

Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_students_enrolled,course_skills,course_summary,course_description
0,(ISC)² Systems Security Certified Practitioner...,ISC2,Specialization,3 - 6 Months,4.7,492.0,Beginner,https://www.coursera.org/specializations/sscp-...,6958.0,"['Risk Management', 'Access Control', 'Asset',...",[],Pursue better IT security job opportunities an...
1,.NET FullStack Developer,Board Infinity,Specialization,1 - 3 Months,4.3,51.0,Intermediate,https://www.coursera.org/specializations/dot-n...,2531.0,"['Web API', 'Web Development', 'Cascading Styl...",['Master .NET full stack web dev: from .NET co...,Develop the proficiency required to design and...
2,21st Century Energy Transition: how do we make...,University of Alberta,Course,1 - 3 Months,4.8,62.0,Beginner,https://www.coursera.org/learn/21st-century-en...,4377.0,[],['Understand the complexity of systems supplyi...,"Affordable, abundant and reliable energy is fu..."
3,A Crash Course in Causality: Inferring Causal...,University of Pennsylvania,Course,1 - 3 Months,4.7,517.0,Intermediate,https://www.coursera.org/learn/crash-course-in...,39004.0,"['Instrumental Variable', 'Propensity Score Ma...",[],We have all heard the phrase “correlation does...
4,A life with ADHD,University of Geneva,Course,1 - 3 Months,,,Beginner,https://www.coursera.org/learn/life-with-adhd,,"['differential diagnosis and comorbidities', '...",[' Understand what ADHD is and the challenges ...,What is ADHD and what are the challenges that ...


In [6]:
# Count missing values per column and keep only the columns that have any.
missing_values_per_column = data.isnull().sum()
columns_with_missing_values = missing_values_per_column[missing_values_per_column > 0]

# The messages are stage-neutral on purpose: this cell precedes the imputation
# cell in document order, so on a top-to-bottom run it describes the data as
# loaded, not the data after filling.
if columns_with_missing_values.empty:
    print("В данных нет пропусков.")
else:
    print("Столбцы с пропусками:")
    # Reuse the already filtered series instead of filtering a second time.
    for col, missing_count in columns_with_missing_values.items():
        print(f"{col}: {missing_count}")

В данных после заполнения нет пропусков.


### Заполнение пропусков

In [5]:
# Split the columns by dtype: numeric columns are imputed with the median,
# all remaining columns with their most frequent value.
# NOTE(review): the split depends on the dtypes `data` has when this cell runs;
# re-running it after the label-encoding cell would classify the encoded
# columns as numeric. Run the notebook top to bottom.
numeric_features = data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = data.select_dtypes(exclude=[np.number]).columns.tolist()

# One imputer per feature group.
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Fill the gaps. An empty feature group is skipped because SimpleImputer
# refuses to fit on a frame with zero columns.
if numeric_features:
    data[numeric_features] = imputer_num.fit_transform(data[numeric_features])
if categorical_features:
    data[categorical_features] = imputer_cat.fit_transform(data[categorical_features])

# Verify the result in the same cell as the imputation, so the check always
# runs after the filling regardless of how other cells are ordered.
remaining_missing = data.isnull().sum()
assert not remaining_missing.any(), (
    "После заполнения остались пропуски:\n"
    f"{remaining_missing[remaining_missing > 0]}"
)

### Кодирование категориальных признаков

In [7]:
# Show which columns were classified as categorical, followed by one of them
# (course difficulty) as a sample of the raw values.
print("категориальные признаки:\n", categorical_features)
print(data.loc[:, "course_difficulty"])

категориальные признаки:
 ['course_title', 'course_organization', 'course_certificate_type', 'course_time', 'course_reviews_num', 'course_difficulty', 'course_url', 'course_students_enrolled', 'course_skills', 'course_summary', 'course_description']
0          Beginner
1      Intermediate
2          Beginner
3      Intermediate
4          Beginner
           ...     
995    Intermediate
996        Beginner
997        Beginner
998        Beginner
999    Intermediate
Name: course_difficulty, Length: 1000, dtype: object


In [8]:
# Encode every categorical column as integer labels.
# One encoder per column is created up front and kept in `label_encoders`,
# keyed by column name, so the fitted encoders remain available afterwards.
label_encoders = {column: LabelEncoder() for column in categorical_features}

# Fit each encoder on its own column and overwrite the column with the codes.
for column, le in label_encoders.items():
    data[column] = le.fit_transform(data[column])

print("Данные после кодирования LabelEncoder:\n", data.head())

Данные после кодирования LabelEncoder:
    course_title  course_organization  course_certificate_type  course_time  \
0             0                   51                        3            2   
1             1                   11                        3            0   
2             2                  120                        0            0   
3             3                  142                        0            0   
4             4                  132                        0            0   

   course_rating  course_reviews_num  course_difficulty  course_url  \
0            4.7                 320                  1         960   
1            4.3                 339                  2         727   
2            4.8                 382                  1           1   
3            4.7                 343                  2          92   
4            4.7                   3                  1         285   

   course_students_enrolled  course_skills  course_summary  cour

### Масштабирование данных

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns: fit the scaler on them, then write the
# transformed values back into the same columns.
scaler = StandardScaler()
data[numeric_features] = scaler.fit(data[numeric_features]).transform(data[numeric_features])

print("Данные после масштабирования:")
data.head()

Данные после масштабирования:


Unnamed: 0,course_title,course_organization,course_certificate_type,course_time,course_rating,course_reviews_num,course_difficulty,course_url,course_students_enrolled,course_skills,course_summary,course_description
0,0,51,3,2,0.111461,320,1,960,783,607,587,473
1,1,11,3,0,-1.962227,339,2,727,297,749,392,133
2,2,120,0,0,0.629883,382,1,1,584,806,497,29
3,3,142,0,0,0.111461,343,2,92,565,399,587,919
4,4,132,0,0,0.111461,3,1,285,459,768,12,954
