### Задание - решение задачи регрессии

#### 1. Загрузка и исследование набора данных
###### Загрузите набор данных и изучите его: объем выборки, количество столбцов, характеристики столбцов (признаков), имеются ли пропуски, имеются ли выбросы

In [1]:
import pandas as pd
import re

# Load the laptop price table; the CSV is read with the windows-1251 encoding.
df = pd.read_csv('laptop_price.csv', encoding='windows-1251')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() would emit an extra "None" line.
df.info()
# Summary statistics of the numeric columns.
print(df.describe())
# Number of missing values per column.
print(df.isnull().sum())

# Text (object dtype) columns whose raw values must be inspected before
# deciding how to convert each of them to numbers.
object_columns = ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight']
for column in object_columns:
    # Only the first five distinct values are shown to keep the output short.
    print(f"Уникальные значения в столбце {column}: {df[column].unique()[:5]}...")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB
None
         laptop_ID       Inches  Price_euros
count  1303.000000  1303.000000  1303.000000
mean    660.155794    15.017

#### 2. Предобработка данных
###### Подумайте, как преобразовать столбцы, чтобы привести все к числовому виду: где стоит заменить категории метками, а где преобразовать столбцы. Для этого у всех столбцов типа object посмотрите уникальные значения.

###### Столбцы типа данных object ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight']
###### Рекомендации:
###### - столбец 'Product' можно удалить, в нем 618 уникальных значений
###### - для столбцов 'Ram' и 'Weight' оставить только числа — объем памяти и вес, причем в каждом столбце числа должны соответствовать одинаковым единицам: GB и kg
###### - столбец Memory стоит преобразовать в несколько столбцов: Memory_SSD, Memory_HDD, Memory_Flash_Storage, Memory_Hybrid; все объемы памяти перевести в GB и записать в соответствующие столбцы
###### - решите что делать с остальными столбцами, обосновывая свои решения

In [2]:
# Show every distinct storage description so the parsing rules can be designed.
pd.unique(df['Memory'])

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '32GB Flash Storage', '128GB SSD +  1TB HDD',
       '256GB SSD +  256GB SSD', '64GB Flash Storage',
       '256GB SSD +  1TB HDD', '256GB SSD +  2TB HDD', '32GB SSD',
       '2TB HDD', '64GB SSD', '1.0TB Hybrid', '512GB SSD +  1TB HDD',
       '1TB SSD', '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '180GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '512GB Flash Storage',
       '128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid', '1.0TB HDD',
       '512GB SSD +  1.0TB Hybrid', '256GB SSD +  1.0TB Hybrid'],
      dtype=object)

In [3]:
# 'Product' is discarded rather than encoded.
df.drop(columns=['Product'], inplace=True)

# Strip the unit suffix and keep the bare number: Ram as int, Weight as float.
ram_without_unit = df['Ram'].str.replace('GB', '', regex=False)
df['Ram'] = ram_without_unit.astype(int)
weight_without_unit = df['Weight'].str.replace('kg', '', regex=False)
df['Weight'] = weight_without_unit.astype(float)

def process_memory(memory):
    """Split a storage description into per-type capacities in GB.

    The description is a '+'-separated list of drives such as
    ``'256GB SSD +  1TB HDD'``. Each part contributes its size to the total
    of its storage type; a part containing ``TB`` is multiplied by 1024.

    Parameters
    ----------
    memory : str
        Raw value of the ``Memory`` column. A non-string value (for example
        a missing value) is treated as "no storage described".

    Returns
    -------
    pandas.Series
        Four integers in the order SSD, HDD, Flash Storage, Hybrid.
    """
    totals = {'SSD': 0, 'HDD': 0, 'Flash Storage': 0, 'Hybrid': 0}

    # Missing values cannot be split; report zero capacity instead of raising.
    if not isinstance(memory, str):
        return pd.Series(list(totals.values()))

    for part in memory.split('+'):
        part = part.strip()
        # Accept an optional fractional part so that a value such as "1.5TB"
        # is not truncated to "1" before the unit conversion.
        size_match = re.search(r'(\d+(?:\.\d+)?)', part)
        type_match = re.search(r'(SSD|HDD|Flash Storage|Hybrid)', part)

        # Parts without both a size and a known storage type are ignored.
        if not (size_match and type_match):
            continue

        size = float(size_match.group(1))
        if 'TB' in part:
            size *= 1024

        # Keep whole-GB integers so the resulting columns stay integer typed.
        totals[type_match.group(1)] += int(round(size))

    return pd.Series(
        [totals['SSD'], totals['HDD'], totals['Flash Storage'], totals['Hybrid']]
    )

# Expand every storage description into four per-type capacity columns,
# then remove the raw text column that is no longer needed.
memory_capacities = df['Memory'].apply(process_memory)
df[['Memory_SSD', 'Memory_HDD', 'Memory_Flash_Storage', 'Memory_Hybrid']] = memory_capacities
df.drop(columns=['Memory'], inplace=True)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Text columns that are replaced by ordinal category codes.
categorical_columns = ['Company', 'TypeName', 'ScreenResolution', 'Cpu', 'Gpu', 'OpSys']

# Per-column encoded arrays and fitted encoders, kept for later inspection.
train_splits = {}
test_splits = {}
encoders = {}

# The split depends only on the number of rows and the seed, so it is computed
# once instead of once per column. test_size and random_state match the split
# used for model training later in the notebook, so the encoders are fitted on
# training rows only.
train_index, test_index = train_test_split(
    df.index.to_numpy(), test_size=0.2, random_state=42
)

for column in categorical_columns:
    X_train = df.loc[train_index, [column]]
    X_test = df.loc[test_index, [column]]

    # Categories that occur only in the test rows are encoded as -1.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train_encoded = encoder.fit_transform(X_train)
    X_test_encoded = encoder.transform(X_test)

    train_splits[column] = X_train_encoded
    test_splits[column] = X_test_encoded
    encoders[column] = encoder

    # Build the encoded column separately and replace the text column as a
    # whole, so the result has a float dtype instead of staying `object`.
    encoded_column = pd.Series(index=df.index, dtype=float)
    encoded_column.loc[train_index] = X_train_encoded.ravel()
    encoded_column.loc[test_index] = X_test_encoded.ravel()
    df[column] = encoded_column
    

# Rows describing more than one kind of storage. Counting the non-zero
# capacity columns also catches combinations without an SSD (for example
# Flash Storage + HDD), which an "SSD and something else" filter would miss.
memory_type_columns = ['Memory_SSD', 'Memory_HDD', 'Memory_Flash_Storage', 'Memory_Hybrid']
memory_type_count = (df[memory_type_columns] > 0).sum(axis=1)
df_multiple_memory_types = df[memory_type_count > 1]

print(df_multiple_memory_types)

      laptop_ID Company TypeName  Inches ScreenResolution   Cpu  Ram   Gpu  \
21           22    10.0      1.0    15.6             15.0  65.0    8  67.0   
37           38     4.0      3.0    17.3             15.0  71.0    8   6.0   
41           42     4.0      1.0    15.6             15.0  90.0   16  71.0   
47           48     2.0      1.0    17.3              8.0  21.0    8  32.0   
58           59    11.0      1.0    17.3              8.0  90.0   16  72.0   
...         ...     ...      ...     ...              ...   ...  ...   ...   
1238       1256    11.0      1.0    15.6              8.0  80.0    8  81.0   
1245       1263     2.0      1.0    15.6             15.0  90.0   16  68.0   
1247       1265     2.0      1.0    15.6             15.0  80.0   16  72.0   
1256       1274     2.0      1.0    17.3             15.0  80.0   16  83.0   
1259       1277    11.0      1.0    15.6              8.0  80.0    8  81.0   

     OpSys  Weight  Price_euros  Memory_SSD  Memory_HDD  Memory

##### 3. Столбец laptop_ID не несет смысловой информации — удалите его


##### 4. Сделайте три копии датасета: df, df_mm, df_ss. Следующие действия проделайте с каждой из копий:

###### Отделите целевой столбец от нецелевых

###### Разделите данные на обучающую и тестовую выборки

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Independent copies of the prepared frame.
df_mm, df_ss = df.copy(), df.copy()

# Target is the price; the identifier column is excluded from the features.
y = df['Price_euros']
X = df.drop(columns=['Price_euros', 'laptop_ID'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Each scaler is fitted on the training part only and then applied to both parts.
scaler_mm = MinMaxScaler().fit(X_train)
X_train_mm = scaler_mm.transform(X_train)
X_test_mm = scaler_mm.transform(X_test)

scaler_ss = StandardScaler().fit(X_train)
X_train_ss = scaler_ss.transform(X_train)
X_test_ss = scaler_ss.transform(X_test)


In [5]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on both splits.

    Returns a dict with the keys 'Train MSE', 'Test MSE', 'Train R2' and
    'Test R2', computed from the model's predictions on the training and
    test features respectively.
    """

    def _score(features, target):
        # Mean squared error and R2 of the fitted model on one split.
        predicted = model.predict(features)
        return mean_squared_error(target, predicted), r2_score(target, predicted)

    model.fit(X_train, y_train)
    train_mse, train_r2 = _score(X_train, y_train)
    test_mse, test_r2 = _score(X_test, y_test)

    return {
        'Train MSE': train_mse,
        'Test MSE': test_mse,
        'Train R2': train_r2,
        'Test R2': test_r2,
    }

# Linear models compared in this notebook.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(alpha=0.1),
    "Ridge Regression": Ridge(alpha=0.1),
}

# (label, training features, test features) for every version of the data.
feature_sets = (
    ('Original', X_train, X_test),
    ('MinMax Scaled', X_train_mm, X_test_mm),
    ('Standard Scaled', X_train_ss, X_test_ss),
)

# results[model name][data version] -> metrics dict from evaluate_model.
results = {}
for model_name, model in models.items():
    results[model_name] = {
        label: evaluate_model(model, train_features, y_train, test_features, y_test)
        for label, train_features, test_features in feature_sets
    }

##### 5. Полиномиальная регрессия

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Ridge regression on polynomial expansions of the unscaled features.
for degree in (2, 3):
    # The expansion is fitted on the training features and reused for the test features.
    poly = PolynomialFeatures(degree=degree).fit(X_train)
    X_train_poly, X_test_poly = poly.transform(X_train), poly.transform(X_test)

    poly_model = Ridge(alpha=3)
    poly_metrics = evaluate_model(poly_model, X_train_poly, y_train, X_test_poly, y_test)

    # Stored under the same layout as the linear models: one 'Original' data version.
    results[f'Polynomial Regression (Degree {degree})'] = {'Original': poly_metrics}


##### 6. Вывод результатов

In [7]:
# One line of metrics per (model, data version) pair stored in `results`.
for model_name in results:
    print(f'\n{model_name} Results:')
    model_results = results[model_name]
    for data_version in model_results:
        metrics = model_results[data_version]
        print(f'  {data_version} -> Train MSE: {metrics["Train MSE"]:.4f}, Test MSE: {metrics["Test MSE"]:.4f}, Train R2: {metrics["Train R2"]:.4f}, Test R2: {metrics["Test R2"]:.4f}')



Linear Regression Results:
  Original -> Train MSE: 138181.3576, Test MSE: 162782.3442, Train R2: 0.7140, Test R2: 0.6795
  MinMax Scaled -> Train MSE: 138181.3576, Test MSE: 162782.3442, Train R2: 0.7140, Test R2: 0.6795
  Standard Scaled -> Train MSE: 138181.3576, Test MSE: 162782.3442, Train R2: 0.7140, Test R2: 0.6795

Lasso Regression Results:
  Original -> Train MSE: 138181.5715, Test MSE: 162795.0976, Train R2: 0.7140, Test R2: 0.6795
  MinMax Scaled -> Train MSE: 138191.2472, Test MSE: 162946.6446, Train R2: 0.7140, Test R2: 0.6792
  Standard Scaled -> Train MSE: 138181.5767, Test MSE: 162795.8366, Train R2: 0.7140, Test R2: 0.6795

Ridge Regression Results:
  Original -> Train MSE: 138181.3621, Test MSE: 162783.1618, Train R2: 0.7140, Test R2: 0.6795
  MinMax Scaled -> Train MSE: 138209.9000, Test MSE: 163164.4874, Train R2: 0.7140, Test R2: 0.6788
  Standard Scaled -> Train MSE: 138181.3600, Test MSE: 162786.8604, Train R2: 0.7140, Test R2: 0.6795

Polynomial Regression (Deg

In [8]:
def display_results(results_by_model=None):
    """Print one aligned metrics table per model.

    Parameters
    ----------
    results_by_model : dict, optional
        Mapping ``model name -> {data version -> metrics}``, where each
        metrics dict has the keys 'Train MSE', 'Test MSE', 'Train R2' and
        'Test R2'. When omitted, the module-level ``results`` dict is used.

    Every model is rendered with the same header and row format, so all
    tables share the same column positions.
    """
    if results_by_model is None:
        results_by_model = results

    header = f"{'Dataset':<20} {'Train MSE':<12} {'Test MSE':<12} {'Train R2':<10} {'Test R2':<10}"

    # Models are printed in the insertion order of the mapping.
    for model_name, model_results in results_by_model.items():
        print(f'\n{model_name}')
        print(header)
        for data_version, metrics in model_results.items():
            print(f"{data_version:<20} {metrics['Train MSE']:<12.4f} {metrics['Test MSE']:<12.4f} {metrics['Train R2']:<10.4f} {metrics['Test R2']:<10.4f}")

# Print the formatted metric tables built from the `results` dict.
display_results()



Linear Regression
Dataset              Train MSE    Test MSE     Train R2   Test R2   
Original             138181.3576  162782.3442  0.7140     0.6795    
MinMax Scaled        138181.3576  162782.3442  0.7140     0.6795    
Standard Scaled      138181.3576  162782.3442  0.7140     0.6795    

Lasso Regression
Dataset              Train MSE    Test MSE     Train R2   Test R2   
Original             138181.5715  162795.0976  0.7140     0.6795    
MinMax Scaled        138191.2472  162946.6446  0.7140     0.6792    
Standard Scaled      138181.5767  162795.8366  0.7140     0.6795    

Ridge Regression
Dataset              Train MSE    Test MSE     Train R2   Test R2   
Original             138181.3621  162783.1618  0.7140     0.6795    
MinMax Scaled        138209.9000  163164.4874  0.7140     0.6788    
Standard Scaled      138181.3600  162786.8604  0.7140     0.6795    

Polynomial Regression (Degree 2)
Dataset              Train MSE    Test MSE     Train R2   Test R2   
Original      