In [27]:
import pandas as pd
import numpy as np


df = pd.read_csv("dataset.csv")


df = df.drop("Unnamed: 0", axis=1) # drop index



In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
string_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()

text_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder(sparse_output=False))
])

number_pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])


ct = ColumnTransformer([
    ("text_pipe", text_pipeline, string_cols),
    ("number_pipe", number_pipeline, numeric_cols)
])


df = pd.DataFrame(ct.fit_transform(df), columns=ct.get_feature_names_out())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1935 entries, 0 to 1934
Data columns (total 27 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   text_pipe__blue_no                     1935 non-null   float64
 1   text_pipe__blue_yes                    1935 non-null   float64
 2   text_pipe__four_g_no                   1935 non-null   float64
 3   text_pipe__four_g_yes                  1935 non-null   float64
 4   text_pipe__wifi_no_wifi                1935 non-null   float64
 5   text_pipe__wifi_wifi                   1935 non-null   float64
 6   text_pipe__price_range_high cost       1935 non-null   float64
 7   text_pipe__price_range_low cost        1935 non-null   float64
 8   text_pipe__price_range_medium cost     1935 non-null   float64
 9   text_pipe__price_range_very high cost  1935 non-null   float64
 10  number_pipe__battery_power             1935 non-null   float64
 11  numb

In [None]:
from sklearn.model_selection import train_test_split


X = df.drop(columns=["text_pipe__price_range_high", "text_pipe__price_range_low cost", "text_pipe__price_range_medium cost", "text_pipe__price_range_very high cost"], axis=0)
y = df["text_pipe__price_range_high", "text_pipe__price_range_low cost", "text_pipe__price_range_medium cost", "text_pipe__price_range_very high cost"]

X_train, X_to_split, y_train, y_to_split = train_test_split(X, y, test_size=0.66, random_state=42)
X_validate, X_test, y_validate, y_test = train_test_split(X_to_split, y_to_split, test_size=0.5, random_state=42)


KeyError: "['price_range'] not found in axis"

In [None]:
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

keras.utils.set_random_seed(42)

model = keras.Sequential([
    keras.layers.Input((n_input_features,)),  # If we write (n_input_features), it will simplify to int. If add a comma, it will be a single-element tuple
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(1)
])

model.summary()

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=optimizer, loss="mse")
initial_weights = model.get_weights()

In [None]:
history = model.fit(X_train, y_train, epochs=100, validation_data=[X_test, y_test])

In [None]:
model_predictions = model.predict(X_test)
mean_absolute_error(y_test, model_predictions)