In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Read the competition training and test tables from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Keep the test ids for the submission file; 'id' itself is not a feature.
test_ids = test_df['id']

# Drop the identifier column, then forward-fill gaps: each missing cell takes
# the value of the nearest non-missing cell above it in the same column. A gap
# in the very first row has nothing above it and therefore stays missing.
# NOTE(review): forward fill borrows values from the preceding row — confirm
# that row order is meaningful for this data before relying on it.
train_df = train_df.drop(columns=['id']).ffill()
test_df = test_df.drop(columns=['id']).ffill()

# Identify categorical and numerical columns.
categorical_cols = ['Brand', 'Material', 'Size', 'Style', 'Color', 'Laptop Compartment', 'Waterproof']
# Every column that is neither categorical nor the target is treated as numeric.
non_numeric_cols = set(categorical_cols) | {'Price'}
numerical_cols = [col for col in train_df.columns if col not in non_numeric_cols]

# Encode categorical features as integer codes, keeping one fitted encoder per
# column in `label_encoders` so the codes can be mapped back to labels later.
label_encoders = {}
for col in categorical_cols:
    # Columns absent from either frame are skipped, as before.
    if col in train_df and col in test_df:
        train_values = train_df[col].astype(str)
        test_values = test_df[col].astype(str)

        # Fit on the union of train and test labels. LabelEncoder.transform
        # raises ValueError for a label it has not seen during fit, so fitting
        # on train alone breaks as soon as test contains a category (or a
        # leftover 'nan' string) that train lacks. When test introduces no new
        # labels the resulting codes are identical to a train-only fit.
        le = LabelEncoder()
        le.fit(pd.concat([train_values, test_values], ignore_index=True))

        train_df[col] = le.transform(train_values)
        test_df[col] = le.transform(test_values)
        label_encoders[col] = le

# Separate the target column from the feature columns.
y = train_df['Price']
X = train_df.drop(columns=['Price'])

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean-impute and then standardise the numerical columns. Both transformers
# are fitted on the training fold only and merely applied to the other frames.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

X_train[numerical_cols] = imputer.fit_transform(X_train[numerical_cols])
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

# Validation fold and test set reuse the statistics learned from X_train.
for frame in (X_val, test_df):
    frame[numerical_cols] = imputer.transform(frame[numerical_cols])
    frame[numerical_cols] = scaler.transform(frame[numerical_cols])

# Candidate regressors, keyed by the label used in the printed report and in
# the `predictions` dict.
models = {
    "SVM": SVR(kernel='rbf'),
    "Neural Network": MLPRegressor(
        hidden_layer_sizes=(100,50),
        max_iter=500,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
}

# Fit each regressor on the training fold, report its mean absolute error on
# the validation fold, and store its predictions for the test set.
predictions = {}
for model_name in models:
    estimator = models[model_name]
    estimator.fit(X_train, y_train)

    val_mae = mean_absolute_error(y_val, estimator.predict(X_val))
    print(f"{model_name} MAE: {val_mae:.2f}")

    predictions[model_name] = estimator.predict(test_df)

# Clustering model (KMeans)
# KMeans is unsupervised: the labels 0..4 it assigns are arbitrary cluster ids,
# not ordered price levels, so scaling the label itself yields meaningless
# "prices". Instead, each cluster predicts the mean training price of the rows
# that fall into it.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)

# Fit on the imputed and scaled training fold so the centroids live in the same
# feature space as test_df (X itself is never imputed or scaled).
train_clusters = kmeans.fit_predict(X_train)
cluster_mean_price = y_train.groupby(train_clusters).mean()

# Cluster id assigned to every test row.
kmeans_pred = kmeans.predict(test_df)

# Look up each test row's cluster mean; fall back to the overall training mean
# should a cluster id have no entry in the lookup table.
predictions["KMeans"] = (
    cluster_mean_price.reindex(kmeans_pred).fillna(y_train.mean()).to_numpy()
)

# Ensemble averaging: stack every model's test predictions (one row per model)
# and take the unweighted mean across models for each test row.
stacked_preds = np.array(list(predictions.values()))
ensemble_pred = stacked_preds.mean(axis=0)

# Create submission file: pair the saved test ids with the averaged prediction
# and write the result without the DataFrame index.
submission_df = pd.DataFrame({"id": test_ids}).assign(Price=ensemble_pred)
submission_df.to_csv("sample_submission.csv", index=False)

print("Predictions saved to sample_submission.csv")


SVM MAE: 33.65
Neural Network MAE: 33.68
Decision Tree MAE: 45.52
Random Forest MAE: 34.35
Gradient Boosting MAE: 33.64


[WinError 2] The system cannot find the file specified
  File "c:\Users\sandu\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\sandu\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sandu\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1036, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                   

Predictions saved to sample_submission.csv
