In [25]:
! pip install xgboost



In [26]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.inspection import permutation_importance

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [29]:
from joblib import dump, load

Loading Data using CSV

In [30]:
df = pd.read_csv('cleaned_data.csv')

In [31]:
df.shape

(2943, 174)

Splitting the Data:

In [32]:

X = df.drop('price', axis=1)
y = df['price']


In [9]:
numeric_features = df.select_dtypes(include=['number']).columns
categorical_features = df.select_dtypes(include=['object', 'category']).columns

print("Numeric Features:", numeric_features)
print("Categorical Features:", categorical_features)

Numeric Features: Index(['productID', 'price'], dtype='object')
Categorical Features: Index(['model', 'color', 'gpu', 'ram', 'harddisk', 'OS', 'brand',
       'screen_size', 'gpu_type', 'CPU_Model',
       ...
       'WORK', 'Water Proof', 'Water Resistant', 'White backlight 84 Key',
       'Wifi', 'built for entertainment', 'camera', 'multitasking',
       'premium business-class notebook', 'privacy'],
      dtype='object', length=172)


In [33]:
# One-hot encoding for categorical variables
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Choosing a Model:

Random Forest:



In [35]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


Model Evaluation: After training the model, evaluate its performance using RMSE.



In [36]:
import numpy as np
from sklearn.metrics import root_mean_squared_error
y_pred = model.predict(X_val)
rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
print('Validation RMSE:', rmse)


Validation RMSE: 608.308753451692


XGBoost

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
transformers_list=[
        ("category", OrdinalEncoder(handle_unknown="use_encoded_value", encoded_missing_value=-1, unknown_value=-2), categorical_features),
        ("numeric", "passthrough", numeric_features)
]
preprocess_transformer = ColumnTransformer(transformers_list)

In [40]:
xgb_pipeline = Pipeline([
    ("preprocess", preprocess_transformer),
    ("xgb", XGBClassifier())
])

In [41]:
xgb_params = {
    'xgb__max_depth': [3, 5, 7],
    'xgb__min_child_weight': [0.1, 1, 10]
}

In [42]:
xgb_search = GridSearchCV(estimator = xgb_pipeline,
                                       param_grid = xgb_params,
                                       cv = 5, scoring='f1', verbose=2)

In [43]:
xgb_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ........xgb__max_depth=3, xgb__min_child_weight=0.1; total time=   0.0s
[CV] END ........xgb__max_depth=3, xgb__min_child_weight=0.1; total time=   0.0s
[CV] END ........xgb__max_depth=3, xgb__min_child_weight=0.1; total time=   0.0s
[CV] END ........xgb__max_depth=3, xgb__min_child_weight=0.1; total time=   0.0s
[CV] END ........xgb__max_depth=3, xgb__min_child_weight=0.1; total time=   0.0s
[CV] END ..........xgb__max_depth=3, xgb__min_child_weight=1; total time=   0.0s
[CV] END ..........xgb__max_depth=3, xgb__min_child_weight=1; total time=   0.0s
[CV] END ..........xgb__max_depth=3, xgb__min_child_weight=1; total time=   0.0s
[CV] END ..........xgb__max_depth=3, xgb__min_child_weight=1; total time=   0.0s
[CV] END ..........xgb__max_depth=3, xgb__min_child_weight=1; total time=   0.0s
[CV] END .........xgb__max_depth=3, xgb__min_child_weight=10; total time=   0.0s
[CV] END .........xgb__max_depth=3, xgb__min_chil

ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'price'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/utils/_indexing.py", line 361, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'price'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/compose/_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/compose/_column_transformer.py", line 536, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/VISAI-AI-Bootcamp-2024/.conda/lib/python3.11/site-packages/sklearn/utils/_indexing.py", line 369, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


In [23]:
y_train_predict = xgb_search.predict(X_train)
y_test_predict = xgb_search.predict(X_test)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Example of evaluate model
# confusion_report_formatting(y_train, confusion_matrix(y_train, y_train_predict))
# confusion_report_formatting(y_test, confusion_matrix(y_test, y_test_predict))

# print(classification_report(y_train, y_train_predict))
# print(classification_report(y_test, y_test_predict))

In [24]:
score = pd.DataFrame([[
    accuracy_score(y_train, y_train_predict),
    f1_score(y_train, y_train_predict, average='weighted'),
]], index=["xgb (1st)"], columns=["Accuracy", "F1 score"])

NameError: name 'y_train_predict' is not defined

In [None]:
report_train = pd.concat([report_train, score])
report_train

In [None]:
score = pd.DataFrame([[
    accuracy_score(y_test, y_test_predict),
    f1_score(y_test, y_test_predict, average='weighted'),
]], index=["xgb (1st)"], columns=["Accuracy", "F1 score"])

In [None]:
report_test = pd.concat([report_test, score])
report_test

Hyperparameter Tuning:

If the RMSE is not below 300, you can tune the model’s hyperparameters using GridSearchCV.


In [25]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


Testing

In [20]:
import numpy as np
from sklearn.metrics import root_mean_squared_error
y_pred = model.predict(X_val)
rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
print('Validation RMSE:', rmse)

Validation RMSE: 608.308753451692


Load the Test Data:



In [17]:
test_df = pd.read_csv('test.csv')


Make Predictions:

In [18]:
predictions = best_model.predict(test_df)


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- OS
- color
- cpu
- gpu
- model
- ...
Feature names seen at fit time, yet now missing:
- 1.5mm Key-travel_1.0
- 1.5mm Key-travel_Unknown
- 144HZ REFRESH RATE_1.0
- 144HZ REFRESH RATE_Unknown
- 32 GB RAM_1.0
- ...


Prepare Submission:



In [None]:
submission = pd.DataFrame({'productID': test_df['productID'], 'PredictedPrice': predictions})
submission.to_csv('submission.csv', index=False)


Insights and Summary
Summarize the important findings in both notebooks, discussing:

Which features had the most significant impact on price
The final RMSE on the test set
Any insights into how certain components (e.g., RAM, CPU, GPU) affect the price of notebooks.


In [26]:
pip install tpot

Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting tqdm>=4.36.1 (from tpot)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting requests>=2.3.0 (from update-checker>=0.16->tpot)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.3.0->update-checker>=0.16->tpot)
  Downloading charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Collec

In [45]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
from sklearn.metrics import root_mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

tpot = TPOTRegressor(verbosity=2, generations=5, population_size=50, random_state=42)
tpot.fit(X_train, y_train)

# Evaluate the model on the validation set
y_pred = tpot.predict(X_val)
rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
print('Validation RMSE:', rmse)

# Export the best model
tpot.export('best_model.py')

                                                                                
Generation 1 - Current best internal CV score: -283538.92672299256
                                                                              
Generation 2 - Current best internal CV score: -283538.92672299256
                                                                                  
Generation 3 - Current best internal CV score: -283538.92672299256
                                                                                
Generation 4 - Current best internal CV score: -283393.26981167385
                                                                                
Generation 5 - Current best internal CV score: -274654.0771640582
                                                             
Best pipeline: ExtraTreesRegressor(ZeroCount(RidgeCV(input_matrix)), bootstrap=True, max_features=0.5, min_samples_leaf=6, min_samples_split=11, n_estimators=100)
Validation RMSE: 549.3661091386582




Best pipeline: ExtraTreesRegressor(ZeroCount(RidgeCV(input_matrix)), bootstrap=True, max_features=0.5, min_samples_leaf=6, min_samples_split=11, n_estimators=100)