In [70]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import timeit
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import psutil

# Function to measure CPU utilization
def measure_cpu_utilization():
    return psutil.cpu_percent(interval=1)

start = timeit.default_timer()

# Measure initial CPU utilization
initial_cpu = measure_cpu_utilization()

# Load data and measure execution time
start_loading = timeit.default_timer()
data = pd.read_csv("california_housing.csv")
end_loading = timeit.default_timer()
data_loading_time = end_loading - start_loading
loading_cpu = measure_cpu_utilization()
print(f'Data Loading Time: {data_loading_time:.4f} seconds, CPU Utilization: {loading_cpu:.1f}%')

# Data cleaning
data.info()
data.dropna(inplace=True)

# Apply one-hot encoding
start_encoding = timeit.default_timer()
X_encoded = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True)
end_encoding = timeit.default_timer()
encoding_time = end_encoding - start_encoding
encoding_cpu = measure_cpu_utilization()
print(f'One-Hot Encoding Time: {encoding_time:.4f} seconds, CPU Utilization: {encoding_cpu:.1f}%')

# Separate features (X) and target (y)
X = X_encoded.drop(['median_house_value'], axis=1)
y = X_encoded['median_house_value']

# Split into train and test sets
start_split = timeit.default_timer()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
end_split = timeit.default_timer()
split_time = end_split - start_split
split_cpu = measure_cpu_utilization()
print(f'Data Splitting Time: {split_time:.4f} seconds, CPU Utilization: {split_cpu:.1f}%')

# Random forest training
forest = RandomForestRegressor(n_estimators=100, random_state=0)

# Measure training time and CPU utilization
training_time = timeit.timeit('forest.fit(X_train, y_train)', globals=globals(), number=1)
training_cpu = measure_cpu_utilization()
print(f'Model Training Time: {training_time:.4f} seconds, CPU Utilization: {training_cpu:.1f}%')

# Model scoring
scoring_time = timeit.timeit('forest.score(X_test, y_test)', globals=globals(), number=1)
scoring_cpu = measure_cpu_utilization()
print(f'Model Scoring Time: {scoring_time:.4f} seconds, CPU Utilization: {scoring_cpu:.1f}%')

# Make predictions
start_pred = timeit.default_timer()
y_pred = forest.predict(X_test)
end_pred = timeit.default_timer()
prediction_time = end_pred - start_pred
prediction_cpu = measure_cpu_utilization()
print(f'Prediction Time: {prediction_time:.4f} seconds, CPU Utilization: {prediction_cpu:.1f}%')

# Evaluate model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f'mae: ${mae:.3f}')
print(f'mse: {mse:.3f}')
print(f'rmse: {rmse:.3f}')

end = timeit.default_timer()
overall_time = end - start
overall_cpu = measure_cpu_utilization()
print(f'Program Run Time: {overall_time:.4f} seconds, CPU Utilization: {overall_cpu:.1f}%')


Data Loading Time: 0.0794 seconds, CPU Utilization: 7.2%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
One-Hot Encoding Time: 0.0132 seconds, CPU Utilization: 4.8%
Data Splitting Time: 0.0094 seconds, CPU Utilization: 3.9%
Model Training Time: 45.9324 seconds, CPU Utilization: 2.7%
Model Scoring 

In [None]:
% git 