In [None]:
import os, types
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training table and the test table from CSV files that are
# expected to sit in the notebook's working directory.
hack_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
from scipy.stats import gaussian_kde

# Impute missing 'year_built' values in test_df with the constant 2022.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the column selection: that chained in-place form is deprecated in pandas 2.x
# and, with Copy-on-Write enabled, modifies a temporary object and leaves
# test_df unchanged.
test_df['year_built'] = test_df['year_built'].fillna(2022)

# Columns whose NaN values are replaced with the column mean in test_df
columns_to_replace_na = [
    'energy_star_rating',
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog'
]

# Mean-impute each selected column (the mean is computed on test_df itself)
for col in columns_to_replace_na:
    test_df[col] = test_df[col].fillna(test_df[col].mean())

# Split test_df into a feature set (columns at positions 4..61) and a target
# (column at position 62).
# NOTE(review): these positions are hard-coded; confirm that test.csv has the
# same column layout as train.csv and that position 62 really is the target.
x_test = test_df.iloc[:, 4:62]
y_test = test_df.iloc[:, 62]

# Create a 3-row by 2-column grid of axes (15x12 inches) that will hold the
# distribution plots of the selected test_df columns.
fig, axes = plt.subplots(3, 2, figsize=(15, 12))

# Function to plot histogram and KDE
def plot_hist_kde(data, ax, title, color='lightblue'):
    """Draw a density-normalised histogram with a Gaussian KDE curve on top.

    Parameters
    ----------
    data : array-like of numbers
        Values to plot (e.g. a pandas Series). NaN and infinite entries are
        ignored.
    ax : matplotlib Axes (or any object with ``hist``, ``plot``, ``set_title``)
        Target axes to draw on.
    title : str
        Title set on ``ax``.
    color : str, optional
        Fill colour of the histogram bars (the KDE line is always blue).

    Notes
    -----
    The histogram is skipped when no finite values remain, and the KDE curve
    is skipped when there are fewer than two distinct finite values, because
    ``gaussian_kde`` cannot be fitted to such data. The title is always set.
    """
    # Work on a clean float array so NaN/inf cannot break the histogram range
    # or the KDE fit.
    values = np.asarray(data, dtype=float)
    values = values[np.isfinite(values)]

    # Histogram
    if values.size:
        ax.hist(values, bins=30, color=color, density=True, alpha=0.6)

    # KDE (needs at least two distinct values to have a non-zero variance)
    if values.size > 1 and values.max() > values.min():
        density = gaussian_kde(values)
        xs = np.linspace(values.min(), values.max(), 200)
        ax.plot(xs, density(xs), color='blue')

    ax.set_title(title)

# Draw one histogram + KDE panel per mean-imputed column of test_df, filling
# the axes grid row by row.
for ax, col in zip(axes.ravel(), columns_to_replace_na):
    plot_hist_kde(test_df[col], ax, f'Distribution of {col}')

# The grid has more panels than plotted columns; hide the unused ones so the
# figure does not show an empty frame.
for ax in axes.ravel()[len(columns_to_replace_na):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Re-processing hack_df
# train_test_split is used below; import it here so the cell runs on a fresh
# kernel instead of raising NameError.
from sklearn.model_selection import train_test_split

# Re-loading the training dataset so this cell always starts from the raw file
hack_df = pd.read_csv('train.csv')

# List of columns to replace NaN values with their mean
columns_to_replace_na = [
    'energy_star_rating',
    'direction_max_wind_speed',
    'direction_peak_wind_speed',
    'max_wind_speed',
    'days_with_fog'
]

# Replacing 'year_built' NaN values with 2022 in hack_df.
# Assigned back rather than fillna(..., inplace=True) on the column selection:
# the chained in-place form is deprecated in pandas 2.x and, with
# Copy-on-Write enabled, leaves hack_df unchanged.
hack_df['year_built'] = hack_df['year_built'].fillna(2022)

# Replacing NaN values in selected columns with their respective means in hack_df
for col in columns_to_replace_na:
    hack_df[col] = hack_df[col].fillna(hack_df[col].mean())

# Splitting hack_df into feature set (column positions 4..61) and target
# (column position 62).
# NOTE(review): positions are hard-coded; confirm against the CSV header.
x = hack_df.iloc[:, 4:62]
y = hack_df.iloc[:, 62]

# Splitting the data into training (80%) and testing (20%) sets with a fixed seed
x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size=0.2, random_state=42)

# Last expression of the cell: show the shapes of the four splits
x_training.shape, x_testing.shape, y_training.shape, y_testing.shape


In [None]:
# Using a Linear Regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Fit on the training split
lr_model = LinearRegression()
lr_model.fit(x_training, y_training)

# Predicting on the testing split
lr_predictions = lr_model.predict(x_testing)

# Calculating metrics on the testing split
lr_mse = mean_squared_error(y_testing, lr_predictions)
lr_mae = mean_absolute_error(y_testing, lr_predictions)
lr_r2 = r2_score(y_testing, lr_predictions)

print('Linear Regression - Mean Squared Error:', lr_mse)
print('Linear Regression - Mean Absolute Error:', lr_mae)
print('Linear Regression - R2 Score:', lr_r2)

# Drawing the graph: predicted vs actual values
plt.figure(figsize = (10, 6))
plt.scatter(y_testing, lr_predictions, alpha=0.4, color = 'green', marker='>')
# Red dashed identity line spanning the full target range; points on it are
# perfect predictions.
plt.plot([y.min(), y.max()], [y.min(), y.max()], '--', lw=2, color='red')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Linear Regression Model Actual vs Predicted Values')
# Call grid() rather than assigning to it: `plt.grid = True` rebinds the
# pyplot function to a bool, draws no grid, and breaks every later
# plt.grid(...) call in the same kernel.
plt.grid(True)
plt.show()



In [None]:
# Using a Decision Tree Regressor: the R-squared value of the linear
# regression suggests that there is room for improvement in the model.
from sklearn.tree import DecisionTreeRegressor

# Fit on the training split (fixed seed for reproducible trees)
dt_model = DecisionTreeRegressor(random_state=41)
dt_model.fit(x_training, y_training)

# Predicting on the testing split
dt_predictions = dt_model.predict(x_testing)

# Calculating metrics on the testing split
dt_mse = mean_squared_error(y_testing, dt_predictions)
dt_mae = mean_absolute_error(y_testing, dt_predictions)
dt_r2 = r2_score(y_testing, dt_predictions)

print('Decision Tree - Mean Squared Error:', dt_mse)
print('Decision Tree - Mean Absolute Error:', dt_mae)
print('Decision Tree - R2 Score:', dt_r2)

# Plotting the graph: predicted vs actual values
plt.figure(figsize = (10, 6))
plt.scatter(y_testing, dt_predictions, alpha=0.6, color = 'green', marker='<')
# Red dashed identity line spanning the full target range
plt.plot([y.min(), y.max()], [y.min(), y.max()], '--', lw=2, color='red')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Decision Tree Model Actual vs Predicted Values')
# Call grid() rather than assigning to it: `plt.grid = True` rebinds the
# pyplot function to a bool, draws no grid, and breaks every later
# plt.grid(...) call in the same kernel.
plt.grid(True)
plt.show()


In [None]:
# Random Forest regression
from sklearn.ensemble import RandomForestRegressor

# Build a 100-tree forest with a fixed seed and fit it on the training split
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_training, y_training)

# Score the testing split
rf_predictions = rf_model.predict(x_testing)

# Evaluation metrics: absolute error, squared error and R^2
rf_mae = mean_absolute_error(y_testing, rf_predictions)
rf_mse = mean_squared_error(y_testing, rf_predictions)
rf_r2 = r2_score(y_testing, rf_predictions)

# Actual vs predicted scatter, drawn through the explicit Axes interface
rf_fig, rf_ax = plt.subplots(figsize=(10, 6))
rf_ax.scatter(y_testing, rf_predictions, alpha=0.5, color="blue")
# Red dashed identity line across the full range of the target
rf_diag = [y.min(), y.max()]
rf_ax.plot(rf_diag, rf_diag, '--', lw=2, color="red")
rf_ax.set_xlabel('Actual Values')
rf_ax.set_ylabel('Predicted Values')
rf_ax.set_title('Random Forest Regressor: Actual vs Predicted')
rf_ax.grid(True)
plt.show()

# Cell output: the three metrics as a tuple
rf_mae, rf_mse, rf_r2


In [1]:
#Ridge and Lasso Regression
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# This cell needs x_training / y_training from the train/test split cell, so
# the notebook has to be run top to bottom; running this cell first on a fresh
# kernel raises NameError.
#
# Ridge(normalize=True) is no longer accepted: the `normalize` parameter was
# deprecated in scikit-learn 1.0 and removed in 1.2. Feature scaling is done
# explicitly in a pipeline instead. Following scikit-learn's deprecation
# guidance, StandardScaler(with_mean=False) plus alpha multiplied by the
# number of training samples stands in for the old behaviour.
# NOTE(review): rid_model is now a Pipeline; the fitted Ridge step is
# available as rid_model[-1]. Confirm the metrics are in line with expectations.
rid_model = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.01 * len(x_training)),
)
rid_model.fit(x_training, y_training)


NameError: name 'x_training' is not defined