# Week 1: Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the smart-home energy CSV (path is relative to the working directory) into `df`,
# the DataFrame used by every later cell.
df=pd.read_csv('smart_home_energy_consumption.csv')  #Loading the dataset

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Preview the last rows of the loaded data.
df.tail()

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Spell out the dataset dimensions in words.
row_count, column_count = df.shape
print('Total number of rows in the dataset:', row_count)
print('Total number of columns in the dataset:', column_count)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
# For every column, print the absolute and then the relative frequency of each value.
rule = '-----------------------------------------'
for column_name in df.columns:
    column_values = df[column_name]

    print('Count of', column_name)
    print(rule)
    print(column_values.value_counts(), '\n')

    print('Proportion of', column_name)
    print(rule)
    print(column_values.value_counts(normalize=True), '\n\n')

### Checking for duplicates:

In [None]:
# Count fully duplicated rows once, then report.
duplicate_row_count = df.duplicated().sum()
if duplicate_row_count != 0:
    print('Total number of duplicate rows in the dataset :', duplicate_row_count)
else:
    print('There are no duplicate rows in the dataset.')

### Checking for missing values.

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Per-column summary of missing data: absolute count and percentage of rows.
# Built column-wise so the count and percentage columns keep numeric dtypes
# (building from a list of rows and transposing leaves every column as object).
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    'Column_name': null_counts.index,
    'Missing_values': null_counts.to_numpy(),
    'Percentage_missing_values': null_counts.to_numpy() / len(df) * 100,
})
missing

#### Visually inspecting the missing values in the dataset using heatmap (white bars show missing values)

In [None]:
# Heatmap of the null mask: one column per feature, one row per record.
fig, ax = plt.subplots(figsize=(22, 10))
plt.xticks(fontsize=15)
sns.heatmap(df.isnull(), cbar=False, cmap='ocean', yticklabels=False, ax=ax)
plt.show()

In [None]:
# Missing values in these numeric columns are filled with the column median,
# which is less sensitive to extreme values than the mean.
columns_with_missing_values = [
    'Energy_Consumption_kWh',
    'Temperature_C',
    'Humidity_%',
    'HVAC_Usage_kWh',
]

# fillna with a Series matches each median to its column by name.
column_medians = df[columns_with_missing_values].median()
df[columns_with_missing_values] = df[columns_with_missing_values].fillna(column_medians)

# Re-count the nulls to confirm the listed columns are now complete.
df.isnull().sum()


### Checking the Outliers:

In [None]:
# Boxplots for all numerical columns, four per row.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

n_grid_cols = 4
# Ceiling division sizes the grid to the data; at least one row keeps the figure height positive.
n_grid_rows = max(1, -(-len(numeric_columns) // n_grid_cols))

# 2 inches per row is the row height of the former fixed 15-row, 30-inch layout.
plt.figure(figsize=(18, 2 * n_grid_rows))

for i, variable in enumerate(numeric_columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(data=df, x=variable)
    plt.title(variable)

# Lay out once, after every title has been set, so all titles are accounted for.
plt.tight_layout()
plt.show()

### Outliers Count:

In [None]:
# Names of all numeric columns; the outlier-count cells below iterate over this list.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

In [None]:
# Count, per numeric column, the values lying outside the 1.5*IQR fences.
outliers_count = {}

for column in numeric_columns:
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Boolean mask of out-of-fence values; NaN compares False on both sides and is not counted.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers_count[column] = int(is_outlier.sum())

print("Number of outliers in each column:")
pd.DataFrame([{'Column': column, 'No. of outliers': outliers} for column, outliers in outliers_count.items()])

In [None]:
# Cap every numeric column of df at its own 1.5*IQR fences.
for column in df.select_dtypes(include=np.number).columns:
    values = df[column]
    Q1, Q3 = values.quantile([0.25, 0.75])
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Values beyond a fence are replaced by that fence; everything else passes through.
    # This is applied to the whole of df (no train/test split exists at this point).
    capped_below = np.where(values < lower_bound, lower_bound, values)
    df[column] = np.where(capped_below > upper_bound, upper_bound, capped_below)



# Display a message that the outliers in df have been treated
"Outliers have been capped using the IQR method in the df dataset."

### Outlier check after treatment:

In [None]:
def _count_iqr_outliers(values):
    """Return how many entries of `values` fall outside its own 1.5*IQR fences."""
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return int((too_low | too_high).sum())


# Re-count after capping, using the same rule as before treatment.
outliers_count = {column: _count_iqr_outliers(df[column]) for column in numeric_columns}

print("Number of outliers in each column:")
pd.DataFrame([{'Column': column, 'No. of outliers': outliers} for column, outliers in outliers_count.items()])

#### Boxplots after outlier treatment


In [None]:
# Boxplots for all numerical columns, four per row.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

n_grid_cols = 4
# Ceiling division sizes the grid to the data; at least one row keeps the figure height positive.
n_grid_rows = max(1, -(-len(numeric_columns) // n_grid_cols))

# 2 inches per row is the row height of the former fixed 15-row, 30-inch layout.
plt.figure(figsize=(18, 2 * n_grid_rows))

for i, variable in enumerate(numeric_columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(data=df, x=variable)
    plt.title(variable)

# Lay out once, after every title has been set, so all titles are accounted for.
plt.tight_layout()
plt.show()

### Time-Series Consistency

In [None]:
# Look for timestamps that occur more than once in the 'Date' column.
# keep=False flags every occurrence of a repeated date, not only the later ones.
has_repeated_date = df.duplicated(subset=['Date'], keep=False)
duplicate_timestamps = df[has_repeated_date]

# Show how many rows are affected together with a sample of them.
duplicate_timestamps_count = len(duplicate_timestamps)
duplicate_timestamps_count, duplicate_timestamps.head()

### Standardization:

In [None]:
from sklearn.preprocessing import StandardScaler

# Consumption columns to rescale to zero mean and unit variance.
features_to_scale = [
    'Energy_Consumption_kWh',
    'HVAC_Usage_kWh',
    'Kitchen_Usage_kWh',
    'Electronics_Usage_kWh',
]

# NOTE(review): the scaler is fitted on the full dataset, before the 80/20 split made later
# in the notebook, and it also rescales the modelling target (Energy_Consumption_kWh) -
# confirm this is intended.
scaler = StandardScaler()
scaler.fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

# Preview the scaled columns.
df.head()

In [None]:
# Summary statistics after scaling, transposed so that each column of df becomes one row.
df.describe().T

# Week 2: Exploratory Data Analysis (EDA) & Visualization

#### Univariate Analysis

In [None]:
# Split the column names by dtype for the univariate plots:
# int64/float64 columns are treated as numerical, object columns as categorical.
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

In [None]:
#Define a function to create histogram and box plots for numerical columns
def plot_numerical_univariate(data,col_list):
    """Show a histogram (with KDE) and a boxplot side by side for each column.

    Parameters
    ----------
    data : object indexable by column name (e.g. a DataFrame)
        Source of the values to plot.
    col_list : sequence of column names
        One figure row is drawn per entry. An empty sequence draws nothing.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_rows = len(col_list)
    if n_rows == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    # squeeze=False keeps `ax` two-dimensional even for a single column,
    # so ax[i, j] indexing is always valid.
    fig, ax = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
    for i, col in enumerate(col_list):
        sns.histplot(data[col], kde=True, ax=ax[i, 0])
        ax[i, 0].set_title(f'Histogram of {col}', fontsize=14)
        sns.boxplot(x=data[col], ax=ax[i, 1])
        ax[i, 1].set_title(f'Boxplot of {col}', fontsize=14)
    plt.tight_layout()
    plt.show()

In [None]:
# Histogram and boxplot for every numerical column.
plot_numerical_univariate(df,numerical_cols)

In [None]:
#Define a function to create bar charts for categorical columns
def plot_categorical_univariate(data,col_list):
    """Show one bar chart of value frequencies per column, stacked vertically.

    Parameters
    ----------
    data : object indexable by column name (e.g. a DataFrame)
        Source of the values to count.
    col_list : sequence of column names
        One chart is drawn per entry. An empty sequence draws nothing.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    n_rows = len(col_list)
    if n_rows == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    # squeeze=False always returns a 2-D array of axes; without it a single
    # column yields a bare Axes object that cannot be indexed.
    fig, ax = plt.subplots(n_rows, 1, figsize=(12, 4 * n_rows), squeeze=False)
    for i, col in enumerate(col_list):
        value_counts = data[col].value_counts()
        sns.barplot(x=value_counts.index, y=value_counts, ax=ax[i, 0])
        ax[i, 0].set_title(f'Bar Chart of {col}', fontsize=14)
        ax[i, 0].tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Frequency bar chart for every object-dtype column.
plot_categorical_univariate(df,categorical_cols)

#### Bivariate and Multivariate Analysis

In [None]:

# Variables to plot against energy consumption, one scatter plot each.
variables_of_interest = [
    'Temperature_C',
    'Humidity_%',
    'HVAC_Usage_kWh',
    'Kitchen_Usage_kWh',
    'Electronics_Usage_kWh',
    'Occupancy',
]

# A 3x2 grid gives one panel per variable; flatten it so panels can be walked in order.
fig, axes = plt.subplots(3, 2, figsize=(12, 18))
axes = axes.flatten()

for panel, var in zip(axes, variables_of_interest):
    sns.scatterplot(data=df, x=var, y='Energy_Consumption_kWh', ax=panel)
    panel.set_title(f'Energy Consumption vs. {var}')

plt.tight_layout()
plt.show()

#### Multivariate Analysis

In [None]:
# Columns included in the correlation analysis.
features = [
    'Energy_Consumption_kWh',
    'Temperature_C',
    'Humidity_%',
    'HVAC_Usage_kWh',
    'Kitchen_Usage_kWh',
    'Electronics_Usage_kWh',
    'Occupancy',
]

# Pairwise correlations between the selected columns.
corr_matrix = df[features].corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix of Energy Consumption and Related Variables')
plt.show()

#### Pair plot

In [None]:
# Columns included in the pair plot.
features = [
    'Energy_Consumption_kWh',
    'Temperature_C',
    'Humidity_%',
    'HVAC_Usage_kWh',
    'Kitchen_Usage_kWh',
    'Electronics_Usage_kWh',
    'Occupancy',
]

# Pairwise scatter plots; the diagonal uses seaborn's default (no diag_kind is passed).
pair_grid = sns.pairplot(df[features])

# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(top=0.95)
plt.suptitle('Pair Plot of Energy Consumption and Related Variables', size=16)

plt.show()

#### Time Series Analysis

In [None]:
# If 'Date' is not a column (e.g. this cell already ran and moved it into the index),
# name the index 'Date' and turn it back into a column first.
if 'Date' not in df.columns:
    df = df.rename_axis('Date').reset_index()

# Parse the dates and use them as the index for the time-series cells below.
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

In [None]:
# Daily mean of energy consumption, plus its 7-day rolling mean as a smoothed trend.
daily_energy = df['Energy_Consumption_kWh'].resample('D').mean()
weekly_average = daily_energy.rolling(window=7).mean()

fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(daily_energy, label='Daily Energy Consumption')
ax.plot(weekly_average, color='red', label='7-Day Moving Average')
ax.set_title('Daily Energy Consumption Trends')
ax.set_xlabel('Date')
ax.set_ylabel('Energy Consumption (kWh)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Preview the frame now that 'Date' is the index.
df.head()

#### Decompose the time series

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Days without readings come out of the daily resample as NaN. Dropping them would leave
# an irregularly spaced index (no inferable frequency, and the weekly cycle would shift),
# so fill the gaps by time-based interpolation to keep one value per calendar day.
daily_energy_filled = daily_energy.interpolate(method='time')

# Decompose the daily series with an explicit weekly period.
# The additive model is used; Energy_Consumption_kWh was standardized earlier in the
# notebook, so the series is expected to contain non-positive values, which a
# multiplicative model does not accept.
result = seasonal_decompose(daily_energy_filled, model='additive', period=7)
result.plot()
plt.show()

#### Feature Engineering

In [None]:
# NOTE(review): Energy_Consumption_kWh and HVAC_Usage_kWh were standardized earlier in the
# notebook, so the two ratio features below are ratios of scaled values, not of kWh -
# confirm this is intended.

# Energy per occupant; an occupancy of 0 is treated as 1 to avoid division by zero.
occupants = df['Occupancy'].replace(0, 1)
df['Energy per Occupant'] = df['Energy_Consumption_kWh'] / occupants

# Calendar features taken from the Date index (dayofweek: Monday=0 ... Sunday=6).
df['Day of Week'] = df.index.dayofweek
df['Is Weekend'] = df['Day of Week'].map(lambda day: 1 if day >= 5 else 0)

# month % 12 // 3 + 1 maps Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
season_names = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'}
df['Season'] = df.index.month % 12 // 3 + 1
df['Season'] = df['Season'].map(season_names)

# Share of HVAC usage relative to total consumption.
df['HVAC Efficiency'] = df['HVAC_Usage_kWh'] / df['Energy_Consumption_kWh']

In [None]:
# Preview the frame with the engineered columns added.
df.head()

In [None]:
# Correlate every numeric column, including the engineered ones.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
correlations = df[numeric_cols].corr()

# Annotated heatmap of the full numeric correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
plt.show()

#### Advanced Visualizations

In [None]:
# Joint plots (scatter plus marginal distributions) of energy consumption against
# HVAC usage, temperature and energy per occupant, one figure each.
for paired_column in ('HVAC_Usage_kWh', 'Temperature_C', 'Energy per Occupant'):
    sns.jointplot(x='Energy_Consumption_kWh', y=paired_column, data=df, kind='scatter')
    plt.show()

In [None]:
# Numeric columns used by the pair, violin and interactive plots below.
# NOTE(review): only int64/float64 dtypes are selected, so numeric columns of any other
# width would be left out - confirm this is intended.
numeric_columns=df.select_dtypes(include=['int64','float64']).columns.tolist()


In [None]:
# Pairwise scatter plots of the numeric variables, with a KDE curve on the diagonal.
numeric_pair_grid = sns.pairplot(df[numeric_columns], diag_kind='kde', markers='o')
plt.suptitle('Pairwise Joint Plots for Numeric Variables', y=1.02, fontsize=15)
plt.show()

In [None]:
plt.figure(figsize=(15, 10))

# Three plots per row; the row count is the ceiling of (number of columns / 3).
num_cols = 3
num_vars = len(numeric_columns)
num_rows = -(-num_vars // num_cols)

# One violin plot per numeric column, split by season.
for position, column in enumerate(numeric_columns, start=1):
    plt.subplot(num_rows, num_cols, position)
    sns.violinplot(x=df['Season'], y=df[column], palette="muted")
    plt.title(f'Violin Plot of {column} by Season', fontsize=12)
    plt.xlabel('Season', fontsize=10)
    plt.ylabel(column, fontsize=10)

plt.tight_layout()
plt.show()

#### Interactive Plot

In [None]:
import plotly.express as px

# One interactive scatter per numeric column, each plotted against energy consumption.
comparison_columns = [name for name in numeric_columns if name != 'Energy_Consumption_kWh']

for column in comparison_columns:
    axis_labels = {'Energy_Consumption_kWh': 'Energy Consumption (kWh)', column: column}
    fig = px.scatter(
        df,
        x='Energy_Consumption_kWh',
        y=column,
        hover_data=df.columns,  # every column is shown in the hover tooltip
        title=f'Interactive Scatter Plot of Energy Consumption vs {column}',
        labels=axis_labels,
    )
    fig.show()

In [None]:
# Interactive violin plot of energy consumption per season, with an inner box plot
# and every individual point drawn.
season_axis_labels = {'Season': 'Season', 'Energy_Consumption_kWh': 'Energy Consumption (kWh)'}
fig = px.violin(
    df,
    x='Season',
    y='Energy_Consumption_kWh',
    box=True,
    points='all',
    title='Interactive Violin Plot of Energy Consumption by Season',
    labels=season_axis_labels,
)
fig.show()



In [None]:
# The same per-season violin plot for every other numeric column.
for column in numeric_columns:
    if column == 'Energy_Consumption_kWh':
        # Energy consumption has its own violin plot.
        continue
    fig = px.violin(
        df,
        x='Season',
        y=column,
        box=True,
        points='all',
        title=f'Interactive Violin Plot of {column} by Season',
        labels={'Season': 'Season', column: column},
    )
    fig.show()

## Task 4: Data Split

In [None]:

# Chronological 80/20 split: sort by the date index, then cut by position so that
# the test set is the most recent 20% of rows.
df = df.sort_index()

split_index = int(len(df) * 0.8)
train_df, test_df = df.iloc[:split_index], df.iloc[split_index:]

# Report how many rows landed in each part.
print(f'Training set size: {len(train_df)}')
print(f'Testing set size: {len(test_df)}')


In [None]:
# First rows of the training portion.
train_df.head()

In [None]:
# First rows of the testing portion.
test_df.head()

#### Task 2: Model Selection and Training

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Sort by the date index so the positional split below is a chronological split.
df = df.sort_index()

# One-hot encode the object-dtype columns (first level dropped as the baseline).
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

target_column = 'Energy_Consumption_kWh'

# 'Energy per Occupant' and 'HVAC Efficiency' are computed from the target in the
# feature-engineering cell, so using them as predictors would leak the target into X.
# They are excluded here (only if present, so this cell also runs without that step).
target_derived_columns = [
    name for name in ('Energy per Occupant', 'HVAC Efficiency')
    if name in df_encoded.columns
]

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=[target_column] + target_derived_columns)
y = df_encoded[target_column]

# First 80% of rows (earliest dates) for training, the remainder for testing.
split_index = int(len(df_encoded) * 0.8)
X_train = X.iloc[:split_index]
y_train = y.iloc[:split_index]
X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]

# Fit the linear regression on the training window.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out window.
y_pred = model.predict(X_test)

# Evaluate the model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
# Residuals of the test-set predictions (actual minus predicted).
residuals = y_test - y_pred

fig, (distribution_ax, scatter_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: distribution of the residuals.
sns.histplot(residuals, bins=30, kde=True, ax=distribution_ax)
distribution_ax.set_title('Distribution of Residuals')
distribution_ax.set_xlabel('Residuals')
distribution_ax.set_ylabel('Frequency')

# Right panel: residuals against predictions, with a dashed reference line at zero.
scatter_ax.scatter(y_pred, residuals)
scatter_ax.axhline(0, color='red', linestyle='--')
scatter_ax.set_title('Residuals vs. Predicted Values')
scatter_ax.set_xlabel('Predicted Values')
scatter_ax.set_ylabel('Residuals')

plt.tight_layout()
plt.show()

#### Task 4: Coefficient Extraction

In [None]:
# Pair each training feature with its fitted coefficient.
feature_names = X_train.columns
coefficients = model.coef_

# Rank features by coefficient magnitude, largest first.
coefficients_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(**{'Absolute Coefficient': lambda frame: frame['Coefficient'].abs()})
    .sort_values(by='Absolute Coefficient', ascending=False)
)

print(coefficients_df[['Feature', 'Coefficient']])

#### Feature Importance Plot



In [None]:
# Rank features by the magnitude of their regression coefficient.
coefficients_df['Absolute Coefficient'] = coefficients_df['Coefficient'].abs()
coefficients_df = coefficients_df.sort_values(by='Absolute Coefficient', ascending=False)

# Limit to top N features for clarity, if needed
top_n = 10  # You can change this to show more or fewer features
top_features = coefficients_df.head(top_n)

# Horizontal bars of the absolute coefficients.
plt.figure(figsize=(12, 6))
sns.barplot(data=top_features, x='Absolute Coefficient', y='Feature', palette='viridis')
plt.title('Top Features by Absolute Coefficient Value')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
# Logarithmic x-axis so small coefficients stay visible next to large ones.
# No reference line is drawn at x=0: zero has no position on a logarithmic axis.
plt.xscale('log')
plt.show()

#### Task 5: Predictive System and Testing

In [None]:
# Uses `model` (the fitted LinearRegression) and `X_test` (the held-out features),
# both created in the model-training cell above.

# Predict the target for every row of the test set.
predictions = model.predict(X_test)

In [None]:
# Side-by-side table of actual values, predictions and their difference.
results_df = pd.DataFrame({
    'Actual Values': y_test,
    'Predicted Values': predictions,
}).assign(Residuals=lambda frame: frame['Actual Values'] - frame['Predicted Values'])

# Preview the comparison.
results_df.head()

In [None]:
fig, (fit_ax, residual_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: actual vs. predicted; points on the dashed diagonal are perfect predictions.
actual_range = [y_test.min(), y_test.max()]
fit_ax.scatter(y_test, predictions, alpha=0.6)
fit_ax.plot(actual_range, actual_range, color='red', linestyle='--')
fit_ax.set_title('Actual vs. Predicted Values')
fit_ax.set_xlabel('Actual Values')
fit_ax.set_ylabel('Predicted Values')

# Right panel: residuals against predictions, with a dashed line at zero residual.
residual_ax.scatter(predictions, results_df['Residuals'], alpha=0.6)
residual_ax.axhline(0, color='red', linestyle='--')
residual_ax.set_title('Residuals vs. Predicted Values')
residual_ax.set_xlabel('Predicted Values')
residual_ax.set_ylabel('Residuals')

plt.tight_layout()
plt.show()