Read the machine_temp_failure_missing.csv dataset into a Pandas DataFrame (you must use pathlib to build the file path). Inspect the dataset and check whether it contains any missing values. Compute statistics such as the mean, max, and min. Rename the value column to temperature. 

In [None]:
import pandas as pd
from pathlib import Path
# Load the dataset that contains missing readings; the location is a pathlib.Path.
filepath = Path('/Users/Himanshu/Desktop/machine_temp_failure_missing.csv')
df = pd.read_csv(filepath)

# Show the frame, then the number of missing entries in each column.
missing_values_count = df.isna().sum()
print(df)
print(missing_values_count)

# Summary statistics of the raw 'value' column, printed as min, max, mean.
for statistic in ('min', 'max', 'mean'):
    print(getattr(df['value'], statistic)())

# Give the measurement column a descriptive name and show the result.
df = df.rename(columns={'value': 'temperature'})
print(df)




Create a new feature delta_temperature which would be temp[i+1] - temp[i] where i is in range(Index). The feature represents the difference between consecutive temperature values. 


In [None]:
import pandas as pd
from pathlib import Path
# Reload the raw data so this cell can run on its own. A fresh read brings back
# the CSV's 'value' header, so the rename has to be repeated here before the
# 'temperature' column can be used.
filepath = Path('/Users/Himanshu/Desktop/machine_temp_failure_missing.csv')
df = pd.read_csv(filepath).rename(columns={'value': 'temperature'})

# delta_temperature[i] = temperature[i + 1] - temperature[i].
# shift(-1) moves every reading up one row, so row i is paired with the value
# of row i + 1 (shift(1) would pair it with the previous reading instead).
# The last row has no successor, and a pair that touches a missing reading has
# no defined difference, so those rows come out as NaN.
df['delta_temperature'] = df['temperature'].shift(-1) - df['temperature']
print(df.head())

Generate two separate plots for temperature and delta_temperature using  matplotlib.

In [None]:
import matplotlib.pyplot as plt
# One line chart per column. plt.show() is called after each series so the
# temperature and delta_temperature plots are rendered separately.
for column, heading in (
    ('temperature', 'Temperature data'),
    ('delta_temperature', 'Delta_temperature data'),
):
    plt.plot(df.index, df[column])
    plt.title(heading)
    plt.xlabel('index')
    plt.ylabel(column)
    plt.show()

Generate a plot in which temperature values greater than 70.00 are marked as red, and those below 70.00 are marked as blue. 


In [None]:
# Boolean flag per reading: True when the temperature exceeds 70.00.
mask = df['temperature'].gt(70.00)

# The flag is used as the colour value; with the 'bwr' colormap the low end
# (False) is drawn blue and the high end (True) red.
plt.scatter(x=df.index, y=df['temperature'], c=mask, cmap='bwr')
plt.title('Temperature Plot (Red: >70.00, Blue: <=70.00)')
plt.xlabel('Index')
plt.ylabel('Temperature')
plt.show()

Write three functions read_datasets(), plot_dfs(), rmse_score() for these datasets (as discussed in 1_4 notebook).

In [None]:
from math import sqrt

def read_datasets(folder='/Users/Himanshu/Desktop'):
    """Load the original and the missing-value temperature datasets.

    Args:
        folder: Directory holding ``machine_temp_failure_original.csv`` and
            ``machine_temp_failure_missing.csv``. Accepts a string or a
            ``pathlib.Path``; defaults to the location used in this notebook.

    Returns:
        A ``(df1, df2)`` tuple: the original dataset first, the dataset with
        missing values second. In both frames a ``value`` column, when
        present, is renamed to ``temperature``.
    """
    folder = Path(folder)
    # plot_dfs() and rmse_score() address the readings as 'temperature', while
    # the missing-value CSV names that column 'value'. The original file is
    # presumably laid out the same way; rename() leaves a frame untouched when
    # the column is absent, so applying it to both is safe.
    renames = {'value': 'temperature'}
    df1 = pd.read_csv(folder / 'machine_temp_failure_original.csv').rename(columns=renames)
    df2 = pd.read_csv(folder / 'machine_temp_failure_missing.csv').rename(columns=renames)
    return df1, df2

def plot_dfs(df1, df2):
    """Overlay the temperature series of two datasets on a single chart.

    Both frames are read through their ``timestamp`` and ``temperature``
    columns. The first frame is labelled 'Dataset 1' in the legend and the
    second 'Dataset 2'.
    """
    for frame, legend_label in ((df1, 'Dataset 1'), (df2, 'Dataset 2')):
        plt.plot(frame['timestamp'], frame['temperature'], label=legend_label)
    plt.title('Temperature Datasets')
    plt.xlabel('Timestamp')
    plt.ylabel('Temperature')
    plt.legend()
    plt.show()

def rmse_score(df1, df2):
    """Return the RMSE between the temperature columns of two datasets.

    Args:
        df1: Reference frame with a ``temperature`` column.
        df2: Frame to compare, with a ``temperature`` column.

    Returns:
        The square root of the mean squared difference between the two
        columns, as a float. Rows are paired by index label; a pair in which
        either side is missing is left out of the mean.
    """
    squared_error = (df1['temperature'] - df2['temperature']) ** 2
    # Series.mean() skips NaN, so gaps in either frame do not turn the whole
    # score into NaN.
    return sqrt(squared_error.mean())

Perform univariate imputation on temperature using Pandas (mean, ffill, bfill), Scikit-Learn (SimpleImputer), and interpolation. Compare the RMSE score of each imputation against machine_temp_failure_original.csv and find the method with the least RMSE score.


In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
# --- pandas imputation -------------------------------------------------------
temperature = df['temperature']
df['temperature_mean'] = temperature.fillna(temperature.mean())
# Series.ffill()/bfill() replace fillna(method=...), which is deprecated.
df['temperature_ffill'] = temperature.ffill()
df['temperature_bfill'] = temperature.bfill()

# --- scikit-learn imputation -------------------------------------------------
# SimpleImputer has no forward/backward-fill strategy, and a 'constant'
# imputer whose fill_value is NaN replaces NaN with NaN, i.e. imputes nothing.
# The statistic-based strategies it does offer are used instead.
for strategy in ('mean', 'median', 'most_frequent'):
    imputer = SimpleImputer(strategy=strategy)
    # fit_transform returns an (n_rows, 1) array; flatten it to one column.
    df[f'temperature_imputed_{strategy}'] = imputer.fit_transform(df[['temperature']]).ravel()

# --- interpolation (pandas default: linear) ----------------------------------
df['temperature_interpolated'] = temperature.interpolate()
print(df.head())


from sklearn.metrics import mean_squared_error

# Ground truth: the complete series from the original dataset. Imputed values
# can only be judged against readings that were never missing; scoring against
# df['temperature'] and dropping its NaN rows would discard exactly the imputed
# rows and report an RMSE of 0 for every method.
original_path = Path('/Users/Himanshu/Desktop/machine_temp_failure_original.csv')
original = pd.read_csv(original_path).rename(columns={'value': 'temperature'})['temperature']
# NOTE(review): assumes the original file has the same rows, in the same order,
# as the missing-value file loaded into df — confirm.


def _rmse_against_original(imputed):
    """Return the RMSE between *imputed* and the original temperature series."""
    # Pair the values by position; the DataFrame constructor raises if the two
    # series differ in length. Rows still missing after imputation (e.g. a
    # leading gap after ffill) are dropped because mean_squared_error rejects NaN.
    paired = pd.DataFrame({
        'original': original.to_numpy(),
        'imputed': np.asarray(imputed).ravel(),
    }).dropna()
    return sqrt(mean_squared_error(paired['original'], paired['imputed']))


def _sklearn_impute(strategy):
    """Return df['temperature'] imputed by SimpleImputer with *strategy*."""
    return SimpleImputer(strategy=strategy).fit_transform(df[['temperature']])


# SimpleImputer offers no forward/backward fill, and a 'constant' imputer with
# fill_value=NaN leaves every gap unfilled, so the scikit-learn candidates are
# its statistic-based strategies.
candidates = {
    'Mean': df['temperature_mean'],
    'Forward Fill': df['temperature_ffill'],
    'Backward Fill': df['temperature_bfill'],
    'Mean Imputer': _sklearn_impute('mean'),
    'Median Imputer': _sklearn_impute('median'),
    'Most Frequent Imputer': _sklearn_impute('most_frequent'),
    'Interpolation': df['temperature_interpolated'],
}

rmse_scores = {
    method: _rmse_against_original(values)
    for method, values in candidates.items()
}

# On a tie, min() keeps the first method in insertion order.
method_with_least_rmse = min(rmse_scores, key=rmse_scores.get)
least_rmse_score = rmse_scores[method_with_least_rmse]

print("RMSE Scores:")
for method, rmse in rmse_scores.items():
    print(f"{method}: {rmse}")

print(f"\nMethod with the least RMSE score: {method_with_least_rmse} (RMSE: {least_rmse_score})")



Now use the dataset with the least RMSE score to check the frequency of the data, and resample the dataset to an hourly frequency.

In [None]:
# Parse the day-first timestamps and index the frame by time; infer_freq() and
# resample() both work on a DatetimeIndex.
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%d-%m-%Y %H:%M")
df.set_index('timestamp', inplace=True)

# Check the frequency of the data as recorded, i.e. before resampling. The
# resampled index is hourly by construction, so inferring from it would say
# nothing about the source. infer_freq() returns None when the spacing is
# irregular.
frequency = pd.infer_freq(df.index)
print("Data frequency:", frequency)

# Aggregate to one mean value per hour. Lowercase 'h' is the hourly alias;
# uppercase 'H' is deprecated in recent pandas releases.
df_resampled = df.resample('h').mean()
print("Resampled frequency:", pd.infer_freq(df_resampled.index))

print("\nResampled Dataset:")
print(df_resampled.head())


Draw box plot, boxen plot, lag plot and write your conclusions about outliers (in your own language). Define iqr_outliers() function for this data, and find the outliers. Compute z-score and plot z-score as discussed in the notebooks. 

In [None]:
import seaborn as sns

# Box plot of the temperature readings.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, y='temperature')
plt.title('Box Plot - Temperature')
plt.show()

# Boxen (letter-value) plot of the same column.
plt.figure(figsize=(8, 6))
sns.boxenplot(data=df, y='temperature')
plt.title('Boxen Plot - Temperature')
plt.show()

# Lag plot: each reading against the one that follows it. seaborn has no
# lag_plot(); the function lives in pandas.plotting and, when no axes are
# passed, draws on the current axes — here the figure created just above.
plt.figure(figsize=(8, 6))
pd.plotting.lag_plot(df['temperature'])
plt.title('Lag Plot - Temperature')
plt.show()


def iqr_outliers(data):
    """Return the values of *data* lying outside the 1.5 * IQR fences.

    Args:
        data: One-dimensional numeric data that supports boolean-mask
            indexing, such as a pandas Series or a NumPy array. Missing
            values (NaN) are ignored when the quartiles are computed.

    Returns:
        The elements of *data* below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR``, selected from *data* with a boolean mask.
    """
    # np.percentile yields NaN as soon as one value is missing, which makes
    # both fences NaN and every comparison False; nanpercentile skips the gaps.
    q1, q3 = np.nanpercentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return data[(data < lower_bound) | (data > upper_bound)]

# Readings that fall outside the IQR fences.
outliers_iqr = iqr_outliers(df['temperature'])
print("Outliers (IQR method):", outliers_iqr, sep="\n")


# Standardise the readings: distance from the mean in units of the standard
# deviation.
readings = df['temperature']
z_scores = (readings - readings.mean()) / readings.std()

# Z-score series with a dashed red reference line at zero.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(z_scores)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_title('Z-Score Plot')
ax.set_xlabel('Index')
ax.set_ylabel('Z-Score')
plt.show()