In [None]:
import pandas as pd
from pandas.plotting import lag_plot
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
from sklearn.impute import SimpleImputer
from scipy import interpolate
import seaborn as sns

Read the machine_temp_failure_missing.csv dataset into a Pandas DataFrame, must use PathLib. Inspect the dataset, and look for missing values if there are any. Find statistics such as mean, max, min. Rename the value column to temperature. 

In [None]:

filepath = Path('/Users/Himanshu/Desktop/machine_temp_failure_missing.csv')
df = pd.read_csv(filepath)
missing_values_count = df.isnull().sum()
print(df)
print(missing_values_count)
print(df.value.min())
print(df.value.max())
print(df.value.mean())
df = df.rename(columns={'value':'temperature'})
print(df)




Create a new feature delta_temperature which would be temp[i+1] - temp[i] where i is in range(Index). The feature represents the difference between consecutive temperature values. 


In [None]:


df['next_temperature'] = df['temperature'].shift(1)
df['delta_temperature'] = df['temperature'] - df['next_temperature']
df = df.drop('next_temperature', axis=1)
print(df.head())

Generate two separate plots for temperature and delta_temperature using  matplotlib.

In [None]:

plt.plot(df.index, df['temperature'])
plt.xlabel('index')
plt.ylabel('temperature')
plt.title('Temperature data')
plt.show()


plt.plot(df.index, df['delta_temperature'])
plt.xlabel('index')
plt.ylabel('delta_temperature')
plt.title('Delta_temperature data')
plt.show()

Generate a plot in which temperature values greater than 70.00 are marked as red, and those below 70.00 are marked as blue. 


In [None]:
mask = df['temperature'] > 70.00
plt.scatter(df.index, df['temperature'], c=mask, cmap='bwr')
plt.xlabel('Index')
plt.ylabel('Temperature')
plt.title('Temperature Plot (Red: >70.00, Blue: <=70.00)')
plt.show()

Write three functions read_datasets(), plot_dfs(), rmse_score() for these datasets (as discussed in 1_4 notebook).

In [None]:


def read_datasets():
    df1 = pd.read_csv('/Users/Himanshu/Desktop/machine_temp_failure_original.csv')
    df2 = pd.read_csv('/Users/Himanshu/Desktop/machine_temp_failure_missing.csv')
    return df1, df2

def plot_dfs(df1, df2):
    plt.plot(df1['timestamp'], df1['temperature'], label='Dataset 1')
    plt.plot(df2['timestamp'], df2['temperature'], label='Dataset 2')
    plt.xlabel('Timestamp')
    plt.ylabel('Temperature')
    plt.title('Temperature Datasets')
    plt.legend()
    plt.show()

def rmse_score(df1, df2, col=None):
    df_missing = df2.rename(columns={col: 'missing'})
    columns = df_missing.loc[:, 'missing':].columns.tolist()
    scores = []
    for comp_col in columns[1:]:
        rmse = np.sqrt(np.mean((df1[col] - df_missing[comp_col])**2))
        scores.append(rmse)
        print(f'RMSE for {comp_col}: {rmse}')
    return scores

Perform univariate imputation on temperature and  using Pandas (mean, ffill, bfill) and Scikit-Learn (SimpleImputer), and interpolation. Compare the RMSE score for each imputation with machine_temp_failure_original.csv  and find the method with the least RMSE score.


In [None]:


filepath1 = Path('./machine_temp_failure_original.csv')
filepath2 = Path('./machine_temp_failure_missing.csv')
df1 = pd.read_csv(filepath1, parse_dates=['timestamp'])
df2 = pd.read_csv(filepath2, parse_dates=['timestamp'])
df1=df1.rename(columns={'value':'temperature'})
df2=df2.rename(columns={'value':'temperature'})



df2['ffill'] = df2['temperature'].fillna(method='ffill')
df2['bfill'] = df2['temperature'].fillna(method='bfill')
df2['mean'] = df2['temperature'].fillna(df2['temperature'].mean())

imputer = SimpleImputer(strategy='mean')
df2['imputed'] = imputer.fit_transform(df2[['temperature']])

df2['interpolated'] = df2['temperature'].interpolate()

methods=['ffill','bfill','mean','imputed','interpolated']

rmse = rmse_score(df1, df2,'temperature')

for x in range(0,5):
    if(rmse[x]==min(rmse)):
        min_index=x

print("Method with minimum rmse is "+methods[min_index]+" which has rmse " +str(min(rmse)))



Now the dataset with the least RMSE score  to check the frequency of data and resample the dataset to an hourly frequency.

In [None]:
df1.set_index('timestamp', inplace=True)

time_diff = df.index.to_series().diff()
data_freq = time_diff.value_counts().idxmax()

print("Current data frequency:", data_freq)

df_hourly = df1.resample('H').mean()

print(df_hourly)
df1=df_hourly.copy()


Draw box plot, boxen plot, lag plot and write your conclusions about outliers (in your own language). Define iqr_outliers() function for this data, and find the outliers. Compute z-score and plot z-score as discussed in the notebooks. 

In [None]:


plt.figure(figsize=(8, 6))
sns.boxplot(data=df1, y='temperature')
plt.title('Box Plot - Temperature')
plt.show()

plt.figure(figsize=(8, 6))
sns.boxenplot(data=df1, y='temperature')
plt.title('Boxen Plot - Temperature')
plt.show()

plt.figure(figsize=(8, 6))
lag_plot(df1['temperature'])
plt.title('Lag Plot - Temperature')
plt.show()


def iqr_outliers(data):
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers

outliers_iqr = iqr_outliers(df1['temperature'])
print("Outliers (IQR method):")
print(outliers_iqr)


z_scores = (df['temperature'] - df['temperature'].mean()) / df['temperature'].std()

plt.figure(figsize=(8, 6))
plt.plot(z_scores)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Z-Score Plot')
plt.xlabel('Index')
plt.ylabel('Z-Score')
plt.show()