<a href="https://colab.research.google.com/github/AbhinavCurseOK/DNN_ANN/blob/main/DNN_auto_mpg_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Some Kaggle data sources are private, so authenticate before downloading.
# Run this cell first so the Kaggle data sources can be imported.
import kagglehub
kagglehub.login()


In [None]:
# Download the 'mlabhinavkumar/auto-mpg' Kaggle dataset and keep the value
# returned by kagglehub (presumably the local directory holding the files).
# NOTE: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
# Requires the `kagglehub` import from the login cell.

mlabhinavkumar_auto_mpg_path = kagglehub.dataset_download('mlabhinavkumar/auto-mpg')

print('Data source import complete.')


In [None]:
import os

import pandas as pd

# The raw file has no header row; these are its columns in file order.
column_names = [
    "mpg", "cylinders", "displacement", "horsepower",
    "weight", "acceleration", "model_year", "origin", "car_name"
]

# Resolve the file relative to the directory kagglehub downloaded to, rather
# than hard-coding the Kaggle-only "/kaggle/input/..." mount point.
data_file = os.path.join(mlabhinavkumar_auto_mpg_path, "auto-mpg.data")

# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
# "?" is the file's marker for a missing value.
df = pd.read_csv(data_file, sep=r"\s+", names=column_names, na_values="?")
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
import numpy as np

# Treat empty or whitespace-only strings as missing values.
# Rebind the result instead of mutating with inplace=True so the cell has
# explicit data lineage.
df = df.replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Remember which row labels have no horsepower value so the same rows can be
# inspected again after imputation.
missing_horsepower = df['horsepower'].isnull().to_numpy()
nan_indices = df.index[missing_horsepower]
print("Indices of rows with NaN values in 'horsepower':")
print(nan_indices)

In [None]:
# Show the full rows whose horsepower is missing (before imputation).
nan_rows = df.loc[nan_indices]
print("Features of rows with NaN values in 'horsepower':")
nan_rows

In [None]:
from sklearn.impute import KNNImputer

# Fill missing horsepower values with a 5-nearest-neighbour imputer computed
# over the four engine/weight features plus horsepower itself.
# NOTE(review): the columns are passed unscaled; confirm that is intended,
# since neighbour distances then depend on each column's raw magnitude.
features = ['cylinders', 'displacement', 'weight', 'acceleration']
impute_columns = [*features, 'horsepower']
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(df[impute_columns])
df[impute_columns] = imputed_values

In [None]:
# Re-display the same rows to check the values written by the imputer.
nan_rows = df.loc[nan_indices]
print("Features of rows after replacing NaN values in 'horsepower':")
nan_rows

In [None]:
# The free-text car name is not used as a model input.
df = df.drop(columns=['car_name'])

In [None]:
# Flag horsepower values outside the 1.5 * IQR fences.
horsepower = df['horsepower']
Q1 = horsepower.quantile(0.25)
Q3 = horsepower.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outside_fences = (horsepower < lower_bound) | (horsepower > upper_bound)
outliers = horsepower[outside_fences]
print("Outliers in 'horsepower':")
print(outliers)

In [None]:
# Count, for every numeric column, the rows outside its 1.5 * IQR fences.
numerical_features = df.select_dtypes(include=['number']).columns
for feature in numerical_features:
    values = df[feature]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    print(f"Feature: {feature}  ->   Number of Outliers: {len(outliers)}")

In [None]:
# Drop rows whose mpg lies outside the 1.5 * IQR fences.
Q1 = df['mpg'].quantile(0.25)
Q3 = df['mpg'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
lower_bound = Q1 - 1.5 * IQR
# .copy() makes the filtered frame independent of the original, so the later
# df.loc[...] = ... assignments write to it without SettingWithCopyWarning.
# The original row labels are kept (no reset_index) because later cells
# address rows by label.
df = df[(df['mpg'] >= lower_bound) & (df['mpg'] <= upper_bound)].copy()
print("Shape of DataFrame after removing 'mpg' outliers:", df.shape)

In [None]:
# Summary statistics of horsepower after imputation and mpg filtering.
df['horsepower'].describe()

In [None]:
# Median horsepower of the current frame.
print(df['horsepower'].median())

In [None]:
def detect_outliers_iqr_with_index(data, column):
    """Print the IQR outliers of ``data[column]`` and return their index labels.

    A value is an outlier when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``. ``data`` is not modified.

    Args:
        data: DataFrame containing ``column``.
        column: Name of the numeric column to inspect.

    Returns:
        The index of the outlier rows (empty when there are none).
    """
    series = data[column]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5 * spread
    low_fence = first_quartile - 1.5 * spread

    too_low = series < low_fence
    too_high = series > high_fence
    flagged = data[too_low | too_high]

    print(f"Outliers in '{column}':")
    for label, record in flagged.iterrows():
        print(f"Index: {label}, {column}: {record[column]}")
    return flagged.index

# Record the row labels of the horsepower and acceleration outliers; later
# cells use these labels to overwrite and re-inspect the same rows.
horsepower_outlier_indices = detect_outliers_iqr_with_index(df, 'horsepower')
acceleration_outlier_indices = detect_outliers_iqr_with_index(df, 'acceleration')

# Display the full rows flagged for horsepower.
horsepower_outlier_rows = df.loc[horsepower_outlier_indices]
print("\nFeatures of rows with horsepower outliers:")
horsepower_outlier_rows

In [None]:
# Display the full rows flagged for acceleration.
acceleration_outlier_rows = df.loc[acceleration_outlier_indices]
print("\nFeatures of rows with acceleration outliers:")
acceleration_outlier_rows

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Predictor columns for the two helper regressions.
# NOTE(review): 'mpg' is used as a predictor here and is later the prediction
# target of the neural networks; this happens before the train/test split.
# Confirm that this target leakage is acceptable.
acceleration_predictors = ['cylinders', 'displacement', 'horsepower', 'weight', 'mpg']
horsepower_predictors = ['cylinders', 'displacement', 'acceleration', 'weight', 'mpg']

# Fit BOTH models before any value is overwritten, so each is trained on the
# same frame state (outliers still present).
model_acceleration = LinearRegression()
X_acceleration = df[acceleration_predictors]
y_acceleration = df['acceleration']
model_acceleration.fit(X_acceleration, y_acceleration)

model_horsepower = LinearRegression()
X_horsepower = df[horsepower_predictors]
y_horsepower = df['horsepower']
model_horsepower.fit(X_horsepower, y_horsepower)


def _replace_with_prediction(frame, model, predictors, target, row_labels):
    """Overwrite ``frame.loc[row_labels, target]`` with ``model`` predictions.

    All flagged rows are predicted in one call instead of one row at a time.
    Nothing happens when ``row_labels`` is empty, because predicting on zero
    rows would raise.
    """
    if len(row_labels) == 0:
        return
    frame.loc[row_labels, target] = model.predict(frame.loc[row_labels, predictors])


# Acceleration is replaced first, so the horsepower predictions below read the
# already-corrected acceleration values.
_replace_with_prediction(
    df, model_acceleration, acceleration_predictors,
    'acceleration', acceleration_outlier_indices,
)
_replace_with_prediction(
    df, model_horsepower, horsepower_predictors,
    'horsepower', horsepower_outlier_indices,
)

In [None]:
# Re-display the rows that were flagged as acceleration outliers, now holding
# the values written by the linear-regression replacement.
acceleration_outlier_rows = df.loc[acceleration_outlier_indices]
print("\nRows with acceleration first_run_LR model to update  outliers:")
acceleration_outlier_rows

In [None]:
# Re-display the rows that were flagged as horsepower outliers, now holding
# the values written by the linear-regression replacement.
horsepower_outlier_rows = df.loc[horsepower_outlier_indices]
print("\nRows with horsepower first_run_LR model to update  outliers:")
horsepower_outlier_rows

In [None]:
# Recount the IQR outliers per numeric column after the replacements above.
numerical_features = df.select_dtypes(include=['number']).columns
for feature in numerical_features:
    column_values = df[feature]
    Q1 = column_values.quantile(0.25)
    Q3 = column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    is_outlier = (column_values < lower_bound) | (column_values > upper_bound)
    outliers = df[is_outlier]
    print(f"Feature: {feature}  ->   Number of Outliers: {len(outliers)}")

In [None]:
def detect_outliers_iqr_with_index(data, column):
    """Replace the IQR outliers of ``data[column]`` with the column median.

    NOTE: this reuses the name of the earlier reporting helper and shadows it
    from this cell onwards. Unlike that helper, it mutates ``data`` in place
    and returns ``None``.

    Args:
        data: DataFrame containing ``column``; modified in place.
        column: Name of the numeric column to clean.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    high_fence = third_quartile + 1.5 * spread
    low_fence = first_quartile - 1.5 * spread
    hits = data[(values < low_fence) | (values > high_fence)]

    if len(hits) == 0:
        print(f"No outliers found in '{column}'")
        return

    # The median is taken over the whole column, outliers included.
    median_value = data[column].median()
    data.loc[hits.index, column] = median_value
    print(f"Outliers in '{column}' replaced with median: {median_value}")
# Replace any remaining acceleration outliers with the column median
# (modifies df in place; the call itself returns None).
detect_outliers_iqr_with_index(df, 'acceleration')

In [None]:
# Target is mpg; every remaining column is an input feature.
y = df['mpg']
X = df.drop(columns=['mpg'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the input features (target excluded).
correlation_matrix = X.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap of Features")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits so no test information reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf

In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are repeatable under Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Baseline regressor: 64 -> 32 -> 8 ReLU units, one linear output.
# The input width is declared with an explicit Input object, which is the
# recommended form for Sequential models (passing input_shape= to the first
# layer triggers a warning in current Keras).
model_t = tf.keras.models.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [None]:
# Print the layer-by-layer summary of the baseline model.
model_t.summary()

In [None]:
# Adam optimizer with a learning rate of 0.0005 for the baseline model.
learning_rate = 0.0005
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
# Compile the model: optimise MSE, and also track MAE and MSE as metrics.
model_t.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mae', 'mse'],
)

In [None]:
# Train the model for 50 epochs with mini-batches of 16 samples.
model_t.fit(X_train, y_train, epochs=50, batch_size=16)

In [None]:
# Evaluate the model on the held-out test split.
# The model is compiled with metrics=['mae', 'mse'], so evaluate() returns
# [loss, mae, mse]; unpack it instead of printing the whole list under a
# single "Mean Squared Error" label.
loss, mae, mse = model_t.evaluate(X_test, y_test)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

In [None]:
# Predict mpg for the test split with the baseline model.
predictions = model_t.predict(X_test)

In [None]:
# Range of the target, useful for judging the size of the reported errors.
mpg_values = df['mpg']
max_mpg, min_mpg = mpg_values.max(), mpg_values.min()

print("Maximum mpg:", max_mpg)
print("Minimum mpg:", min_mpg)

In [None]:
# Fresh Adam optimizer with a higher learning rate (0.006) for the next model.
learning_rate = 0.006
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
# Same 64 -> 32 -> 8 -> 1 architecture as the baseline, trained with the
# higher learning rate. The input width is declared with an explicit Input
# object instead of input_shape= on the first layer (warned about in current
# Keras).
model_n = tf.keras.models.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [None]:
# Compile with MSE loss (tracking MAE and MSE), then train for 50 epochs with
# mini-batches of 16 samples.
model_n.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mae', 'mse'],
)
model_n.fit(X_train, y_train, batch_size=16, epochs=50)

In [None]:
# The model is compiled with metrics=['mae', 'mse'], so evaluate() returns
# [loss, mae, mse]; unpack it so the printed label matches the value.
loss, mae, mse = model_n.evaluate(X_test, y_test)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

In [None]:
# Predict mpg for the test split with model_n (overwrites `predictions`).
predictions = model_n.predict(X_test)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense

# 64 -> 32 -> 8 -> 1 regressor built layer by layer.
# (The original created the Sequential container twice; once is enough.)
model_1 = tf.keras.models.Sequential()
# Declare the input width with an explicit Input object rather than
# input_shape= on the first Dense layer (warned about in current Keras).
model_1.add(tf.keras.Input(shape=(X_train.shape[1],)))
model_1.add(Dense(64, activation='relu'))
model_1.add(Dense(32, activation='relu'))
model_1.add(Dense(8, activation='relu'))
model_1.add(Dense(1))

learning_rate = 0.006
optimizer = Adam(learning_rate=learning_rate)
model_1.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae', 'mse'])
model_1.fit(X_train, y_train, epochs=50, batch_size=16)

In [None]:
# Capture this model's own evaluation result. The original discarded it and
# printed the stale `loss` left over from an earlier model.
# evaluate() returns [loss, mae, mse] for the compiled metrics.
loss, mae, mse = model_1.evaluate(X_test, y_test)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

In [None]:
# (samples, features) of the scaled training matrix.
X_train.shape

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense

# Variant with one extra hidden layer: 64 -> 32 -> 16 -> 8 -> 1.
model_2 = tf.keras.models.Sequential()
# Declare the input width with an explicit Input object rather than
# input_shape= on the first Dense layer (warned about in current Keras).
model_2.add(tf.keras.Input(shape=(X_train.shape[1],)))
model_2.add(Dense(64, activation='relu'))
model_2.add(Dense(32, activation='relu'))
model_2.add(Dense(16, activation='relu'))
model_2.add(Dense(8, activation='relu'))
model_2.add(Dense(1))

learning_rate = 0.006
optimizer = Adam(learning_rate=learning_rate)
model_2.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae', 'mse'])
model_2.fit(X_train, y_train, epochs=50, batch_size=16)

In [None]:
# Capture this model's own evaluation result. The original discarded it and
# printed the stale `loss` left over from an earlier model.
# evaluate() returns [loss, mae, mse] for the compiled metrics.
loss, mae, mse = model_2.evaluate(X_test, y_test)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import RMSprop, SGD, Adadelta, Adagrad, Adamax, Nadam

# Deeper variant (128-128-64-64-32-32-16-8-1) with L2 regularisation on the
# first layer, trained with Nadam.
# NOTE: this rebinds `model_2`, replacing the shallower model of that name.
model_2 = tf.keras.models.Sequential()
# Declare the input width with an explicit Input object rather than
# input_shape= on the first Dense layer (warned about in current Keras).
model_2.add(tf.keras.Input(shape=(X_train.shape[1],)))
model_2.add(Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.006)))
model_2.add(Dense(128, activation='relu'))
model_2.add(Dense(64, activation='relu'))
model_2.add(Dense(64, activation='relu'))
model_2.add(Dense(32, activation='relu'))
model_2.add(Dense(32, activation='relu'))
model_2.add(Dense(16, activation='relu'))
model_2.add(Dense(8, activation='relu'))
model_2.add(Dense(1))

learning_rate = 0.002
optimizer = Nadam(learning_rate=learning_rate)
model_2.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae', 'mse'])
model_2.fit(X_train, y_train, epochs=50, batch_size=16)

In [None]:
# Capture this model's own evaluation result. The original discarded it and
# printed the stale `loss` left over from an earlier model.
# evaluate() returns [loss, mae, mse] for the compiled metrics. Report the
# `mse` metric rather than `loss`: when a kernel regularizer is attached, the
# loss value would also include the regularisation penalty.
loss, mae, mse = model_2.evaluate(X_test, y_test)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

In [None]:
# Predict mpg for the test split with the current model_2; the plots and the
# results table below use these predictions.
predictions = model_2.predict(X_test)

In [None]:
# Peek at the first five predictions.
print("Predictions:", predictions[:5])

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Predicted vs. actual mpg, with a red least-squares line fitted through the
# points.
predictions_1d = predictions.flatten()

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, predictions, alpha=0.5)
ax.set_xlabel("Actual MPG")
ax.set_ylabel("Predicted MPG")
ax.set_title("Scatter Plot of Predictions vs. Actual Values")

trend = np.poly1d(np.polyfit(y_test, predictions_1d, 1))
actual_sorted = np.unique(y_test)
ax.plot(actual_sorted, trend(actual_sorted), color='red')
plt.show()

In [None]:
# Residual plot: residual (actual - predicted) against the predicted value,
# with a dashed reference line at zero.
residuals = y_test - predictions.flatten()

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(predictions, residuals, alpha=0.5)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel("Predicted MPG")
ax.set_ylabel("Residuals")
ax.set_title("Residual Plot")
plt.show()

In [None]:
# Histogram of the residuals in 20 bins.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(residuals, bins=20)
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Residuals")
plt.show()

In [None]:
# Side-by-side table of predicted and actual mpg for the test rows.
comparison = {
    'Predicted MPG': predictions.flatten(),
    'Original MPG': y_test.values,
}
results_df = pd.DataFrame(comparison)
results_df.head()