In [None]:
from google.colab import files

# Opens Colab's upload widget; select the kaggle.json API token, which the
# following cells copy into ~/.kaggle/.
# The result is bound to a name instead of being left as the cell's last
# expression: the returned mapping holds the uploaded file contents, and
# echoing it would write the API key into the notebook's saved output.
uploaded = files.upload()
print("Uploaded:", ", ".join(sorted(uploaded)))

In [None]:
# Install the pinned Kaggle CLI. %pip (rather than !pip) installs into the
# environment of the running kernel.
%pip install -q kaggle==1.5.12

In [None]:
# Create the Kaggle config directory; -p makes this a no-op (instead of an
# error) when the directory already exists, so the cell can be re-run.
!mkdir -p ~/.kaggle

In [None]:
# Place the uploaded API token in the Kaggle config directory.
!cp kaggle.json ~/.kaggle/kaggle.json


In [None]:
# Owner may read/write the token; group and others get no access (mode 600).
!chmod u=rw,go= ~/.kaggle/kaggle.json

In [None]:
# Fetch the dataset archive (autompg-dataset.zip) into the working directory.
!kaggle datasets download -d uciml/autompg-dataset


In [None]:
# Extract the archive. -o overwrites existing files without asking, so a
# re-run does not stop at unzip's interactive "replace?" prompt.
!unzip -o autompg-dataset.zip


Exploratory Data Analysis of the UCI Auto MPG dataset

Initial inspection of the dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the extracted CSV into a DataFrame and preview its first rows.
data = pd.read_csv('auto-mpg.csv')
data.head()

In [None]:
# Display pandas' summary statistics for the loaded DataFrame.
data.describe()

In [None]:
# Show column names, non-null counts and dtypes.
# info() must be *called*; print(data.info) only prints the bound method's
# repr. info() writes its report itself, so no print() wrapper is needed.
data.info()

Data Cleaning

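Convert `horsepower` to numeric (entries that cannot be parsed become NaN) and fill the missing values with the column mean.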

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Start again from the raw CSV so this cell does not depend on earlier state.
data = pd.read_csv('auto-mpg.csv')

# Anything in 'horsepower' that cannot be parsed as a number becomes NaN.
data['horsepower'] = pd.to_numeric(data['horsepower'], errors='coerce')

# Double brackets keep the column two-dimensional (n_rows, 1), the layout the
# imputer expects.
horsepower_data = data[['horsepower']].to_numpy()

# Fill the NaN entries with the mean of the observed horsepower values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
horsepower_data = imputer.fit_transform(horsepower_data)
data['horsepower'] = horsepower_data

In [None]:
# Print the DataFrame's column summary (non-null counts and dtypes) after the
# cleaning step above.
data.info()

Outlier detection

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

# Tunable settings for the detector.
CONTAMINATION = 0.05  # contamination value passed to IsolationForest; adjust as needed
RANDOM_STATE = 42     # fixed seed so the same rows are flagged on every run

# Load the dataset from the raw CSV (this cell does not reuse earlier cleaning).
data = pd.read_csv('auto-mpg.csv')

# Numerical features used for outlier detection.
numerical_features = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year']

# Handle missing values: '?' placeholders become NaN, every feature is cast to
# float, and remaining NaNs are replaced by the column mean.
data[numerical_features] = data[numerical_features].replace('?', np.nan)
data[numerical_features] = data[numerical_features].astype(float)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data[numerical_features] = imputer.fit_transform(data[numerical_features])

# --- Outlier Detection using Isolation Forest ---

# Without random_state the forest is built from a different random draw on each
# run, so the set of flagged rows would change between executions.
model_iso = IsolationForest(contamination=CONTAMINATION, random_state=RANDOM_STATE)
outliers_iso = model_iso.fit_predict(data[numerical_features])
data['outlier_iso'] = outliers_iso  # rows labelled -1 are reported as outliers below

# --- Visualization ---

# Box plots for each feature (2 x 4 grid holds the 7 features).
plt.figure(figsize=(12, 6))
for i, feature in enumerate(numerical_features):
    plt.subplot(2, 4, i+1)
    sns.boxplot(x=data[feature])
    plt.title(feature)
plt.tight_layout()
plt.show()

# Scatter plot of one feature pair (example), coloured by the detector's label.
plt.scatter(data['horsepower'], data['mpg'], c=data['outlier_iso'], cmap='viridis')
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Outliers detected by Isolation Forest')
plt.colorbar()
plt.show()

# --- Print Outlier Indices ---

# np.where returns positional row indices of the rows labelled -1.
print("Outliers detected by Isolation Forest:")
print(np.where(data['outlier_iso'] == -1)[0])