In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the MPG table from disk into a DataFrame.
# NOTE(review): this cell reads "mpg1.csv" while the later cells read "mpg.csv" — confirm which file is intended.
df = pd.read_csv("mpg1.csv")

# Dimensions as a (rows, columns) tuple
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

# Data type of every column
column_types = df.dtypes
print("\nColumns and Data Types:\n", column_types)

# Descriptive statistics for all columns, numeric or not
summary_table = df.describe(include='all')
print("\nSummary:\n", summary_table)

In [None]:
# Reload the working copy of the data from disk.
df = pd.read_csv("mpg.csv")

# Per-column count of missing entries prior to imputation
missing_before = df.isnull().sum()
print(missing_before)

# Replace gaps in numeric columns with that column's mean;
# non-numeric columns are left as they are.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Persist the imputed table over the file it was read from.
# NOTE(review): this overwrites the input, so a later run of this cell no longer sees the original gaps.
df.to_csv("mpg.csv", index=False)

# Recount missing entries to confirm the fill took effect
missing_after = df.isnull().sum()
print(missing_after)


In [None]:
# Histograms (30 bins each) for two of the continuous columns
hist_columns = ['acceleration', 'weight']
hist_frame = df[hist_columns]
hist_frame.hist(bins=30, figsize=(10, 5))
plt.suptitle('Histograms of Continuous Variables')
plt.show()

In [None]:
# Distribution of horsepower drawn as a violin plot
violin_ax = sns.violinplot(x='horsepower', data=df)
violin_ax.set_title('Violin Plot of horsepower')
plt.show()

In [None]:
# Box plots of horsepower before and after IQR-based outlier removal

horsepower = df['horsepower']

# Box plot of the unfiltered column
before_ax = sns.boxplot(x=horsepower)
before_ax.set_title('Box Plot Before Outlier Treatment')
plt.show()

# Quartiles and the interquartile range (IQR)
Q1 = horsepower.quantile(0.25)
Q3 = horsepower.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR outside the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose horsepower lies within both fences (inclusive)
not_too_low = horsepower >= lower_bound
not_too_high = horsepower <= upper_bound
df_filtered = df[not_too_low & not_too_high]

# Box plot of the filtered column
after_ax = sns.boxplot(x=df_filtered['horsepower'])
after_ax.set_title('Box Plot After Outlier Treatment')
plt.show()


In [None]:
# Heatmap of pairwise correlations between the numeric attributes

# Select every numeric column for the correlation.
# 'number' is used instead of [float, int]: the builtin `int` resolves to a
# platform-dependent integer width, so integer columns stored at a different
# width can be silently left out of the selection.
numeric_df = df.select_dtypes(include='number')

# Calculate the correlation matrix (DataFrame.corr defaults to Pearson)
correlation_matrix = numeric_df.corr()

# Create the heatmap, annotating each cell with its coefficient to 2 decimals
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Heatmap of Attribute Relationships')
plt.show()

In [60]:
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("mpg.csv")

# Mean-impute missing numeric values before scaling, so this cell does not depend
# on an earlier cell having already rewritten "mpg.csv" with filled values.
# StandardScaler keeps NaNs in its output, so unfilled gaps would otherwise end up
# in the standardized file. This is a no-op when the file has no missing values.
df = df.fillna(df.mean(numeric_only=True))

# Columns to standardize
continuous_columns = ['mpg', 'horsepower', 'weight', 'acceleration']

# Initialize StandardScaler
scaler = StandardScaler()

# Standardize the continuous variables (zero mean, unit variance per column)
df[continuous_columns] = scaler.fit_transform(df[continuous_columns])

# Save the standardized data to a new CSV file
df.to_csv("standardized_mpg.csv", index=False)

# Print the first few rows to check the results
print("standardized data")
print(df.head())


        mpg  cylinders  displacement  horsepower    weight  acceleration  \
0 -0.706439          8         307.0     0.00000  0.630870     -1.295498   
1 -1.090751          8         350.0     0.00000  0.854333     -1.477038   
2 -0.706439          8         318.0     0.00000  0.550470     -1.658577   
3 -0.962647          8         304.0     0.00000  0.546923     -1.295498   
4 -0.834543          8         302.0     0.95031  0.565841     -1.840117   

   model_year origin                       name  
0          70    usa  chevrolet chevelle malibu  
1          70    usa          buick skylark 320  
2          70    usa         plymouth satellite  
3          70    usa              amc rebel sst  
4          70    usa                ford torino  
