### Perform various data preprocessing techniques like handling missing data and feature scaling.

In [34]:
import pandas as pd

# Read the automobile CSV (looked up relative to the working directory)
# into a DataFrame.
df = pd.read_csv("Automobile - Automobile.csv")

# Report the (rows, columns) shape, then preview the first five rows.
print("Shape of dataset:", df.shape)
display(df.head(5))


Shape of dataset: (398, 9)


Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,chevrolet chevelle malibu,18.0,8.0,307.0,130.0,3504.0,12.0,70,usa
1,buick skylark 320,15.0,8.0,350.0,165.0,3693.0,11.5,70,usa
2,plymouth satellite,18.0,8.0,318.0,150.0,3436.0,11.0,70,usa
3,amc rebel sst,16.0,8.0,304.0,150.0,3433.0,12.0,70,usa
4,ford torino,17.0,,302.0,140.0,3449.0,10.5,70,usa


In [35]:
# Count the missing (NaN/None) entries in every column.
# sep="\n" puts the Series on its own line. Previously the "\n" lived inside
# the label, so print's default " " separator indented the first row of the
# report by one space and broke the column alignment.
print("Missing values per column:", df.isnull().sum(), sep="\n")

# Optionally, show every row that has at least one missing value.
display(df[df.isnull().any(axis=1)])


Missing values per column:
 name             0
mpg              0
cylinders        3
displacement     3
horsepower      12
weight           2
acceleration     3
model_year       0
origin           0
dtype: int64


Unnamed: 0,name,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
4,ford torino,17.0,,302.0,140.0,3449.0,10.5,70,usa
6,chevrolet impala,14.0,8.0,454.0,220.0,,9.0,70,usa
8,pontiac catalina,14.0,8.0,455.0,,4425.0,10.0,70,usa
12,chevrolet monte carlo,15.0,,400.0,150.0,3761.0,9.5,70,usa
15,plymouth duster,22.0,6.0,198.0,95.0,2833.0,,70,usa
19,volkswagen 1131 deluxe sedan,26.0,4.0,97.0,,1835.0,20.5,70,europe
23,bmw 2002,26.0,4.0,121.0,113.0,2234.0,,70,europe
32,ford pinto,25.0,4.0,98.0,,2046.0,19.0,71,usa
126,ford maverick,21.0,6.0,200.0,,2875.0,17.0,74,usa
127,amc hornet,19.0,6.0,232.0,,2901.0,16.0,74,usa


In [36]:
# Fill missing values in every numeric column with that column's mean.
# NOTE(review): this includes discrete columns such as `cylinders`, so an
# imputed value can be fractional -- confirm that is acceptable here.
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# For categorical (object-dtype) columns, fill with the most frequent value.
# mode() may return several rows when values tie; the first row is used.
cat_cols = df.select_dtypes(include='object').columns
cat_modes = df[cat_cols].mode()
# With no object columns (or an empty frame) mode() has no rows and
# .iloc[0] would raise IndexError, so only fill when a mode exists.
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

# Check again for missing values.
# sep="\n" keeps the first row of the Series aligned with the rest (the
# default " " separator after an embedded "\n" indented it by one space).
print("Missing values after filling:", df.isnull().sum(), sep="\n")


Missing values after filling:
 name            0
mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64


In [37]:
# Save cleaned DataFrame to a new CSV; index=False keeps the row index out
# of the file.
cleaned_file = "Automobile_Cleaned.csv"
df.to_csv(cleaned_file, index=False)
print(f"Cleaned CSV saved as {cleaned_file}")

# Optional: download the cleaned CSV to your computer.
# The google.colab package is only importable inside Google Colab, so the
# import is guarded: on any other kernel the file has already been written
# above and the cell should not abort with ModuleNotFoundError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; skipping browser download.")
else:
    files.download(cleaned_file)


Cleaned CSV saved as Automobile_Cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Step 1: Start by importing the necessary Python libraries for data preprocessing.


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#### Step 2: Load the placement dataset into a pandas DataFrame.

In [None]:
# Read the placement records. This rebinds `df`, replacing the automobile
# frame that the earlier cells worked on.
df = pd.read_csv("data.csv")

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,sl_no,gender,hsc_p,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,salary
0,1,M,91.0,Commerce,58.0,Sci&Tech,55.0,Mkt&HR,58.8,270000.0
1,2,M,78.33,Science,77.48,Sci&Tech,86.5,Mkt&Fin,66.28,200000.0
2,3,M,,Arts,64.0,Comm&Mgmt,75.0,Mkt&Fin,57.8,250000.0
3,4,M,52.0,Science,,Sci&Tech,66.0,Mkt&HR,59.43,
4,5,M,73.6,Commerce,73.3,Comm&Mgmt,96.8,Mkt&Fin,55.5,425000.0
5,6,M,49.8,Science,67.25,Sci&Tech,55.0,Mkt&Fin,51.58,
6,7,F,49.2,Commerce,79.0,Comm&Mgmt,74.28,Mkt&Fin,53.29,
7,8,M,64.0,Science,66.0,Sci&Tech,67.0,Mkt&Fin,62.14,252000.0
8,9,M,79.0,Commerce,72.0,Comm&Mgmt,91.34,Mkt&Fin,61.29,231000.0
9,10,M,70.0,Commerce,61.0,Comm&Mgmt,54.0,Mkt&Fin,52.21,


#### Step 3: Take a quick look at the data to understand its structure and identify any missing values or anomalies.

In [None]:
# Print a structural summary: index range, per-column non-null counts and
# dtypes, and memory usage. Columns whose non-null count is below the total
# number of entries contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   hsc_p           210 non-null    float64
 3   hsc_s           215 non-null    object 
 4   degree_p        213 non-null    float64
 5   degree_t        215 non-null    object 
 6   etest_p         211 non-null    float64
 7   specialisation  215 non-null    object 
 8   mba_p           214 non-null    float64
 9   salary          148 non-null    float64
dtypes: float64(5), int64(1), object(4)
memory usage: 16.9+ KB


#### The method isnull() checks each element in the DataFrame (or Series) to see if it is NaN (Not a Number) or None (missing value).
It returns a DataFrame (or Series) of the same shape as the input, with Boolean values:
#### True: The value is null (NaN or None).
#### False: The value is not null.

In [None]:
# isnull() yields a Boolean frame of the same shape; sum() then adds up the
# True values column by column, giving the number of missing entries per column.
df.isnull().sum()

sl_no              0
gender             0
hsc_p              5
hsc_s              0
degree_p           2
degree_t           0
etest_p            4
specialisation     0
mba_p              1
salary            67
dtype: int64

#### Step 4: Handle Missing Data
#### Option 1: If the dataset is large and only a small percentage of data is missing, you can remove rows with missing values.


In [None]:
# Remove, in place, every row whose `salary` value is missing. The surviving
# rows keep their original index labels, so the index is no longer contiguous.
df.dropna(axis="index", subset=["salary"], inplace=True)

# Re-check the non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 0 to 213
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           148 non-null    int64  
 1   gender          148 non-null    object 
 2   hsc_p           146 non-null    float64
 3   hsc_s           148 non-null    object 
 4   degree_p        147 non-null    float64
 5   degree_t        148 non-null    object 
 6   etest_p         146 non-null    float64
 7   specialisation  148 non-null    object 
 8   mba_p           148 non-null    float64
 9   salary          148 non-null    float64
dtypes: float64(5), int64(1), object(4)
memory usage: 12.7+ KB


#### Option 2: If removing data isn't ideal, you can impute (fill in) missing values using methods like mean, median, or most frequent.

In [None]:
# Fill the remaining gaps in the percentage columns with each column's mean.
#
# The result is assigned back to `df` rather than calling
# df["col"].fillna(..., inplace=True): that form mutates an intermediate
# Series (chained assignment), which raises a FutureWarning in pandas 2.x
# and no longer updates `df` at all under Copy-on-Write (pandas 3.0).
mean_fill_cols = ["hsc_p", "degree_p", "etest_p"]
df[mean_fill_cols] = df[mean_fill_cols].fillna(df[mean_fill_cols].mean())

# Confirm that every column now has the full non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 0 to 213
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           148 non-null    int64  
 1   gender          148 non-null    object 
 2   hsc_p           148 non-null    float64
 3   hsc_s           148 non-null    object 
 4   degree_p        148 non-null    float64
 5   degree_t        148 non-null    object 
 6   etest_p         148 non-null    float64
 7   specialisation  148 non-null    object 
 8   mba_p           148 non-null    float64
 9   salary          148 non-null    float64
dtypes: float64(5), int64(1), object(4)
memory usage: 12.7+ KB


#### Step 5: Feature Scaling


<img src="https://i.postimg.cc/G21gMYnF/f.png" alt="Image Description" width="500">









#### Option 1: Standardization. This method scales the data to have a mean of 0 and a standard deviation of 1.


In [None]:
# Columns to rescale; the min-max scaling cell further down reuses `nc`.
nc = ["hsc_p", "degree_p", "etest_p", "salary"]

# Standardise the listed columns: learn each column's mean and standard
# deviation, then transform so each column has mean 0 and unit variance.
# The scaled values overwrite the originals in `df`.
SC1 = StandardScaler()
SC1.fit(df[nc])
df[nc] = SC1.transform(df[nc])

# Preview the first six rows of the scaled frame.
df.head(n=6)




Unnamed: 0,sl_no,gender,hsc_p,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,salary
0,1,M,2.265997,Commerce,-1.652293,Sci&Tech,-1.328518,Mkt&HR,58.8,-0.200292
1,2,M,0.8987875,Science,1.346845,Sci&Tech,0.978332,Mkt&Fin,66.28,-0.951839
2,3,M,1.533482e-15,Arts,-0.728534,Comm&Mgmt,0.136149,Mkt&Fin,57.8,-0.415019
4,5,M,0.388377,Commerce,0.703293,Comm&Mgmt,1.732635,Mkt&Fin,55.5,1.463849
7,8,M,-0.6475513,Science,-0.420614,Sci&Tech,-0.449718,Mkt&Fin,62.14,-0.393547
8,9,M,0.9710867,Commerce,0.503145,Comm&Mgmt,1.332781,Mkt&Fin,61.29,-0.619011


#### Option 2: Min-max scaling. This method scales the data to a fixed range, usually between 0 and 1.

In [None]:
# Rescale the `nc` columns (defined in the standardisation cell) so each
# column's minimum maps to 0 and its maximum to 1.
# NOTE(review): df[nc] already holds the standardised values written by the
# StandardScaler cell, so this scales those rather than the raw data --
# confirm that chaining the two scalers is intended.
SC2 = MinMaxScaler()
SC2.fit(df[nc])
df[nc] = SC2.transform(df[nc])

# Preview the first five rows.
df.head()

Unnamed: 0,sl_no,gender,hsc_p,hsc_s,degree_p,degree_t,etest_p,specialisation,mba_p,salary
0,1,M,0.857051,Commerce,0.057143,Sci&Tech,0.104167,Mkt&HR,58.8,0.094595
1,2,M,0.586729,Science,0.613714,Sci&Tech,0.760417,Mkt&Fin,66.28,0.0
2,3,M,0.409023,Arts,0.228571,Comm&Mgmt,0.520833,Mkt&Fin,57.8,0.067568
4,5,M,0.485812,Commerce,0.494286,Comm&Mgmt,0.975,Mkt&Fin,55.5,0.304054
7,8,M,0.28099,Science,0.285714,Sci&Tech,0.354167,Mkt&Fin,62.14,0.07027


#### Step 6: Separate the dataset into features (X) and target (y) variables. The target is usually the column you want to predict.

In [None]:
# Feature matrix: the predictor columns (`sl_no` and the target are left out).
X = df[
    [
        "gender",
        "hsc_p",
        "hsc_s",
        "degree_p",
        "degree_t",
        "etest_p",
        "specialisation",
        "mba_p",
    ]
]

# Target vector: the column to be predicted.
Y = df["salary"]


#### Step 7: After preprocessing, save the cleaned and scaled dataset to a new CSV file.


In [None]:
# Put the target back beside the features, column-wise, so that a single
# frame holds both.
final = pd.concat([X, Y], axis="columns")

# Write the combined frame; index=False keeps the row labels out of the file.
final.to_csv("Pre.csv", index=False)


In [None]:
# Lab-1 Activities

#Perform data preprocessing for Automobile.csv

#i. Delete the column horsepower since it has a few missing values

#ii. Impute missing values with the median

#iii. Apply min-max scaling and standardization to Automobile.csv and explain which feature scaling method makes more sense for this dataset.