# Data Preprocessing Steps:


## Import the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

## Import and examine the dataset
- Do we have any missing values?
- Show the dataframe sorted by State
- Show the means of numeric features by State
- Compute the min, max and mean of Salary and Age by state

Use as many cells as needed to show your work

[Dataset.csv](https://drive.google.com/file/d/1Sq7OQ-jMWFlF6Zamz5_RmIvjUt9d1THb/view?usp=sharing)

In [None]:
# Read the CSV into a DataFrame. The path is relative, so 'Dataset.csv' must
# sit in the notebook's working directory.
file_path = "Dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to check that the file parsed as expected
df.head(n=5)


## Take Care of missing data
1.  Delete rows with missing data, or
2.  Replace missing data with mean values.

In [None]:
# Option 1: keep only the rows in which every column has a value.
# `df` itself is left untouched; the filtered copy lives in `df_cleaned`.
df_cleaned = df.dropna(axis=0, how="any")

# Preview the first five surviving rows
df_cleaned.head(n=5)


In [None]:
# Option 2: fill the gaps instead of dropping rows.
# Only numeric columns can be averaged, so restrict the imputer to those and
# leave categorical columns as they are.
numeric_columns = df.select_dtypes(include=["number"]).columns

# Learn each numeric column's mean, then write the filled values back into df
imputer = SimpleImputer(strategy="mean")
imputer.fit(df[numeric_columns])
df[numeric_columns] = imputer.transform(df[numeric_columns])

# Per-column count of values that are still missing
df.isna().sum()


## Encode Categorical data

#### Encode and display the categorical Independent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder  # Import LabelEncoder

# Label-encode the first column, which is assumed to hold the categorical
# independent variable.
labelencoder = LabelEncoder()

# Assign through the column label instead of `df.iloc[:, 0] = ...`.
# Since pandas 2.0 an iloc column assignment writes into the existing column
# in place, so integer codes stored into an object-dtype column would remain
# dtype=object. Assigning by label replaces the column, giving it the integer
# dtype returned by the encoder.
df[df.columns[0]] = labelencoder.fit_transform(df.iloc[:, 0])

# Display the encoded dataset
df.head()


#### Encode and display the Dependent Variable

In [None]:
# Encode the dependent variable (last column) as integer class codes.
# NOTE: this refits the shared `labelencoder`, so afterwards its `classes_`
# describe the target labels, not the first column's categories.
#
# Assign through the column label instead of `df.iloc[:, -1] = ...`.
# Since pandas 2.0 an iloc column assignment writes into the existing column
# in place, so integer codes stored into an object-dtype column would remain
# dtype=object and `y` would later come out as an object array. Assigning by
# label replaces the column with the encoder's integer output.
df[df.columns[-1]] = labelencoder.fit_transform(df.iloc[:, -1])

# Display dataset with encoded dependent variable
df.head()


## Split the dataset into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# The last column is taken as the target; every other column is a feature
y = df.iloc[:, -1].values
X = df.iloc[:, :-1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting arrays, in the order they were unpacked
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


## Feature Scaling

#### Standardized Scaling
* Scale the numerical features using standardized scaling
* Show your work

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data does not influence them.
# NOTE(review): every column of X_train is scaled here, including any
# label-encoded categorical column - confirm that is intended.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_standardized = scaler.transform(X_train)
X_test_standardized = scaler.transform(X_test)

# First five standardized training rows
X_train_standardized[0:5]


#### Normalized Scaling
* Scale the numerical features using normalized scaling
* Show your work

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the training split only, then rescale both
# splits with that same fitted scaler.
# NOTE(review): every column of X_train is rescaled here, including any
# label-encoded categorical column - confirm that is intended.
normalizer = MinMaxScaler()
normalizer.fit(X_train)
X_train_normalized = normalizer.transform(X_train)
X_test_normalized = normalizer.transform(X_test)

# First five normalized training rows
X_train_normalized[0:5]
