In [33]:
# Importing required libraries and modules
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For data visualization (not used here but commonly used in data analysis workflows)
import pandas as pd  # For data manipulation and analysis

In [None]:
# Read the raw data, show it, then separate features from the target
dataset = pd.read_csv('Data.csv')  # Entire CSV file as a DataFrame

# Print the untouched frame first so it can be inspected before anything is derived from it
print(dataset)

# Positional split: the last column is taken as the target, every column before it as a feature
x = dataset.iloc[:, :-1].values  # Independent variables as a NumPy array
y = dataset.iloc[:, -1].values  # Dependent variable as a NumPy array

In [None]:
# Fill in missing entries of the feature matrix
from sklearn.impute import SimpleImputer  # Transformer that substitutes missing values

# Each NaN is replaced by the mean of its own column
# (the other available strategies are 'median', 'constant' and 'most_frequent')
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Learn the column means and substitute them in one step; only columns 1 and 2 are touched
# NOTE(review): the means are learned from every row, before the train/test split performed
# later in this notebook - confirm that this is intended.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

# Show the feature matrix once the gaps have been filled
print(x)

In [None]:
# Encoding categorical independent variables
from sklearn.compose import ColumnTransformer  # For column-wise transformations
from sklearn.preprocessing import OneHotEncoder  # For one-hot encoding categorical data

# One-hot encode column 0 and pass every other column through unchanged.
# The encoded columns are placed first in the result, followed by the passthrough columns.
#
# sparse_threshold=0 forces a dense NumPy array as output. With the default threshold (0.3)
# ColumnTransformer switches to a SciPy sparse matrix as soon as the stacked result is sparse
# enough (e.g. a categorical column with many distinct values), which would break the in-place
# slice assignments used further down for scaling and change what print(x) shows.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
x = ct.fit_transform(x)  # Replace the feature matrix with its encoded version
print(x)  # Display the transformed dataset with one-hot encoded columns

In [None]:
# Encode the dependent (target) variable as integer class codes
from sklearn.preprocessing import LabelEncoder  # Maps each distinct label to an integer

le = LabelEncoder()  # Kept in `le` so the learned label set stays available after this cell
y = le.fit_transform(y)  # Learn the distinct labels in y and replace each one by its integer code
print(y)  # Show the encoded target

In [38]:
# Partition features and target into a training part and a held-out test part
from sklearn.model_selection import train_test_split  # Random train/test partitioning

# test_size=0.2 keeps 20% of the rows for testing (80% for training);
# the fixed random_state makes the split repeatable from run to run
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [39]:
# Standardise the feature columns from index 3 onward
# NOTE(review): the offset 3 presumably skips the one-hot columns produced by the encoder -
# confirm it matches the number of categories in the data.
from sklearn.preprocessing import StandardScaler  # Standardisation transformer

sc = StandardScaler()

# The scaling statistics are learned from the training rows only
sc.fit(X_train[:, 3:])

# The same fitted scaler is then applied to both splits; columns 0-2 are left as they are
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

# The dataset is now preprocessed and ready for further analysis or model training.