In [20]:
# Data Preprocessing

# Importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [21]:

# Load the raw table from disk, then separate predictors from the label
dataset = pd.read_csv('Data.csv')

# Predictors: every column except the last one
X = dataset.iloc[:, :-1].to_numpy()

# Label: the last column only
y = dataset.iloc[:, -1].to_numpy()

In [25]:
# Handy one-liners for inspecting / cleaning a pandas DataFrame (uncomment as needed):
# dataset.shape                        # (n_rows, n_columns)
# dataset.columns                      # column labels
# dataset.dtypes                       # dtype of every column
# dataset.isna().sum()                 # number of missing values in each column
# dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())  # fill NaN in 'Age' with the column mean
# dataset.dropna(inplace=True)         # discard every row that contains a NaN
# dataset['Country'].value_counts()    # frequency of each category in 'Country'
# dataset.describe(include='all')      # summary statistics for all columns

# Display the rows in which at least one column is missing
has_missing = dataset.isna().any(axis='columns')
dataset.loc[has_missing]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
6,Spain,,52000.0,No


In [11]:
# Printed summary: index range, non-null count and dtype of each column, memory usage
dataset.info()
# First five rows, shown as the cell's rich output
dataset.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
# Missing data handling
from sklearn.impute import SimpleImputer

# This cell must run BEFORE the categorical-encoding cell. At this point X still
# has the raw layout [Country, Age, Salary], so the numeric columns (Age, Salary)
# sit at indices 1 and 2. Once Country has been one-hot encoded the numeric
# columns shift to the right and the slice below would address the wrong columns.
if X.shape[1] != dataset.shape[1] - 1:
    raise ValueError(
        "X no longer has the raw column layout (expected {} columns, found {}); "
        "reload the dataset and impute before encoding the categorical column."
        .format(dataset.shape[1] - 1, X.shape[1])
    )

# Replace every NaN in Age / Salary with the mean of its column.
# NOTE(review): the means are computed over all rows, i.e. before the train/test
# split performed further down, so test rows influence the imputed values.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Categorical data handling
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# This cell replaces X with its encoded version, so running it twice would
# one-hot encode an already-encoded matrix. Refuse to run unless X still has the
# raw layout (one column per feature column of `dataset`).
if X.shape[1] != dataset.shape[1] - 1:
    raise ValueError(
        "X does not have the raw column layout (expected {} columns, found {}); "
        "it looks already encoded - reload the dataset before re-running this cell."
        .format(dataset.shape[1] - 1, X.shape[1])
    )

# Column 0 is a nominal feature, so it is one-hot encoded rather than mapped to
# integers (integer codes would imply an order between categories).
# OneHotEncoder accepts string categories directly, so no LabelEncoder pass over
# the feature column is needed; the fitted category order is available from
# ct_X.named_transformers_['one_hot_encoder'].categories_.
ct_X = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],   # one-hot encode column 0
    remainder='passthrough',                                        # keep the other columns as they are
    sparse_threshold=0                                              # always return a dense array
)
# Cast to float: the passthrough columns come from an object array, and the
# downstream scaler expects a plain numeric matrix.
X = np.asarray(ct_X.fit_transform(X), dtype=float)

# The target is categorical as well; LabelEncoder maps its classes to integers.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

In [10]:
# Train/test split followed by feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Learn per-column mean and standard deviation from the training rows only,
# then standardise both partitions with those training statistics
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [11]:
# Print the scaled training and test feature matrices, each under its own heading
for heading, matrix in (("X_train:\n", X_train), ("X_test:\n", X_test)):
    print(heading, matrix)

X_train:
 [[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
X_test:
 [[-1.          2.64575131 -0.77459667 -1.45882927 -0.90166297]
 [-1.          2.64575131 -0.77459667  1.98496442  2.13981082]]
