In [1]:
import pandas

In [None]:
# ===================================================== Step 2 =====================================================
pandas.read_csv("salary.csv")

Unnamed: 0,City,Age,Salary,Purchased
0,Hyderabad,44.0,72000.0,No
1,Mumbai,27.0,48000.0,Yes
2,Delhi,30.0,54000.0,No
3,Mumbai,38.0,61000.0,No
4,Delhi,40.0,,Yes
5,Hyderabad,35.0,58000.0,Yes
6,Mumbai,,52000.0,No
7,Hyderabad,48.0,79000.0,Yes
8,Delhi,50.0,83000.0,No
9,Hyderabad,37.0,67000.0,Yes


In [None]:
df = pandas.read_csv("salary.csv")
X = df.iloc[:, 0:3]  # X, columns 0, 1, 2 (City, Age, Salary)
Y = df.iloc[:, 3]    # Y, column 3 (Purchased)

print("X (features):")
print(X)
print("\nY (target):")
print(Y)

X (features):
        City   Age   Salary
0  Hyderabad  44.0  72000.0
1     Mumbai  27.0  48000.0
2      Delhi  30.0  54000.0
3     Mumbai  38.0  61000.0
4      Delhi  40.0      NaN
5  Hyderabad  35.0  58000.0
6     Mumbai   NaN  52000.0
7  Hyderabad  48.0  79000.0
8      Delhi  50.0  83000.0
9  Hyderabad  37.0  67000.0

Y (target):
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


In [None]:
# ===================================================== Step 3 =====================================================

# Replace NaN values with column averages
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

X = df.iloc[:, 0:3]  # X, columns 0, 1, 2 (City, Age, Salary)
Y = df.iloc[:, 3]    # Y, column 3 (Purchased)

print("X (features) after replacing NaN with averages:")
print(X)
print("\nY (target):")
print(Y)

X (features) after replacing NaN with averages:
        City        Age        Salary
0  Hyderabad  44.000000  72000.000000
1     Mumbai  27.000000  48000.000000
2      Delhi  30.000000  54000.000000
3     Mumbai  38.000000  61000.000000
4      Delhi  40.000000  63777.777778
5  Hyderabad  35.000000  58000.000000
6     Mumbai  38.777778  52000.000000
7  Hyderabad  48.000000  79000.000000
8      Delhi  50.000000  83000.000000
9  Hyderabad  37.000000  67000.000000

Y (target):
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


In [None]:
# ===================================================== Step 4 =====================================================
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

df_onehot_sklearn = df.copy()

# Apply to column 0 (City)
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough'
)

X_transformed = ct.fit_transform(df_onehot_sklearn)

# Convert to DataFrame for better visualization
# Get feature names
feature_names = ['City_Delhi', 'City_Hyderabad', 'City_Mumbai', 'Age', 'Salary', 'Purchased']
df_onehot_sklearn_result = pandas.DataFrame(X_transformed, columns=feature_names)

# Convert one-hot encoded columns to boolean (True/False)
city_columns = ['City_Delhi', 'City_Hyderabad', 'City_Mumbai']
df_onehot_sklearn_result[city_columns] = df_onehot_sklearn_result[city_columns].astype(int)


print("One-Hot Encoded using scikit-learn OneHotEncoder:")
print(df_onehot_sklearn_result.head(10))

One-Hot Encoded using scikit-learn OneHotEncoder:
   City_Delhi  City_Hyderabad  City_Mumbai        Age        Salary Purchased
0           0               1            0       44.0       72000.0        No
1           0               0            1       27.0       48000.0       Yes
2           1               0            0       30.0       54000.0        No
3           0               0            1       38.0       61000.0        No
4           1               0            0       40.0  63777.777778       Yes
5           0               1            0       35.0       58000.0       Yes
6           0               0            1  38.777778       52000.0        No
7           0               1            0       48.0       79000.0       Yes
8           1               0            0       50.0       83000.0        No
9           0               1            0       37.0       67000.0       Yes


In [24]:
df_onehot_pandas = df.copy()

# Apply get_dummies to the City column
df_onehot_pandas = pandas.get_dummies(df_onehot_pandas, columns=['City'], prefix='City', dtype=int)

print("One-Hot Encoded using pandas get_dummies:")
print(df_onehot_pandas.head(10))

One-Hot Encoded using pandas get_dummies:
         Age        Salary Purchased  City_Delhi  City_Hyderabad  City_Mumbai
0  44.000000  72000.000000        No           0               1            0
1  27.000000  48000.000000       Yes           0               0            1
2  30.000000  54000.000000        No           1               0            0
3  38.000000  61000.000000        No           0               0            1
4  40.000000  63777.777778       Yes           1               0            0
5  35.000000  58000.000000       Yes           0               1            0
6  38.777778  52000.000000        No           0               0            1
7  48.000000  79000.000000       Yes           0               1            0
8  50.000000  83000.000000        No           1               0            0
9  37.000000  67000.000000       Yes           0               1            0


In [26]:
# Using LabelEncoder for binary classification (Yes/No -> 1/0)
label_encoder_purchased = LabelEncoder()
df_onehot_pandas['Purchased_Encoded'] = label_encoder_purchased.fit_transform(df_onehot_pandas['Purchased'])

print("Label Encoded Purchased column:")
print(df_onehot_pandas[['Purchased', 'Purchased_Encoded']])
print("\nMapping: No=0, Yes=1")

Label Encoded Purchased column:
  Purchased  Purchased_Encoded
0        No                  0
1       Yes                  1
2        No                  0
3        No                  0
4       Yes                  1
5       Yes                  1
6        No                  0
7       Yes                  1
8        No                  0
9       Yes                  1

Mapping: No=0, Yes=1


In [27]:
# Prepare final X and Y
X_final = df_onehot_pandas[['Age', 'Salary', 'City_Delhi', 'City_Hyderabad', 'City_Mumbai']]
Y_final = df_onehot_pandas['Purchased_Encoded']

print("Final X (features):")
print(X_final)
print("\nFinal Y (target):")
print(Y_final)
print("\nX shape:", X_final.shape)
print("Y shape:", Y_final.shape)

Final X (features):
         Age        Salary  City_Delhi  City_Hyderabad  City_Mumbai
0  44.000000  72000.000000           0               1            0
1  27.000000  48000.000000           0               0            1
2  30.000000  54000.000000           1               0            0
3  38.000000  61000.000000           0               0            1
4  40.000000  63777.777778           1               0            0
5  35.000000  58000.000000           0               1            0
6  38.777778  52000.000000           0               0            1
7  48.000000  79000.000000           0               1            0
8  50.000000  83000.000000           1               0            0
9  37.000000  67000.000000           0               1            0

Final Y (target):
0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased_Encoded, dtype: int64

X shape: (10, 5)
Y shape: (10,)


In [36]:
# ===================================================== Step 5 =====================================================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split Using 80-20 split ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    X_final, Y_final, test_size=0.2, random_state=88
)

print("Dataset Split Results:")
print(f"Total samples: {len(X_final)}")
print(f"Training samples: {len(X_train)} ({len(X_train)/len(X_final)*100:.0f}%)")
print(f"Testing samples: {len(X_test)} ({len(X_test)/len(X_final)*100:.0f}%)")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"Y_test shape: {Y_test.shape}")

Dataset Split Results:
Total samples: 10
Training samples: 8 (80%)
Testing samples: 2 (20%)
X_train shape: (8, 5)
X_test shape: (2, 5)
Y_train shape: (8,)
Y_test shape: (2,)


In [None]:
# Normalization using Min-Max Scaler
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Visualize
X_train_norm_df = pandas.DataFrame(X_train_normalized, columns=X_train.columns, index=X_train.index)
X_test_norm_df = pandas.DataFrame(X_test_normalized, columns=X_test.columns, index=X_test.index)

print("\nOriginal Training Data:")
print(X_train)
print("\nNormalized Training Data:")
print(X_train_norm_df)


Original Training Data:
         Age        Salary  City_Delhi  City_Hyderabad  City_Mumbai
5  35.000000  58000.000000           0               1            0
6  38.777778  52000.000000           0               0            1
4  40.000000  63777.777778           1               0            0
2  30.000000  54000.000000           1               0            0
1  27.000000  48000.000000           0               0            1
7  48.000000  79000.000000           0               1            0
0  44.000000  72000.000000           0               1            0
8  50.000000  83000.000000           1               0            0

Normalized Training Data:
        Age    Salary  City_Delhi  City_Hyderabad  City_Mumbai
5  0.347826  0.285714         0.0             1.0          0.0
6  0.512077  0.114286         0.0             0.0          1.0
4  0.565217  0.450794         1.0             0.0          0.0
2  0.130435  0.171429         1.0             0.0          0.0
1  0.000000  0.000000

In [40]:
# ===================================================== Step 6 =====================================================
from sklearn.preprocessing import StandardScaler

# Apply StandardScaler
standard_scaler = StandardScaler()
X_train_standardized = standard_scaler.fit_transform(X_train)
X_test_standardized = standard_scaler.transform(X_test)

# Convert back to DataFrame for better visualization
X_train_std_df = pandas.DataFrame(X_train_standardized, columns=X_train.columns, index=X_train.index)
X_test_std_df = pandas.DataFrame(X_test_standardized, columns=X_test.columns, index=X_test.index)

print("Standardized Training Data (first 5 rows):")
print(X_train_std_df)
print("\nStatistics after Standardization:")
print(f"Age mean: {X_train_std_df['Age'].mean():.6f}, std: {X_train_std_df['Age'].std():.6f}")
print(f"Salary mean: {X_train_std_df['Salary'].mean():.6f}, std: {X_train_std_df['Salary'].std():.6f}")

Standardized Training Data (first 5 rows):
        Age    Salary  City_Delhi  City_Hyderabad  City_Mumbai
5 -0.535131 -0.470080   -0.774597        1.290994    -0.577350
6 -0.041722 -0.962979   -0.774597       -0.774597     1.732051
4  0.117910  0.004564    1.290994       -0.774597    -0.577350
2 -1.188173 -0.798679    1.290994       -0.774597    -0.577350
1 -1.579997 -1.291578   -0.774597       -0.774597     1.732051
7  1.162777  1.255067   -0.774597        1.290994    -0.577350
0  0.640343  0.680018   -0.774597        1.290994    -0.577350
8  1.423993  1.583666    1.290994       -0.774597    -0.577350

Statistics after Standardization:
Age mean: 0.000000, std: 1.069045
Salary mean: 0.000000, std: 1.069045
