In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample dataset
data = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5],
    'Category': ['A', 'B', 'A', 'C', 'B'],
    'Numerical_Value': [10, np.nan, 30, 40, 50],
    'Text_Value': ['Good', 'Bad', 'Excellent', 'Good', 'Excellent']
})

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [4]:
# 1. Data Cleaning
# Handle missing values
imputer = SimpleImputer(strategy='mean')
data[['Numerical_Value']] = imputer.fit_transform(data[['Numerical_Value']])
data

Unnamed: 0,ID,Category,Numerical_Value,Text_Value
0,1,A,10.0,Good
1,2,B,32.5,Bad
2,3,A,30.0,Excellent
3,4,C,40.0,Good
4,5,B,50.0,Excellent


In [5]:
# encoder = OneHotEncoder()
# new = encoder.fit(data[['Category']])
# new.categories_

# encoder = OneHotEncoder()
This method was deprecated in scikit-learn version 0.20 and removed in version 1.0. It has been replaced by get_feature_names_out.

In [7]:
# # 2. Data Transformation
# # Encode categorical variables
# encoder = OneHotEncoder()
# encoded_categorical = pd.DataFrame(encoder.fit_transform(data[['Category']]).toarray(),
#                                    columns=encoder.get_feature_names(['Category']))
# data = pd.concat([data, encoded_categorical], axis=1)
# data = data.drop(['Category'], axis=1)

In [8]:
# 2. Data Transformation
# Encode categorical variables
encoder = OneHotEncoder(sparse_output=False)  # sparse=False for toarray() to work directly
# If you're using scikit-learn >= 1.0:
encoded_categorical = pd.DataFrame(encoder.fit_transform(data[['Category']]),
                                   columns=encoder.get_feature_names_out(['Category']))
# If you're using scikit-learn < 1.0 and >= 0.20:
# encoded_categorical = pd.DataFrame(encoder.fit_transform(data[['Category']]).toarray(),
#                                    columns=encoder.get_feature_names_out(['Category']))

data = pd.concat([data, encoded_categorical], axis=1)
data = data.drop(['Category'], axis=1)
data

Unnamed: 0,ID,Numerical_Value,Text_Value,Category_A,Category_B,Category_C
0,1,10.0,Good,1.0,0.0,0.0
1,2,32.5,Bad,0.0,1.0,0.0
2,3,30.0,Excellent,1.0,0.0,0.0
3,4,40.0,Good,0.0,0.0,1.0
4,5,50.0,Excellent,0.0,1.0,0.0


In [9]:
# 3. Feature Engineering
# Extract features from text
data['Text_Length'] = data['Text_Value'].apply(len)

# Dimensionality reduction
pca = PCA(n_components=1)
data['New Feature'] = pca.fit_transform(data[['Numerical_Value', 'Text_Length']])
data

# PCA_Component - > reduce 2 feature as 1 singlw feature

Unnamed: 0,ID,Numerical_Value,Text_Value,Category_A,Category_B,Category_C,Text_Length,New Feature
0,1,10.0,Good,1.0,0.0,0.0,4,-22.571069
1,2,32.5,Bad,0.0,1.0,0.0,3,-0.247013
2,3,30.0,Excellent,1.0,0.0,0.0,9,-2.207952
3,4,40.0,Good,0.0,0.0,1.0,4,7.311964
4,5,50.0,Excellent,0.0,1.0,0.0,9,17.71407


In [10]:
# 4. Data Splitting
X = data.drop(['ID', 'Text_Value'], axis=1)
y = data['ID']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,Numerical_Value,Category_A,Category_B,Category_C,Text_Length,New Feature
4,50.0,0.0,1.0,0.0,9,17.71407
2,30.0,1.0,0.0,0.0,9,-2.207952
0,10.0,1.0,0.0,0.0,4,-22.571069
3,40.0,0.0,0.0,1.0,4,7.311964


In [11]:
# 5. Data Normalization
# Scale numerical features
scaler = StandardScaler()
X_train[['Numerical_Value', 'Text_Length']] = scaler.fit_transform(X_train[['Numerical_Value', 'Text_Length']])
X_train

Unnamed: 0,Numerical_Value,Category_A,Category_B,Category_C,Text_Length,New Feature
4,1.183216,0.0,1.0,0.0,1.0,17.71407
2,-0.169031,1.0,0.0,0.0,1.0,-2.207952
0,-1.521278,1.0,0.0,0.0,-1.0,-22.571069
3,0.507093,0.0,0.0,1.0,-1.0,7.311964


In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [17]:
# Load the transportation dataset
transport_data = pd.read_csv('synthetic_data.csv')
transport_data.head()

Unnamed: 0,Date,Time,Stop/Station,Passenger_Count,Vehicle_ID,Latitude,Longitude,Temperature (°C),Precipitation (mm),Humidity (%),Age_Group,Gender,Feedback
0,2023-11-16,12:03,Johor Bahru,46,TRAIN82,3.906935,106.068464,11,3,63,18-24,Male,Driver was friendly
1,2023-07-14,05:07,Cameron Highlands,17,TRAIN65,4.227106,118.407191,3,3,74,25-40,Female,Seats were uncomfortable
2,2023-09-22,14:11,Ipoh,91,TRAIN38,6.819556,101.272984,27,1,81,40-60,Male,Delay in departure
3,2022-07-12,09:11,Penang,41,BUS245,3.627521,106.22699,1,7,98,25-40,Female,Driver was friendly
4,2023-12-09,16:59,Kuching,53,BUS958,1.418952,117.050925,15,9,71,40-60,Male,Service was excellent


In [18]:
# 1. Data Cleaning
# Handle missing values
imputer = SimpleImputer(strategy='mean')
transport_data[['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']] = imputer.fit_transform(transport_data[['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']])

In [19]:
transport_data = transport_data.rename(columns={'Age Group': 'Age_Group'})

In [20]:
# 2. Data Transformation
# Encode categorical variables
encoder = OneHotEncoder()
encoded_categorical = pd.DataFrame(encoder.fit_transform(transport_data[['Age_Group', 'Gender']]).toarray(), columns=encoder.get_feature_names_out(['Age_Group', 'Gender']))
transport_data = pd.concat([transport_data, encoded_categorical], axis=1)
transport_data = transport_data.drop(['Age_Group', 'Gender'], axis=1)

In [21]:
transport_data

Unnamed: 0,Date,Time,Stop/Station,Passenger_Count,Vehicle_ID,Latitude,Longitude,Temperature (°C),Precipitation (mm),Humidity (%),Feedback,Age_Group_18-24,Age_Group_25-40,Age_Group_40-60,Gender_Female,Gender_Male
0,2023-11-16,12:03,Johor Bahru,46.0,TRAIN82,3.906935,106.068464,11.0,3.0,63.0,Driver was friendly,1.0,0.0,0.0,0.0,1.0
1,2023-07-14,05:07,Cameron Highlands,17.0,TRAIN65,4.227106,118.407191,3.0,3.0,74.0,Seats were uncomfortable,0.0,1.0,0.0,1.0,0.0
2,2023-09-22,14:11,Ipoh,91.0,TRAIN38,6.819556,101.272984,27.0,1.0,81.0,Delay in departure,0.0,0.0,1.0,0.0,1.0
3,2022-07-12,09:11,Penang,41.0,BUS245,3.627521,106.226990,1.0,7.0,98.0,Driver was friendly,0.0,1.0,0.0,1.0,0.0
4,2023-12-09,16:59,Kuching,53.0,BUS958,1.418952,117.050925,15.0,9.0,71.0,Service was excellent,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2022-07-05,03:00,Kuala Lumpur,36.0,BUS992,1.618061,103.326052,-4.0,7.0,65.0,Delay in departure,0.0,1.0,0.0,0.0,1.0
996,2022-12-16,05:55,Cameron Highlands,30.0,TRAIN82,6.545632,103.185658,4.0,5.0,67.0,Train was overcrowded,0.0,1.0,0.0,0.0,1.0
997,2023-09-07,07:12,Langkawi,67.0,BUS129,5.117110,101.437689,-8.0,6.0,75.0,Bus arrived on time,0.0,0.0,1.0,0.0,1.0
998,2023-12-20,20:11,Johor Bahru,78.0,TRAIN45,1.606028,118.687141,11.0,8.0,65.0,Poor signage at the station,1.0,0.0,0.0,0.0,1.0


In [22]:
# 3. Feature Engineering
# Create new features
transport_data['Temperature_Humidity_Ratio'] = transport_data['Temperature (°C)'] / transport_data['Humidity (%)']

In [23]:
# Dimensionality reduction
pca = PCA(n_components=2)
transport_data_pca = pca.fit_transform(transport_data[['Passenger_Count', 'Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)']])
transport_data['PCA_Component1'] = transport_data_pca[:, 0]
transport_data['PCA_Component2'] = transport_data_pca[:, 1]

In [24]:
transport_data_pca[:,1]

array([ 3.08252394e+00, -8.77527779e+00,  6.64978761e+00, -2.32124999e+01,
        2.36127286e+00, -2.68974353e+01,  2.59967726e+01, -1.53441281e+01,
        2.73825328e+00,  6.82058453e+00,  1.51410517e+01, -1.62343620e+01,
       -5.52695035e+00, -1.01498800e+00,  1.33023590e+01, -2.85968051e+00,
        2.80798660e+01,  1.02302966e+00,  2.00055989e+01,  4.20918551e+00,
        1.40409408e+01, -4.38819308e+00,  9.88330516e+00, -1.28283760e+01,
       -9.82558821e+00,  7.05676668e+00, -2.46876235e+01,  7.71665469e+00,
       -7.70143705e+00,  1.01837676e+01, -2.13631813e+01, -1.30727726e+01,
       -1.33673075e+01,  1.32846958e+01, -1.67987665e+01,  2.24735299e+00,
       -2.56105380e+01,  1.28375648e+01,  1.28725223e+01, -1.82415083e+01,
       -5.85715290e-01,  2.34580700e+01,  1.05073938e+01,  1.94589622e+01,
       -3.81282920e+00, -1.72655350e+01, -1.19899866e+01,  1.46336288e+01,
        2.99736191e+00,  2.33250167e+01, -2.65942877e+01, -2.30334287e+01,
       -1.13932100e+01,  

In [25]:
# 4. Data Splitting
X = transport_data.drop(['Passenger_Count'], axis=1)
y = transport_data['Passenger_Count']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# 5. Data Normalization
# Scale numerical features
scaler = StandardScaler()
X_train[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']] = scaler.fit_transform(X_train[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']])
X_test[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']] = scaler.transform(X_test[['Temperature (°C)', 'Precipitation (mm)', 'Humidity (%)', 'Temperature_Humidity_Ratio', 'PCA_Component1', 'PCA_Component2']])

In [27]:
X_test

Unnamed: 0,Date,Time,Stop/Station,Vehicle_ID,Latitude,Longitude,Temperature (°C),Precipitation (mm),Humidity (%),Feedback,Age_Group_18-24,Age_Group_25-40,Age_Group_40-60,Gender_Female,Gender_Male,Temperature_Humidity_Ratio,PCA_Component1,PCA_Component2
521,2023-06-03,08:41,Langkawi,BUS410,1.020459,115.741025,-1.507302,-0.151443,0.342125,Driver was friendly,1.0,0.0,0.0,1.0,0.0,-1.407808,0.590178,-1.459063
737,2023-01-06,03:59,Penang,TRAIN74,5.073996,112.491639,1.339448,-1.198890,-0.799480,Cleanliness was lacking,0.0,0.0,1.0,1.0,0.0,1.569121,-1.127346,1.557696
740,2023-01-06,17:54,Kuala Lumpur,BUS761,3.831550,107.158813,-0.604674,-1.548039,-0.656779,Service was excellent,0.0,0.0,1.0,1.0,0.0,-0.550605,-1.446191,-0.137934
660,2022-03-13,21:03,Ipoh,BUS215,6.132260,106.856903,1.270015,1.245152,0.912927,Cleanliness was lacking,0.0,0.0,1.0,1.0,0.0,0.809255,1.402994,0.574497
411,2023-03-21,18:12,Malacca,BUS253,2.197070,100.189536,1.617179,-0.151443,0.627526,Driver was friendly,1.0,0.0,0.0,0.0,1.0,1.181298,0.593977,1.033430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,2022-03-05,21:01,Georgetown,TRAIN48,2.179027,116.634093,-0.604674,-1.198890,-1.156231,Service was excellent,0.0,1.0,0.0,0.0,1.0,-0.497819,1.513135,-0.006811
332,2022-01-05,20:42,Port Dickson,BUS960,3.327090,100.736723,1.478313,-0.849741,-1.584333,Station was well-maintained,0.0,0.0,1.0,0.0,1.0,2.292281,1.527118,1.956417
208,2023-10-31,03:07,Langkawi,TRAIN82,3.667439,110.170254,1.408880,-0.849741,-0.157327,Station was well-maintained,0.0,0.0,1.0,0.0,1.0,1.315292,-1.565947,1.321009
613,2023-11-30,04:10,Cameron Highlands,BUS186,1.059309,111.420097,-1.576735,-0.500592,1.697779,Station was well-maintained,0.0,0.0,1.0,0.0,1.0,-1.374760,0.215565,-2.164329
