In [1]:
import pandas as pd
# Read the dataset CSV into a DataFrame and print its first five rows.
data = pd.read_csv(filepath_or_buffer='pandas_missing_values_dataset.csv')
print(data.head(n=5))

   subject_id   q1              q2     q3 q4
0        1001  7.5           Agree   True  5
1        1002  4.0        Disagree  False  8
2        1003    -               -      -  -
3        1004  7.0  Strongly Agree  False  ?
4        1005    -        Disagree  False  4


In [2]:
import numpy as np
# Treat the '-' and '?' placeholder strings as missing values everywhere
# in the frame.
data = data.replace(to_replace=['-', '?'], value=np.nan)

# With the placeholders gone, q1 and q4 can be cast to float in one step.
data = data.astype({'q1': float, 'q4': float})

# Fill missing values for numerical columns with the column mean.
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on data['col']. That chained inplace call operates on
# a temporary Series: pandas 2.2 warns about it, and with Copy-on-Write
# (the default from pandas 3.0) it leaves `data` unchanged, so the NaNs
# would silently survive.
#
# NOTE(review): the printed frame shows q1 = 28.0 for subject 1008, which
# pulls the q1 mean upward; confirm that value is valid, or consider the
# median for imputation.
for column in ('q1', 'q4'):
    data[column] = data[column].fillna(data[column].mean())

# Fill missing values for categorical columns with the most frequent value.
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on data['col']. That chained inplace call operates on
# a temporary Series: pandas 2.2 warns about it, and with Copy-on-Write
# (the default from pandas 3.0) it leaves `data` unchanged.
#
# mode() returns the most frequent value(s) sorted; [0] takes the first one.
for column in ('q2', 'q3'):
    data[column] = data[column].fillna(data[column].mode()[0])

print(data)

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Expand q2 into indicator columns. drop_first=True omits the column for the
# first category, so k categories become k-1 columns.
data = pd.get_dummies(data, columns=['q2'], drop_first=True)

# Map each distinct q3 value to an integer code. The fitted encoder is kept
# in `label_encoder` so the mapping can be inspected or reversed later.
# NOTE(review): the printed frame shows a q3 value of 2 for subject 1015
# alongside True/False, so the encoder will see a third class; confirm
# whether that row should be cleaned first.
label_encoder = LabelEncoder()
label_encoder.fit(data['q3'])
data['q3'] = label_encoder.transform(data['q3'])

print(data)

from sklearn.preprocessing import StandardScaler

# Columns to standardise.
numerical_features = ['q1', 'q4']

# Standardise each listed column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row before the train/test
# split performed further down, so held-out rows contribute to the scaling
# statistics. If these features feed a model evaluation, consider fitting
# the scaler on the training rows only.
scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

print(data)

# Add the element-wise product of q1 and q4 as a new interaction column.
# Both columns have already been standardised at this point in the cell.
data['q1_q4_interaction'] = data['q1'].mul(data['q4'])

print(data)

from sklearn.model_selection import train_test_split

# q3 is the assumed prediction target; the features are every remaining
# column except the subject_id identifier.
y = data['q3']
X = data.drop(columns=['subject_id', 'q3'])

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(X_train.head())
print(y_train.head())

    subject_id         q1              q2     q3     q4
0         1001   7.500000           Agree   True   5.00
1         1002   4.000000        Disagree  False   8.00
2         1003   8.653846           Agree  False   6.75
3         1004   7.000000  Strongly Agree  False   6.75
4         1005   8.653846        Disagree  False   4.00
5         1006   5.500000         Neutral   True   8.00
6         1007   8.000000           Agree  False   7.00
7         1008  28.000000           Agree  False   9.00
8         1009   8.653846           Agree  False  12.00
9         1010   8.653846           Agree  False   6.75
10        1011   8.653846  Strongly Agree  False   1.00
11        1012   8.653846        Disagree   True   9.00
12        1013   6.500000           Agree   True   3.00
13        1014   8.000000        Disagree  False   6.75
14        1015   8.500000         Neutral      2  10.00
15        1016   8.653846           Agree  False   5.00
16        1017   9.000000           Agree   True