In [2]:
# Separate the feature matrix from the regression target.
# 'TAXI_OUT' is the value being predicted, so it must not remain in X.
X = data.drop(columns='TAXI_OUT')
y = data['TAXI_OUT']

# Split the feature columns by dtype so each group can be routed to its own
# preprocessing pipeline.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' matches every numeric dtype (int32, float32, unsigned ints, ...),
# not only int64/float64. A numeric column with a narrower dtype would
# otherwise land in neither list and be left out of the features entirely.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Numeric features: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: replace missing entries with the literal 'missing',
# then one-hot encode. handle_unknown='ignore' stops transform from raising
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its matching pipeline. No `remainder` is given,
# so only the columns named in these two lists are passed on.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# 4. Build the Model
# Chain preprocessing and a linear regression into a single estimator, so the
# imputers, scaler and encoder are re-fitted on the training part of every
# cross-validation fold.
# NOTE: the final step is named 'classifier' although it is a regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
])

# 5-fold cross-validation. No `scoring` is passed, so each fold is scored
# with the estimator's own score method (R^2 for LinearRegression).
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

print(f'Cross-validation scores: {scores}')
print(f'Average cross-validation score: {scores.mean()}')

# Train the model
# Fit the whole pipeline (preprocessing + regression) on all available rows.
model.fit(X, y)

# 5. Evaluate the Model
# In-sample predictions: the model has already seen every row of X, so any
# metric computed from them is a training error and tends to be optimistic.
predictions = model.predict(X)

# Mean squared error on the training data.
mse = mean_squared_error(y, predictions)

print(f'Mean Squared Error: {mse}')

# Out-of-sample estimate of the same metric. cross_val_score fits clones of
# `model`, so the estimator fitted above is left untouched. The scorer returns
# the negated MSE (higher is better), hence the sign flip.
cv_mse = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

print(f'Cross-validated Mean Squared Error: {cv_mse.mean()}')

# Scatter the observed target against the model's predictions on the current
# axes; points close to the diagonal are well predicted.
axes = plt.gca()
axes.scatter(y, predictions)
axes.set_xlabel('True Values')
axes.set_ylabel('Predictions')
axes.set_title('True Values vs Predictions')
axes.grid(True)
plt.show()

ModuleNotFoundError: No module named 'keras'