In [3]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Dataset: Pounds in 1000s (x) and MPG (y)
X = np.array([3.5, 3.69, 3.44, 3.43, 4.34, 4.42, 2.37]).reshape(-1, 1)
y = np.array([18, 15, 18, 16, 15, 14, 24])

# Create a linear regression model
model = LinearRegression()

# Fit the model to the data
model.fit(X, y)

# Get the weight (w1) and bias (b)
w1 = model.coef_[0]
b = model.intercept_

print(f"Weight (w1): {w1}")
print(f"Bias (b): {b}")


Weight (w1): -4.569109499755582
Bias (b): 33.585124042691874


In [2]:
#@title Code - Load dependencies

#general
import io

# data
import numpy as np
import pandas as pd

# machine learning
import keras

# data visualization
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

# @title
chicago_taxi_dataset = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/chicago_taxi_train.csv")

training_df = chicago_taxi_dataset[['TRIP_MILES', 'TRIP_SECONDS', 'FARE', 'COMPANY', 'PAYMENT_TYPE', 'TIP_RATE']]


# What is the maximum fare?
max_fare = training_df['FARE'].max()
print("What is the maximum fare? \t\t\t\tAnswer: ${fare:.2f}".format(fare = max_fare))

# What is the mean distance across all trips?
mean_distance= training_df["TRIP_MILES"].mean()
print("What is the mean distance across all trips? \t\t\t\tAnswer: ${mean:.2f}".format(mean = mean_distance))

# How many cab companies are in the dataset?
company_no= training_df["COMPANY"].nunique()
print("How many cab companies are in the dataset? \tAnswer: ${company:.2f}".format(company = company_no))
training_df.head(200)

# What is the most frequent payment type?
most_freq_payment_type = training_df['PAYMENT_TYPE'].value_counts().idxmax()
print("What is the most frequent payment type? \t\tAnswer: {type}".format(type = most_freq_payment_type))

# Are any features missing data?
missing_values = training_df.isnull().sum().sum()
print("Are any features missing data? \t\t\t\tAnswer:", "No" if missing_values == 0 else "Yes")

#correlation result
training_df.corr(numeric_only = True)
#Correlation measures the strength and direction of the linear relationship between two variables.
# The result is a value between -1 and 1:
# A correlation of 1 means a perfect positive relationship.
# A correlation of -1 means a perfect negative relationship.
# A correlation of 0 means no linear relationship.
# For example:

# If x1 and x2 have a correlation of 0.8, it indicates that they are positively correlated, meaning that as x1 increases, x2 tends to increase as well.
# If x1 and x2 have a correlation of -0.5, it means they are negatively correlated, so as x1 increases, x2 tends to decrease.
print(training_df.shape)  # This will give you the number of rows and columns

What is the maximum fare? 				Answer: $159.25
What is the mean distance across all trips? 				Answer: $8.29
How many cab companies are in the dataset? 	Answer: $31.00
What is the most frequent payment type? 		Answer: Credit Card
Are any features missing data? 				Answer: No
(31694, 6)


In [3]:
#@title Code - Define ML functions
from sklearn.preprocessing import StandardScaler

def build_model(my_learning_rate, num_features):
  """Create and compile a simple linear regression model."""

  print("Building the model")
  # Most simple keras models are sequential.
  model = keras.models.Sequential()

  # Describe the topography of the model.
  # The topography of a simple linear regression model
  # is a single node in a single layer.
  model.add(keras.layers.Dense(units=1,
                                  input_shape=(num_features,)))

  # Compile the model topography into code that Keras can efficiently
  # execute. Configure training to minimize the model's mean squared error.
  model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[keras.metrics.RootMeanSquaredError()])
  
  print("Model built")

  return model


def train_model(model, df, features, label, epochs, batch_size):
  """Train the model by feeding it data."""
  print("Training the model")
  # Feed the model the feature and the label.
  # The model will train for the specified number of epochs.
  # input_x = df.iloc[:,1:3].values
  # df[feature]
  callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
  #Trains the model for a fixed number of epochs (dataset iterations).
  history = model.fit(x=features,
                      y=label,
                      batch_size=batch_size,
                      epochs=epochs,
                      callbacks=[callback], verbose=True)

  # Gather the trained model's weight and bias.
  trained_weight = model.get_weights()[0]
  trained_bias = model.get_weights()[1]

  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the error for each epoch.
  hist = pd.DataFrame(history.history)

  # To track the progression of training, we're going to take a snapshot
  # of the model's root mean squared error at each epoch.
  rmse = hist["root_mean_squared_error"]

  return trained_weight, trained_bias, epochs, rmse


def run_experiment(df, feature_names, label_name, learning_rate, epochs, batch_size):

  print('INFO: starting training experiment with features={} and label={}\n'.format(feature_names, label_name))

  num_features = len(feature_names)

  features = df.loc[:, feature_names].values
  label = df[label_name].values

  model = build_model(learning_rate, num_features)
  model_output = train_model(model, df, features, label, epochs, batch_size)

  print('\nSUCCESS: training experiment complete\n')
  print('{}'.format(model_info(feature_names, label_name, model_output)))
  make_plots(df, feature_names, label_name, model_output)

  return model

print("SUCCESS: defining linear regression functions complete.")
#@title Code - Experiment 1
# The following variables are the hyperparameters.

training_df = training_df.sample(frac=0.1, random_state=42)  # Use 10% of the data
learning_rate = 0.01
epochs = 5
batch_size = 1

# Specify the feature and the label.
features = ['TRIP_MILES']

scaler = StandardScaler()
training_df[features] = scaler.fit_transform(training_df[features])
label = 'FARE'
print("training starting now...")
model_1 = run_experiment(training_df, features, label, learning_rate, epochs, batch_size)

SUCCESS: defining linear regression functions complete.
training starting now...
INFO: starting training experiment with features=['TRIP_MILES'] and label=FARE

Building the model


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-09-17 11:23:59.625705: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2024-09-17 11:23:59.625724: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-09-17 11:23:59.625728: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-09-17 11:23:59.625742: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-17 11:23:59.625752: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Num GPUs Available:  1
