## Install required libraries:

In [1]:
!pip install ray[default] google-cloud-bigquery scikit-learn
!pip install google-cloud-storage
!pip install google-auth
!pip install db-dtypes
!pip install 'google-cloud-bigquery[pandas]'
!pip install tensorflow

Collecting ray[default]
  Using cached ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl (62.5 MB)
Collecting google-cloud-bigquery
  Using cached google_cloud_bigquery-3.11.4-py2.py3-none-any.whl (219 kB)
Collecting filelock (from ray[default])
  Using cached filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting aiosignal (from ray[default])
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting frozenlist (from ray[default])
  Using cached frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)
Collecting aiohttp-cors (from ray[default])
  Using cached aiohttp_cors-0.7.0-py3-none-any.whl (27 kB)
Collecting colorful (from ray[default])
  Using cached colorful-0.5.5-py2.py3-none-any.whl (201 kB)
Collecting py-spy>=0.2.0 (from ray[default])
  Using cached py_spy-0.3.14-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (3.0 MB)
Collecting gpustat>=1.0.0 (from ray[default])
  Using cached gpustat-1.1.1-py3-n

## Now do the imports:

In [2]:
import logging
import ray
from google.cloud import bigquery
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from google.cloud import storage
from google.cloud import bigquery
from google.oauth2 import service_account
import db_dtypes

## Set the variables that will be used throughout the notebook

In [3]:
project_id = 'alexbu-gke-dev-c'

## Who am I?

In [4]:
import google.auth

credentials, project = google.auth.default()
if hasattr(credentials, 'service_account_email'):
    user_email = credentials.service_account_email
elif hasattr(credentials, 'quota_project_id'):
    user_email = credentials.quota_project_id
else:
    user_email = "Unable to determine user email."

print(f"Authenticated as: {user_email}")


Authenticated as: default


## Now get the key and load dataset

Note that this requires default account to have permissions:

```
PROJECT_ID=alexbu-gke-dev-c
PROJECT_NUMBER=$(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
SERVICE_ACCOUNT_EMAIL="${PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
gcloud projects add-iam-policy-binding $PROJECT_ID --member="serviceAccount:${SERVICE_ACCOUNT_EMAIL}" --role="roles/bigquery.jobUser"
```



In [5]:
# Specify your bucket name, source blob name (path in GCS), and a local destination path
bucket_name = 'yojowa-market-public'
source_blob_name = 'y_market_sa.json'
destination_file_name = 'y_market_sa.json'

In [6]:
def download_key_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print(f"Downloaded key from GCS bucket {bucket_name} to {destination_file_name}")





# only need be done once
# download_key_from_gcs(bucket_name, source_blob_name, destination_file_name)


In [7]:
# # Set up BigQuery client
# key_path=destination_file_name
# credentials = service_account.Credentials.from_service_account_file(key_path)
client = bigquery.Client(project=project_id)

# Load the dataset from BigQuery
query = "SELECT * FROM `yojowa-market-explorations.yojowa_market_dataset_01.y_table_1`"
df = client.query(query).to_dataframe()

# show what you loaded:

# Enumerate all columns
print("Columns:")
for idx, column in enumerate(df.columns, 1):
    print(f"{idx}. {column}")

# List row count
print(f"\nTotal Rows: {len(df)}")

# List 2 sample rows
print("\nSample Rows:")
print(df.sample(2))


Columns:
1. symbol
2. rel_diff
3. pe_ptile
4. pb_ptile
5. margin_ptile
6. rev_e_ptile
7. inst_own_ptile
8. eps_growth_ttm_ptile
9. eps_prev_qtr_vs_prev_yr_qtr_ptile
10. proj_eps_growth_curr_qtr_ptile
11. proj_eps_growth_curr_yr_next_yr_ptile
12. ret_eq_ttm_ptile
13. ttl_dept_eq_ttm_ptile
14. industry
15. sector

Total Rows: 128216

Sample Rows:
      symbol  rel_diff  pe_ptile  pb_ptile  margin_ptile  rev_e_ptile  \
83696   EMKR -0.407420       NaN       0.1         0.094        0.537   
2958    MNTK -0.583408       0.8       1.0         0.733        0.429   

       inst_own_ptile  eps_growth_ttm_ptile  \
83696           0.510                   NaN   
2958            0.267                   NaN   

       eps_prev_qtr_vs_prev_yr_qtr_ptile  proj_eps_growth_curr_qtr_ptile  \
83696                                NaN                             NaN   
2958                                 NaN                            0.75   

       proj_eps_growth_curr_yr_next_yr_ptile  ret_eq_ttm_ptile

In [8]:
# Initialize Ray

# Set Ray's logging level to WARNING to suppress INFO messages
logging.getLogger('ray').setLevel(logging.WARNING)

ray.init()

2023-09-26 22:59:07,321	INFO worker.py:1633 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Python version:,3.10.11
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8265


In [9]:
df.fillna(0.5, inplace=True)
df1 = df.drop(columns=['symbol', 'sector'])
y = df1['rel_diff']
X = df1.drop(columns=['rel_diff'])

# Define the training function with Ray's remote decorator
```
@ray.remote
def train_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model
```

# Train a separate model for each industry
```
models = {}

for industry, group in X.groupby('industry'):
    # Separate the target variable for this group
    y_group = y[group.index]
    X_group = group.drop(columns=['industry'])

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=0.2, random_state=42)

    # Train the model for this industry
    model_id = train_model.remote(X_train, y_train)
    model = ray.get(model_id)

    # Evaluate the model using the test set (optional)
    predictions = model.predict(X_test)

    # Store the model, test data, and predictions in the dictionary
    models[industry] = {
        'model': model,
        'X_test': X_test,
        'y_test': y_test,
        'predictions': predictions
    }
```



In [15]:
# train using deep learning

import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
import os
import sys
import numpy as np


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress INFO logs
tf.get_logger().setLevel('ERROR')  # Only show ERROR logs

class SuppressStdout:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

# Custom Keras callback to capture metrics progression
class MetricsHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        self.history = {'loss': [], 'val_loss': []}

    def on_epoch_end(self, epoch, logs=None):
        self.history['loss'].append(logs['loss'])
        self.history['val_loss'].append(logs['val_loss'])

# Normalize the data
scaler = MinMaxScaler()

# Drop the 'industry' column before scaling
X_numeric = X.drop(columns=['industry'])

# Normalize the data
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_numeric), columns=X_numeric.columns)

# Add the 'industry' column back to the scaled data
X_scaled['industry'] = X['industry'].values

# Define the neural network model
def create_nn_model(input_shape):
    model = keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(input_shape,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(1)  # Regression problem, so no activation in the output layer
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Define the training function with Ray's remote decorator
@ray.remote
def train_nn_model(industry, X_train, y_train, input_shape):
    nn_model = create_nn_model(input_shape)
    
    metrics_callback = MetricsHistory()
    nn_model.fit(X_train, y_train, epochs=80, batch_size=32, validation_split=0.1, verbose=2, callbacks=[metrics_callback])

    # Summarize the metrics progression in a single line
    summary = {
        'Industry': industry,
        'Final Loss': metrics_callback.history['loss'][-1],
        'Final Val Loss': metrics_callback.history['val_loss'][-1],
        'Loss Progression': f"{np.mean(metrics_callback.history['loss']):.4f} -> {metrics_callback.history['loss'][-1]:.4f}"
    }
    
    return nn_model, summary

# Train a separate model for each industry
models = {}
futures = []



for industry, group in X_scaled.groupby('industry'):
    
    # Separate the target variable for this group
    y_group = y[group.index]
    X_group = group.drop(columns=['industry'])

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_group, y_group, test_size=0.2, random_state=42)

    
    with SuppressStdout():
        # Train the neural network for this industry using Ray
        future = train_nn_model.remote(industry, X_train, y_train, X_train.shape[1])
        futures.append((industry, future, X_test, y_test))
        
    
# DataFrame to capture the summaries
summaries_df = pd.DataFrame(columns=['Industry', 'Final Loss', 'Final Val Loss', 'Loss Progression'])

for industry, future, X_test, y_test in futures:
    nn_model, summary = ray.get(future)

    # Append the summary to the DataFrame
    summaries_df.loc[len(summaries_df)] = summary
    
    # Evaluate the model using the test set (optional)
    predictions = nn_model.predict(X_test).flatten()

    # Store the model, test data, and predictions in the dictionary
    models[industry] = {
        'model': nn_model,
        'X_test': X_test,
        'y_test': y_test,
        'predictions': predictions
    }

display (summaries_df)


[2m[36m(train_nn_model pid=366)[0m Epoch 1/80
[2m[36m(train_nn_model pid=355)[0m 28/28 - 1s - loss: 0.0926 - val_loss: 0.0961 - 1s/epoch - 38ms/step
Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2023-09-26 23:13:44         1826
variables.h5                                   2023-09-26 23:13:44       139584
metadata.json                                  2023-09-26 23:13:44           64
[2m[36m(train_nn_model pid=363)[0m Epoch 31/80[32m [repeated 590x across cluster][0m
[2m[36m(train_nn_model pid=370)[0m Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
[2m[36m(train_nn_model pid=370)[0m ...layers
[2m[36m(train_nn_model pid=370)[0m ......dense
[2m[36m(train_nn_model pid=370)[0m .........vars
[2m[36m(train_nn_model pid=370)[0m ............0
[2m[36m(train_nn_model pid=370)[0m ............1
[2m[36m(train_nn_model pid=370)[0m ......dens

Unnamed: 0,Industry,Final Loss,Final Val Loss,Loss Progression
0,0.5,0.003149,0.476196,0.0106 -> 0.0031
1,Aerospace & Defense,0.015780,0.039479,0.0307 -> 0.0158
2,Air Freight & Logistics,0.007724,0.030063,0.0141 -> 0.0077
3,Airlines,0.056137,0.068434,0.0792 -> 0.0561
4,Auto Components,0.021090,0.044156,0.0422 -> 0.0211
...,...,...,...,...
66,Trading Companies & Distributors,0.018025,0.025641,0.0304 -> 0.0180
67,Transportation Infrastructure,0.004722,0.008508,0.0216 -> 0.0047
68,Water Utilities,0.007916,0.042737,0.0202 -> 0.0079
69,Wireless Telecommunication Services,0.030256,0.073871,0.1100 -> 0.0303


In [16]:
# Evaluate each model and print its accuracy metrics
results = pd.DataFrame(columns=['Industry', 'Train Count', 'Test Count', 'MAE', 'MSE', 'R^2', 'Assessment'])

for industry, data in models.items():
    mae = mean_absolute_error(data['y_test'], data['predictions'])
    mse = mean_squared_error(data['y_test'], data['predictions'])
    r2 = r2_score(data['y_test'], data['predictions'])
    train_count = len(data['y_test']) + len(data['predictions'])
    test_count = len(data['predictions'])
    assessment = 'Trustworthy' if r2 > 0.7 else 'Not Trustworthy'
    
    new_row = {
        'Industry': industry,
        'Train Count': train_count,
        'Test Count': test_count,
        'MAE': mae,
        'MSE': mse,
        'R^2': r2,
        'Assessment': assessment
    }
    
    results.loc[len(results)] = new_row

sorted_results = results.sort_values(by='R^2', ascending=False)

# Display the sorted results
display(sorted_results)

Unnamed: 0,Industry,Train Count,Test Count,MAE,MSE,R^2,Assessment
65,Tobacco,94,47,0.123893,0.023658,0.855261,Trustworthy
69,Wireless Telecommunication Services,204,102,0.208926,0.107328,0.818047,Trustworthy
9,Building Products,406,203,0.105951,0.020998,0.776595,Trustworthy
14,Construction & Engineering,372,186,0.123875,0.029656,0.760364,Trustworthy
36,Household Products,140,70,0.063439,0.007505,0.728465,Trustworthy
...,...,...,...,...,...,...,...
68,Water Utilities,150,75,0.133379,0.048658,0.124143,Not Trustworthy
15,Construction Materials,140,70,0.132915,0.044533,0.039801,Not Trustworthy
70,unknown,1150,575,0.197148,0.124153,0.013142,Not Trustworthy
67,Transportation Infrastructure,52,26,0.143823,0.039023,-0.203710,Not Trustworthy


## Prediction Function
```
def predict_rel_diff(input_data):
    # Ensure input_data doesn't have the 'rel_diff' column
    if 'rel_diff' in input_data.columns:
        input_data = input_data.drop(columns=['rel_diff'])
    
    predictions = model.predict(input_data)
    return predictions

# Example usage:
# new_data = pd.read_csv("path_to_new_data.csv")
# predicted_rel_diff = predict_rel_diff(new_data)
# print(predicted_rel_diff)
```

* Is this using linear multi-variable regression?
  * Linear Regression: The model used is LinearRegression() from scikit-learn, which performs linear regression.
  * Multi-variable: Since the dataset contains multiple feature columns (all columns except rel_diff), the regression is multi-variable. The model is trained to predict the target variable rel_diff based on multiple input features.

## Can I use deep learning in this scenario?
Yes, you can use deep learning for regression tasks like this one. Deep learning models, especially feedforward neural networks (often just called "neural networks"), can be used for regression tasks where the output is a continuous value, such as predicting rel_diff in your scenario.

Here's a basic outline of how you can approach this using TensorFlow and Keras:

Setup:
First, set up your environment and import necessary libraries.

```python
!pip install tensorflow

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
```
### Data Preprocessing:
Neural networks often benefit from feature scaling. Standardizing the input features so they have a mean of 0 and a standard deviation of 1 can help in training.

```python
# Splitting the dataset into training and test sets
X = df.drop(columns=['rel_diff'])
y = df['rel_diff']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
```


### Model Building:
Define a simple feedforward neural network.

```python
model = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1)  # Single output neuron for regression
])

model.compile(optimizer='adam', loss='mean_squared_error')
```

### Model Training:
Train the neural network.

```python
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)
```

### Evaluation and Prediction:
Evaluate the model on the test set and make predictions.

```python
loss = model.evaluate(X_test_scaled, y_test)
print(f"Mean Squared Error on Test Set: {loss}")

# Predicting rel_diff values
predicted_rel_diff = model.predict(X_test_scaled)
```
