
## Regression Models for Future Values
This section contains the code for training polynomial regression models on our historical data in order to predict future values of several financial metrics (the same metrics that are used in our neural network/random forest training code).

Polynomial regression is a good fit when the data does not follow a straight line but instead follows a curve, as our historical financial data does. It captures non-linear relationships, where the dependence between variables bends rather than staying linear. For instance, it is useful for modeling population growth, stock prices, or any situation where quantities grow at different rates over time. By adding powers of the features (such as squared or cubic terms), it improves accuracy on complex patterns. There is a catch, however: a polynomial degree that is too high can lead to overfitting, meaning the model becomes too detailed and fits the random noise in the data rather than the actual trend. Polynomial regression is flexible and often works well in practice, but the overfitting risk has to be watched. I decided to use a polynomial degree of 5, aiming for high accuracy without overfitting to our data set.


In [None]:
# Import necessary libraries for data manipulation, machine learning, and visualization
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from sklearn.linear_model import LinearRegression  # For linear regression modeling
from sklearn.preprocessing import RobustScaler, PolynomialFeatures  # For data scaling and feature engineering
from sklearn.model_selection import TimeSeriesSplit  # For time series cross-validation
from sklearn.pipeline import make_pipeline  # For creating a sequence of data transformations
import matplotlib.pyplot as plt  # For creating visualizations

# Define a function to prepare the data for analysis
def prepare_data(df, tickers, min_data_points=5, target_column='Revenue'):
    """Build per-ticker regression inputs from the long-format financial table.

    For every ticker the rows are sorted by ``Year`` and the target column is
    scaled with a ``RobustScaler`` fitted on that ticker alone.

    Args:
        df: DataFrame with at least the columns ``Ticker``, ``Year`` and
            ``target_column``.
        tickers: Iterable of ticker symbols to process.
        min_data_points: Minimum number of rows a ticker needs to be kept.
        target_column: Name of the column to model. Defaults to ``'Revenue'``,
            which is the column this cell was originally hard-wired to.

    Returns:
        dict mapping each retained ticker to a dict with the keys ``'X'``
        (years as an ``(n, 1)`` array), ``'y'`` (scaled target, 1-D),
        ``'scaler'`` (the fitted ``RobustScaler``) and ``'years'`` (1-D years).
        Tickers with too few rows or with NaN targets are reported via
        ``print`` and left out.
    """
    data_dict = {}

    for ticker in tickers:
        # Keep only this ticker's rows, in chronological order
        df_ticker = df.loc[df['Ticker'] == ticker, ['Year', target_column]].sort_values('Year')

        # Check if there's sufficient data for analysis
        if len(df_ticker) < min_data_points:
            print(f"Skipping {ticker} due to insufficient data points ({len(df_ticker)}).")
            continue

        # Reject incomplete series before fitting anything: RobustScaler passes
        # NaN through unchanged, so a NaN in the raw column would end up in y.
        if df_ticker[target_column].isna().any():
            print(f"Skipping {ticker} due to NaN values in target variable y.")
            continue

        # Scale the target to reduce the impact of outliers
        scaler = RobustScaler()
        scaled_data = scaler.fit_transform(df_ticker[[target_column]])

        years = df_ticker['Year'].values

        # Store the processed data in the dictionary
        data_dict[ticker] = {
            'X': years.reshape(-1, 1),
            'y': scaled_data.reshape(-1),
            'scaler': scaler,
            'years': years,
        }

    return data_dict

# Define a function to evaluate the model using cross-validation
def evaluate_with_cross_validation(data_dict, tickers, degree=2, max_splits=5):
    """Score a polynomial regression per ticker with time-series cross-validation.

    Args:
        data_dict: Mapping of ticker -> dict holding at least ``'X'`` and ``'y'``.
        tickers: Iterable of ticker symbols; symbols absent from ``data_dict``
            are ignored.
        degree: Degree passed to ``PolynomialFeatures``.
        max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.

    Returns:
        list of ``{'Ticker': ..., 'CV_Loss': ...}`` dicts, where ``CV_Loss`` is
        the mean over folds of the mean squared error on the held-out part.
        Tickers that allow fewer than two folds are reported and omitted.
    """
    summary = []

    # Only tickers that survived data preparation are evaluated
    for symbol in (t for t in tickers if t in data_dict):
        features = data_dict[symbol]['X']
        targets = data_dict[symbol]['y']

        # The fold count is capped at one less than the number of samples
        fold_count = min(max_splits, len(features) - 1)

        if fold_count >= 2:
            splitter = TimeSeriesSplit(n_splits=fold_count)
            fold_losses = []

            for train_idx, test_idx in splitter.split(features):
                # A fresh pipeline per fold, trained on the earlier samples
                pipeline = make_pipeline(PolynomialFeatures(degree), LinearRegression())
                pipeline.fit(features[train_idx], targets[train_idx])

                # Mean squared error on the later, held-out samples
                residuals = pipeline.predict(features[test_idx]) - targets[test_idx]
                fold_mse = np.mean(residuals ** 2)
                print(f"Test Loss for {symbol} fold: {fold_mse}")
                fold_losses.append(fold_mse)

            summary.append({'Ticker': symbol, 'CV_Loss': np.mean(fold_losses)})
        else:
            print(f"Skipping cross-validation for {symbol} due to insufficient data points for splitting.")

    return summary

# Load the historical financial data collected earlier; the forecasts below are fitted on it
df = pd.read_csv('historical_financial_data_eodhdsecondversion.csv')

# Get the list of unique stock tickers
tickers = df['Ticker'].unique()

# Prepare the data for analysis
data_dict = prepare_data(df, tickers, min_data_points=5)

# Set the complexity of the polynomial model
poly_degree = 5

# Evaluate the model using cross-validation
cv_results = evaluate_with_cross_validation(data_dict, tickers, degree=poly_degree)

# Initialize an empty list to store prediction results
results = []

# Iterate through each stock ticker
for ticker in tickers:
    if ticker not in data_dict:  # Skip tickers that were filtered out
        continue

    data = data_dict[ticker]
    X, y, scaler = data['X'], data['y'], data['scaler']

    # Create and train the model on the ticker's full history
    model = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    model.fit(X, y)

    # Generate predictions for the next 3 years.
    # The cast keeps this working when pandas parsed the Year column as float.
    last_year = int(X[-1, 0])
    future_years = np.arange(last_year + 1, last_year + 4).reshape(-1, 1)
    predictions = model.predict(future_years)

    # Convert predictions back to the original scale
    # (despite its name, predictions_scaled holds the un-scaled values)
    predictions_scaled = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()
    print(f"Predictions for {ticker}:", predictions_scaled)

    # Store the results
    results.append({
        'Ticker': ticker,
        'Predictions': predictions_scaled.tolist()
    })

    # Visualize the actual vs predicted values
    actual_values = scaler.inverse_transform(y.reshape(-1, 1)).flatten()
    years = data['years']

    plt.figure(figsize=(12, 6))
    plt.scatter(years, actual_values, label='Actual Values', color='blue')
    plt.plot(years, scaler.inverse_transform(model.predict(X).reshape(-1, 1)).flatten(),
             label='Fitted Curve', color='red')
    plt.plot(future_years, predictions_scaled, label='Predictions', linestyle='--', color='green')
    plt.xlabel('Year')
    plt.ylabel('Revenue')
    plt.title(f'Actual vs Predicted Revenue for {ticker} (Polynomial Degree: {poly_degree})')
    plt.legend()
    plt.show()

# Convert results to DataFrames for easy handling
# (results_cv_df is kept for inspection in the notebook; it is not written to disk)
results_df = pd.DataFrame(results)
results_cv_df = pd.DataFrame(cv_results)

# Save the predictions; the path is defined once so the message below cannot
# drift from the file that is actually written
predictions_path = 'revenue_model_predictions_polynomial_regression3.csv'
results_df.to_csv(predictions_path, index=False)

print(f"Results saved to {predictions_path}")

In [None]:
#predictions for EBITDA - same as above
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

def prepare_data(df, tickers, min_data_points=5, target_column='EBITDA'):
    """Build per-ticker regression inputs from the long-format financial table.

    For every ticker the rows are sorted by ``Year`` and the target column is
    scaled with a ``RobustScaler`` fitted on that ticker alone.

    Args:
        df: DataFrame with at least the columns ``Ticker``, ``Year`` and
            ``target_column``.
        tickers: Iterable of ticker symbols to process.
        min_data_points: Minimum number of rows a ticker needs to be kept.
        target_column: Name of the column to model. Defaults to ``'EBITDA'``,
            which is the column this cell was originally hard-wired to.

    Returns:
        dict mapping each retained ticker to a dict with the keys ``'X'``
        (years as an ``(n, 1)`` array), ``'y'`` (scaled target, 1-D),
        ``'scaler'`` (the fitted ``RobustScaler``) and ``'years'`` (1-D years).
        Tickers with too few rows or with NaN targets are reported via
        ``print`` and left out.
    """
    data_dict = {}
    for ticker in tickers:
        # Keep only this ticker's rows, in chronological order
        df_ticker = df.loc[df['Ticker'] == ticker, ['Year', target_column]].sort_values('Year')

        # Check if the ticker has at least 'min_data_points' data points
        if len(df_ticker) < min_data_points:
            print(f"Skipping {ticker} due to insufficient data points ({len(df_ticker)}).")
            continue

        # Reject incomplete series before fitting anything: RobustScaler passes
        # NaN through unchanged, so a NaN in the raw column would end up in y.
        if df_ticker[target_column].isna().any():
            print(f"Skipping {ticker} due to NaN values in target variable y.")
            continue

        scaler = RobustScaler()
        scaled_data = scaler.fit_transform(df_ticker[[target_column]])

        years = df_ticker['Year'].values

        data_dict[ticker] = {
            'X': years.reshape(-1, 1),
            'y': scaled_data.reshape(-1),
            'scaler': scaler,
            'years': years,
        }

    return data_dict

def evaluate_with_cross_validation(data_dict, tickers, degree=2, max_splits=5):
    """Score a polynomial regression per ticker with time-series cross-validation.

    Args:
        data_dict: Mapping of ticker -> dict holding at least ``'X'`` and ``'y'``.
        tickers: Iterable of ticker symbols; symbols absent from ``data_dict``
            are ignored.
        degree: Degree passed to ``PolynomialFeatures``.
        max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.

    Returns:
        list of ``{'Ticker': ..., 'CV_Loss': ...}`` dicts, where ``CV_Loss`` is
        the mean over folds of the mean squared error on the held-out part.
        Tickers that allow fewer than two folds are reported and omitted.
    """
    summary = []

    # Only tickers that survived data preparation are evaluated
    for symbol in (t for t in tickers if t in data_dict):
        features = data_dict[symbol]['X']
        targets = data_dict[symbol]['y']

        # The fold count is capped at one less than the number of samples
        fold_count = min(max_splits, len(features) - 1)

        if fold_count >= 2:
            splitter = TimeSeriesSplit(n_splits=fold_count)
            fold_losses = []

            for train_idx, test_idx in splitter.split(features):
                # A fresh pipeline per fold, trained on the earlier samples
                pipeline = make_pipeline(PolynomialFeatures(degree), LinearRegression())
                pipeline.fit(features[train_idx], targets[train_idx])

                # Mean squared error on the later, held-out samples
                residuals = pipeline.predict(features[test_idx]) - targets[test_idx]
                fold_mse = np.mean(residuals ** 2)
                print(f"Test Loss for {symbol} fold: {fold_mse}")
                fold_losses.append(fold_mse)

            summary.append({'Ticker': symbol, 'CV_Loss': np.mean(fold_losses)})
        else:
            print(f"Skipping cross-validation for {symbol} due to insufficient data points for splitting.")

    return summary

# Load the data
df = pd.read_csv('historical_financial_data_eodhdsecondversion.csv')

# List of tickers to include
tickers = df['Ticker'].unique()

# Prepare data with a minimum data points threshold
data_dict = prepare_data(df, tickers, min_data_points=5)

# Set the degree of the polynomial
poly_degree = 5

# Metric modeled in this cell; used for plot labels and the long-format column name
metric = 'EBITDA'

# Number of years to forecast beyond each ticker's last observed year
forecast_horizon = 3

# Evaluate with cross-validation
cv_results = evaluate_with_cross_validation(data_dict, tickers, degree=poly_degree)

# One row per ticker (wide format) and one row per ticker/year (long format)
results = []
long_format_data = []

# Iterate over each ticker
for ticker in tickers:
    if ticker not in data_dict:  # Skip tickers that were filtered out
        continue

    data = data_dict[ticker]
    X, y, scaler = data['X'], data['y'], data['scaler']

    # Build and fit the model on the ticker's full history
    model = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    model.fit(X, y)

    # Make predictions for the next `forecast_horizon` years.
    # The cast keeps this working when pandas parsed the Year column as float.
    last_year = int(X[-1, 0])
    future_years = np.arange(last_year + 1, last_year + 1 + forecast_horizon).reshape(-1, 1)
    predictions = model.predict(future_years)

    # Inverse transform predictions
    # (despite its name, predictions_scaled holds the un-scaled values)
    predictions_scaled = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()
    print(f"Predictions for {ticker}:", predictions_scaled)

    # Store the results
    results.append({
        'Ticker': ticker,
        'Predictions': predictions_scaled.tolist()
    })

    # Label every forecast with the year it was actually predicted for, rather
    # than assuming each ticker's history ends in 2023.
    long_format_data.extend(
        [int(year), ticker, float(value)]
        for year, value in zip(future_years.ravel(), predictions_scaled)
    )

    # Plotting the actual vs predicted values
    actual_values = scaler.inverse_transform(y.reshape(-1, 1)).flatten()
    years = data['years']

    plt.figure(figsize=(12, 6))
    plt.scatter(years, actual_values, label='Actual Values', color='blue')
    plt.plot(years, scaler.inverse_transform(model.predict(X).reshape(-1, 1)).flatten(),
             label='Fitted Curve', color='red')
    plt.plot(future_years, predictions_scaled, label='Predictions', linestyle='--', color='green')
    plt.xlabel('Year')
    plt.ylabel(metric)
    plt.title(f'Actual vs Predicted {metric} for {ticker} (Polynomial Degree: {poly_degree})')
    plt.legend()
    plt.show()

# Convert results to DataFrames
# (results_cv_df is kept for inspection in the notebook; it is not written to disk)
results_df = pd.DataFrame(results)
results_cv_df = pd.DataFrame(cv_results)

# Save to CSV
results_df.to_csv('ebitda_model_predictions.csv', index=False)

# Build the long-format table directly from memory. Re-reading the CSV and
# parsing the list column with ast.literal_eval would fail on nan/inf
# predictions and lets pandas re-interpret the ticker strings.
long_format_df = pd.DataFrame(long_format_data, columns=['Year', 'Ticker', metric])

# Save the long-format DataFrame to a new CSV file
long_format_df.to_csv('ebitda_model_predictions_formatted.csv', index=False)

In [None]:
#prediction for market cap
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

def prepare_data(df, tickers, min_data_points=5, target_column='Market Capitalization'):
    """Build per-ticker regression inputs from the long-format financial table.

    For every ticker the rows are sorted by ``Year`` and the target column is
    scaled with a ``RobustScaler`` fitted on that ticker alone.

    Args:
        df: DataFrame with at least the columns ``Ticker``, ``Year`` and
            ``target_column``.
        tickers: Iterable of ticker symbols to process.
        min_data_points: Minimum number of rows a ticker needs to be kept.
        target_column: Name of the column to model. Defaults to
            ``'Market Capitalization'``, which is the column this cell was
            originally hard-wired to.

    Returns:
        dict mapping each retained ticker to a dict with the keys ``'X'``
        (years as an ``(n, 1)`` array), ``'y'`` (scaled target, 1-D),
        ``'scaler'`` (the fitted ``RobustScaler``) and ``'years'`` (1-D years).
        Tickers with too few rows or with NaN targets are reported via
        ``print`` and left out.
    """
    data_dict = {}
    for ticker in tickers:
        # Keep only this ticker's rows, in chronological order
        df_ticker = df.loc[df['Ticker'] == ticker, ['Year', target_column]].sort_values('Year')

        # Check if the ticker has at least 'min_data_points' data points
        if len(df_ticker) < min_data_points:
            print(f"Skipping {ticker} due to insufficient data points ({len(df_ticker)}).")
            continue

        # Reject incomplete series before fitting anything: RobustScaler passes
        # NaN through unchanged, so a NaN in the raw column would end up in y.
        if df_ticker[target_column].isna().any():
            print(f"Skipping {ticker} due to NaN values in target variable y.")
            continue

        scaler = RobustScaler()
        scaled_data = scaler.fit_transform(df_ticker[[target_column]])

        years = df_ticker['Year'].values

        data_dict[ticker] = {
            'X': years.reshape(-1, 1),
            'y': scaled_data.reshape(-1),
            'scaler': scaler,
            'years': years,
        }

    return data_dict

def evaluate_with_cross_validation(data_dict, tickers, degree=2, max_splits=5):
    """Score a polynomial regression per ticker with time-series cross-validation.

    Args:
        data_dict: Mapping of ticker -> dict holding at least ``'X'`` and ``'y'``.
        tickers: Iterable of ticker symbols; symbols absent from ``data_dict``
            are ignored.
        degree: Degree passed to ``PolynomialFeatures``.
        max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.

    Returns:
        list of ``{'Ticker': ..., 'CV_Loss': ...}`` dicts, where ``CV_Loss`` is
        the mean over folds of the mean squared error on the held-out part.
        Tickers that allow fewer than two folds are reported and omitted.
    """
    summary = []

    # Only tickers that survived data preparation are evaluated
    for symbol in (t for t in tickers if t in data_dict):
        features = data_dict[symbol]['X']
        targets = data_dict[symbol]['y']

        # The fold count is capped at one less than the number of samples
        fold_count = min(max_splits, len(features) - 1)

        if fold_count >= 2:
            splitter = TimeSeriesSplit(n_splits=fold_count)
            fold_losses = []

            for train_idx, test_idx in splitter.split(features):
                # A fresh pipeline per fold, trained on the earlier samples
                pipeline = make_pipeline(PolynomialFeatures(degree), LinearRegression())
                pipeline.fit(features[train_idx], targets[train_idx])

                # Mean squared error on the later, held-out samples
                residuals = pipeline.predict(features[test_idx]) - targets[test_idx]
                fold_mse = np.mean(residuals ** 2)
                print(f"Test Loss for {symbol} fold: {fold_mse}")
                fold_losses.append(fold_mse)

            summary.append({'Ticker': symbol, 'CV_Loss': np.mean(fold_losses)})
        else:
            print(f"Skipping cross-validation for {symbol} due to insufficient data points for splitting.")

    return summary

# Load the data
df = pd.read_csv('historical_financial_data_eodhdsecondversion.csv')

# List of tickers to include
tickers = df['Ticker'].unique()

# Prepare data with a minimum data points threshold
data_dict = prepare_data(df, tickers, min_data_points=5)

# Set the degree of the polynomial
poly_degree = 5

# Metric modeled in this cell; used for plot labels and the long-format column name
metric = 'Market Capitalization'

# Number of years to forecast beyond each ticker's last observed year
forecast_horizon = 3

# Evaluate with cross-validation
cv_results = evaluate_with_cross_validation(data_dict, tickers, degree=poly_degree)

# One row per ticker (wide format) and one row per ticker/year (long format)
results = []
long_format_data = []

# Iterate over each ticker
for ticker in tickers:
    if ticker not in data_dict:  # Skip tickers that were filtered out
        continue

    data = data_dict[ticker]
    X, y, scaler = data['X'], data['y'], data['scaler']

    # Build and fit the model on the ticker's full history
    model = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    model.fit(X, y)

    # Make predictions for the next `forecast_horizon` years.
    # The cast keeps this working when pandas parsed the Year column as float.
    last_year = int(X[-1, 0])
    future_years = np.arange(last_year + 1, last_year + 1 + forecast_horizon).reshape(-1, 1)
    predictions = model.predict(future_years)

    # Inverse transform predictions
    # (despite its name, predictions_scaled holds the un-scaled values)
    predictions_scaled = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()
    print(f"Predictions for {ticker}:", predictions_scaled)

    # Store the results
    results.append({
        'Ticker': ticker,
        'Predictions': predictions_scaled.tolist()
    })

    # Label every forecast with the year it was actually predicted for, rather
    # than assuming each ticker's history ends in 2023.
    long_format_data.extend(
        [int(year), ticker, float(value)]
        for year, value in zip(future_years.ravel(), predictions_scaled)
    )

    # Plotting the actual vs predicted values
    actual_values = scaler.inverse_transform(y.reshape(-1, 1)).flatten()
    years = data['years']

    plt.figure(figsize=(12, 6))
    plt.scatter(years, actual_values, label='Actual Values', color='blue')
    plt.plot(years, scaler.inverse_transform(model.predict(X).reshape(-1, 1)).flatten(),
             label='Fitted Curve', color='red')
    plt.plot(future_years, predictions_scaled, label='Predictions', linestyle='--', color='green')
    plt.xlabel('Year')
    plt.ylabel(metric)
    plt.title(f'Actual vs Predicted {metric} for {ticker} (Polynomial Degree: {poly_degree})')
    plt.legend()
    plt.show()

# Convert results to DataFrames
# (results_cv_df is kept for inspection in the notebook; it is not written to disk)
results_df = pd.DataFrame(results)
results_cv_df = pd.DataFrame(cv_results)

# Save to CSV
results_df.to_csv('mc_model_predictions.csv', index=False)

# Build the long-format table directly from memory. Re-reading the CSV and
# parsing the list column with ast.literal_eval would fail on nan/inf
# predictions and lets pandas re-interpret the ticker strings.
long_format_df = pd.DataFrame(long_format_data, columns=['Year', 'Ticker', metric])

# Save the long-format DataFrame to a new CSV file
long_format_df.to_csv('mc_model_predictions_formatted.csv', index=False)

In [None]:
#prediction for Gross Profit
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

def prepare_data(df, tickers, min_data_points=5, target_column='Gross Profit'):
    """Build per-ticker regression inputs from the long-format financial table.

    For every ticker the rows are sorted by ``Year`` and the target column is
    scaled with a ``RobustScaler`` fitted on that ticker alone.

    Args:
        df: DataFrame with at least the columns ``Ticker``, ``Year`` and
            ``target_column``.
        tickers: Iterable of ticker symbols to process.
        min_data_points: Minimum number of rows a ticker needs to be kept.
        target_column: Name of the column to model. Defaults to
            ``'Gross Profit'``, which is the column this cell was originally
            hard-wired to.

    Returns:
        dict mapping each retained ticker to a dict with the keys ``'X'``
        (years as an ``(n, 1)`` array), ``'y'`` (scaled target, 1-D),
        ``'scaler'`` (the fitted ``RobustScaler``) and ``'years'`` (1-D years).
        Tickers with too few rows or with NaN targets are reported via
        ``print`` and left out.
    """
    data_dict = {}
    for ticker in tickers:
        # Keep only this ticker's rows, in chronological order
        df_ticker = df.loc[df['Ticker'] == ticker, ['Year', target_column]].sort_values('Year')

        # Check if the ticker has at least 'min_data_points' data points
        if len(df_ticker) < min_data_points:
            print(f"Skipping {ticker} due to insufficient data points ({len(df_ticker)}).")
            continue

        # Reject incomplete series before fitting anything: RobustScaler passes
        # NaN through unchanged, so a NaN in the raw column would end up in y.
        if df_ticker[target_column].isna().any():
            print(f"Skipping {ticker} due to NaN values in target variable y.")
            continue

        scaler = RobustScaler()
        scaled_data = scaler.fit_transform(df_ticker[[target_column]])

        years = df_ticker['Year'].values

        data_dict[ticker] = {
            'X': years.reshape(-1, 1),
            'y': scaled_data.reshape(-1),
            'scaler': scaler,
            'years': years,
        }

    return data_dict

def evaluate_with_cross_validation(data_dict, tickers, degree=2, max_splits=5):
    """Score a polynomial regression per ticker with time-series cross-validation.

    Args:
        data_dict: Mapping of ticker -> dict holding at least ``'X'`` and ``'y'``.
        tickers: Iterable of ticker symbols; symbols absent from ``data_dict``
            are ignored.
        degree: Degree passed to ``PolynomialFeatures``.
        max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.

    Returns:
        list of ``{'Ticker': ..., 'CV_Loss': ...}`` dicts, where ``CV_Loss`` is
        the mean over folds of the mean squared error on the held-out part.
        Tickers that allow fewer than two folds are reported and omitted.
    """
    summary = []

    # Only tickers that survived data preparation are evaluated
    for symbol in (t for t in tickers if t in data_dict):
        features = data_dict[symbol]['X']
        targets = data_dict[symbol]['y']

        # The fold count is capped at one less than the number of samples
        fold_count = min(max_splits, len(features) - 1)

        if fold_count >= 2:
            splitter = TimeSeriesSplit(n_splits=fold_count)
            fold_losses = []

            for train_idx, test_idx in splitter.split(features):
                # A fresh pipeline per fold, trained on the earlier samples
                pipeline = make_pipeline(PolynomialFeatures(degree), LinearRegression())
                pipeline.fit(features[train_idx], targets[train_idx])

                # Mean squared error on the later, held-out samples
                residuals = pipeline.predict(features[test_idx]) - targets[test_idx]
                fold_mse = np.mean(residuals ** 2)
                print(f"Test Loss for {symbol} fold: {fold_mse}")
                fold_losses.append(fold_mse)

            summary.append({'Ticker': symbol, 'CV_Loss': np.mean(fold_losses)})
        else:
            print(f"Skipping cross-validation for {symbol} due to insufficient data points for splitting.")

    return summary

# Load the data
df = pd.read_csv('historical_financial_data_eodhdsecondversion.csv')

# List of tickers to include
tickers = df['Ticker'].unique()

# Prepare data with a minimum data points threshold
data_dict = prepare_data(df, tickers, min_data_points=5)

# Set the degree of the polynomial
poly_degree = 5

# Metric modeled in this cell; used for the y-axis label and the long-format column name
metric = 'Gross Profit'

# Number of years to forecast beyond each ticker's last observed year
forecast_horizon = 3

# Evaluate with cross-validation
cv_results = evaluate_with_cross_validation(data_dict, tickers, degree=poly_degree)

# One row per ticker (wide format) and one row per ticker/year (long format)
results = []
long_format_data = []

# Iterate over each ticker
for ticker in tickers:
    if ticker not in data_dict:  # Skip tickers that were filtered out
        continue

    data = data_dict[ticker]
    X, y, scaler = data['X'], data['y'], data['scaler']

    # Build and fit the model on the ticker's full history
    model = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    model.fit(X, y)

    # Make predictions for the next `forecast_horizon` years.
    # The cast keeps this working when pandas parsed the Year column as float.
    last_year = int(X[-1, 0])
    future_years = np.arange(last_year + 1, last_year + 1 + forecast_horizon).reshape(-1, 1)
    predictions = model.predict(future_years)

    # Inverse transform predictions
    # (despite its name, predictions_scaled holds the un-scaled values)
    predictions_scaled = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()
    print(f"Predictions for {ticker}:", predictions_scaled)

    # Store the results
    results.append({
        'Ticker': ticker,
        'Predictions': predictions_scaled.tolist()
    })

    # Label every forecast with the year it was actually predicted for, rather
    # than assuming each ticker's history ends in 2023.
    long_format_data.extend(
        [int(year), ticker, float(value)]
        for year, value in zip(future_years.ravel(), predictions_scaled)
    )

    # Plotting the actual vs predicted values
    actual_values = scaler.inverse_transform(y.reshape(-1, 1)).flatten()
    years = data['years']

    plt.figure(figsize=(12, 6))
    plt.scatter(years, actual_values, label='Actual Values', color='blue')
    plt.plot(years, scaler.inverse_transform(model.predict(X).reshape(-1, 1)).flatten(),
             label='Fitted Curve', color='red')
    plt.plot(future_years, predictions_scaled, label='Predictions', linestyle='--', color='green')
    plt.xlabel('Year')
    plt.ylabel(metric)
    plt.title(f'Actual vs Predicted gp for {ticker} (Polynomial Degree: {poly_degree})')
    plt.legend()
    plt.show()

# Convert results to DataFrames
# (results_cv_df is kept for inspection in the notebook; it is not written to disk)
results_df = pd.DataFrame(results)
results_cv_df = pd.DataFrame(cv_results)

# Save to CSV
results_df.to_csv('gp_model_predictions.csv', index=False)

# Build the long-format table directly from memory. Re-reading the CSV and
# parsing the list column with ast.literal_eval would fail on nan/inf
# predictions and lets pandas re-interpret the ticker strings.
long_format_df = pd.DataFrame(long_format_data, columns=['Year', 'Ticker', metric])

# Save the long-format DataFrame to a new CSV file
long_format_df.to_csv('gp_model_predictions_formatted.csv', index=False)

In [None]:
#prediction for Total Assets
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

def prepare_data(df, tickers, min_data_points=5, target_column='Total Assets'):
    """Build per-ticker regression inputs from the long-format financial table.

    For every ticker the rows are sorted by ``Year`` and the target column is
    scaled with a ``RobustScaler`` fitted on that ticker alone.

    Args:
        df: DataFrame with at least the columns ``Ticker``, ``Year`` and
            ``target_column``.
        tickers: Iterable of ticker symbols to process.
        min_data_points: Minimum number of rows a ticker needs to be kept.
        target_column: Name of the column to model. Defaults to
            ``'Total Assets'``, which is the column this cell was originally
            hard-wired to.

    Returns:
        dict mapping each retained ticker to a dict with the keys ``'X'``
        (years as an ``(n, 1)`` array), ``'y'`` (scaled target, 1-D),
        ``'scaler'`` (the fitted ``RobustScaler``) and ``'years'`` (1-D years).
        Tickers with too few rows or with NaN targets are reported via
        ``print`` and left out.
    """
    data_dict = {}
    for ticker in tickers:
        # Keep only this ticker's rows, in chronological order
        df_ticker = df.loc[df['Ticker'] == ticker, ['Year', target_column]].sort_values('Year')

        # Check if the ticker has at least 'min_data_points' data points
        if len(df_ticker) < min_data_points:
            print(f"Skipping {ticker} due to insufficient data points ({len(df_ticker)}).")
            continue

        # Reject incomplete series before fitting anything: RobustScaler passes
        # NaN through unchanged, so a NaN in the raw column would end up in y.
        if df_ticker[target_column].isna().any():
            print(f"Skipping {ticker} due to NaN values in target variable y.")
            continue

        scaler = RobustScaler()
        scaled_data = scaler.fit_transform(df_ticker[[target_column]])

        years = df_ticker['Year'].values

        data_dict[ticker] = {
            'X': years.reshape(-1, 1),
            'y': scaled_data.reshape(-1),
            'scaler': scaler,
            'years': years,
        }

    return data_dict

def evaluate_with_cross_validation(data_dict, tickers, degree=2, max_splits=5):
    """Score a polynomial regression per ticker with time-series cross-validation.

    Args:
        data_dict: Mapping of ticker -> dict holding at least ``'X'`` and ``'y'``.
        tickers: Iterable of ticker symbols; symbols absent from ``data_dict``
            are ignored.
        degree: Degree passed to ``PolynomialFeatures``.
        max_splits: Upper bound on the number of ``TimeSeriesSplit`` folds.

    Returns:
        list of ``{'Ticker': ..., 'CV_Loss': ...}`` dicts, where ``CV_Loss`` is
        the mean over folds of the mean squared error on the held-out part.
        Tickers that allow fewer than two folds are reported and omitted.
    """
    summary = []

    # Only tickers that survived data preparation are evaluated
    for symbol in (t for t in tickers if t in data_dict):
        features = data_dict[symbol]['X']
        targets = data_dict[symbol]['y']

        # The fold count is capped at one less than the number of samples
        fold_count = min(max_splits, len(features) - 1)

        if fold_count >= 2:
            splitter = TimeSeriesSplit(n_splits=fold_count)
            fold_losses = []

            for train_idx, test_idx in splitter.split(features):
                # A fresh pipeline per fold, trained on the earlier samples
                pipeline = make_pipeline(PolynomialFeatures(degree), LinearRegression())
                pipeline.fit(features[train_idx], targets[train_idx])

                # Mean squared error on the later, held-out samples
                residuals = pipeline.predict(features[test_idx]) - targets[test_idx]
                fold_mse = np.mean(residuals ** 2)
                print(f"Test Loss for {symbol} fold: {fold_mse}")
                fold_losses.append(fold_mse)

            summary.append({'Ticker': symbol, 'CV_Loss': np.mean(fold_losses)})
        else:
            print(f"Skipping cross-validation for {symbol} due to insufficient data points for splitting.")

    return summary

# Load the data
df = pd.read_csv('historical_financial_data_eodhdsecondversion.csv')

# List of tickers to include
tickers = df['Ticker'].unique()

# Prepare data with a minimum data points threshold
data_dict = prepare_data(df, tickers, min_data_points=5)

# Set the degree of the polynomial
poly_degree = 5

# Metric modeled in this cell; used for plot labels and the long-format column name
metric = 'Total Assets'

# Number of years to forecast beyond each ticker's last observed year
forecast_horizon = 3

# Evaluate with cross-validation
cv_results = evaluate_with_cross_validation(data_dict, tickers, degree=poly_degree)

# One row per ticker (wide format) and one row per ticker/year (long format)
results = []
long_format_data = []

# Iterate over each ticker
for ticker in tickers:
    if ticker not in data_dict:  # Skip tickers that were filtered out
        continue

    data = data_dict[ticker]
    X, y, scaler = data['X'], data['y'], data['scaler']

    # Build and fit the model on the ticker's full history
    model = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    model.fit(X, y)

    # Make predictions for the next `forecast_horizon` years.
    # The cast keeps this working when pandas parsed the Year column as float.
    last_year = int(X[-1, 0])
    future_years = np.arange(last_year + 1, last_year + 1 + forecast_horizon).reshape(-1, 1)
    predictions = model.predict(future_years)

    # Inverse transform predictions
    # (despite its name, predictions_scaled holds the un-scaled values)
    predictions_scaled = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()
    print(f"Predictions for {ticker}:", predictions_scaled)

    # Store the results
    results.append({
        'Ticker': ticker,
        'Predictions': predictions_scaled.tolist()
    })

    # Label every forecast with the year it was actually predicted for, rather
    # than assuming each ticker's history ends in 2023.
    long_format_data.extend(
        [int(year), ticker, float(value)]
        for year, value in zip(future_years.ravel(), predictions_scaled)
    )

    # Plotting the actual vs predicted values
    actual_values = scaler.inverse_transform(y.reshape(-1, 1)).flatten()
    years = data['years']

    plt.figure(figsize=(12, 6))
    plt.scatter(years, actual_values, label='Actual Values', color='blue')
    plt.plot(years, scaler.inverse_transform(model.predict(X).reshape(-1, 1)).flatten(),
             label='Fitted Curve', color='red')
    plt.plot(future_years, predictions_scaled, label='Predictions', linestyle='--', color='green')
    plt.xlabel('Year')
    plt.ylabel(metric)
    plt.title(f'Actual vs Predicted {metric} for {ticker} (Polynomial Degree: {poly_degree})')
    plt.legend()
    plt.show()

# Convert results to DataFrames
# (results_cv_df is kept for inspection in the notebook; it is not written to disk)
results_df = pd.DataFrame(results)
results_cv_df = pd.DataFrame(cv_results)

# Save to CSV
results_df.to_csv('ta_model_predictions.csv', index=False)

# Build the long-format table directly from memory. Re-reading the CSV and
# parsing the list column with ast.literal_eval would fail on nan/inf
# predictions and lets pandas re-interpret the ticker strings.
long_format_df = pd.DataFrame(long_format_data, columns=['Year', 'Ticker', metric])

# Save the long-format DataFrame to a new CSV file
long_format_df.to_csv('ta_model_predictions_formatted.csv', index=False)

All of the saved CSV files were merged and compiled in this spreadsheet: https://docs.google.com/spreadsheets/d/124PCo1J2RGgyKaPO4naDsoqotDKHtOZD9wzmjizC5_Y/edit?usp=sharing