In [None]:
#Code to get historical unlevered cash flows :o
import yfinance as yf
import pandas as pd
#straightforward functions, use it if needed
def get_financial_data(ticker):
    stock = yf.Ticker(ticker)
    financials = stock.financials
    return financials

def get_cash_flow_data(ticker):
    stock = yf.Ticker(ticker)
    cash_flow = stock.cashflow
    return cash_flow

def get_income_statement_data(ticker):
    stock = yf.Ticker(ticker)
    income_statement = stock.financials
    return income_statement

def get_balance_sheet_data(ticker):
    stock = yf.Ticker(ticker)
    balance_sheet = stock.balance_sheet
    return balance_sheet
#get EBIT 
def get_ebit(ticker):
    try:
        stock = yf.Ticker(ticker)
        financials = stock.financials.T #transpose to get it in that tabular format with dates and values on right column
        if 'EBIT' not in financials.columns:
            raise ValueError("EBIT data is not available.")
        return financials['EBIT']
    except Exception as e:
        print(f"Error getting EBIT: {e}")
        return pd.Series() #holds sequence of EBIT values

def get_tax_rate(ticker):
    try:
        stock = yf.Ticker(ticker)
        financials = stock.financials.T #same transpose
        tax_rates = {}
        for date in financials.index:
            try:
                tax_expense = financials.loc[date, 'Tax Provision']
                pretax_income = financials.loc[date, 'Pretax Income']
                if pretax_income != 0:
                    tax_rate = tax_expense / pretax_income
                    tax_rates[date] = tax_rate
                else:
                    tax_rates[date] = None
            except KeyError as e:
                print(f"KeyError: {e} for date {date}")
                tax_rates[date] = None
            except ZeroDivisionError:
                tax_rates[date] = None
        tax_rates_df = pd.DataFrame(list(tax_rates.items()), columns=['Date', 'Tax Rate']) #dataframe or table of historical tax rates
        tax_rates_df.set_index('Date', inplace=True)
        return tax_rates_df['Tax Rate']
    except Exception as e:
        print(f"Error getting tax rate: {e}")
        return pd.Series()

def get_nopat(ticker):
    try:
        ebit_series = get_ebit(ticker)
        tax_rate_series = get_tax_rate(ticker)
        if ebit_series.empty or tax_rate_series.empty:
            raise ValueError("Unable to calculate NOPAT due to missing EBIT or tax rate data.")
        ebit_series = ebit_series.loc[tax_rate_series.index]
        tax_rate_series = tax_rate_series.loc[ebit_series.index]
        nopat_series = ebit_series * (1 - tax_rate_series) #match indexes (dates) and calculate based on matching index in the datafram table format
        return nopat_series
    except Exception as e:
        print(f"Error calculating NOPAT: {e}")
        return pd.Series()

def get_depreciation_amortization(ticker):
    try:
        stock = yf.Ticker(ticker)
        cashflow = stock.cashflow.T
        if 'Depreciation Amortization Depletion' not in cashflow.columns:
            raise ValueError("D&A data is not available.")
        return cashflow['Depreciation Amortization Depletion']
    except Exception as e:
        print(f"Error getting D&A: {e}")
        return pd.Series()

def get_capex(ticker):
    try:
        stock = yf.Ticker(ticker)
        cashflow = stock.cashflow.T
        if 'Capital Expenditure' not in cashflow.columns:
            raise ValueError("CapEx data is not available.")
        return cashflow['Capital Expenditure']
    except Exception as e:
        print(f"Error getting CapEx: {e}")
        return pd.Series()

def get_nwc(ticker):
    try:
        stock = yf.Ticker(ticker)
        balance_sheet = stock.balance_sheet.T
        current_assets = balance_sheet.get('Current Assets', pd.Series())
        current_liabilities = balance_sheet.get('Current Liabilities', pd.Series())
        if not current_assets.empty and not current_liabilities.empty:
            nwc = current_assets - current_liabilities
        else:
            raise ValueError("Current Assets or Current Liabilities data is missing.")
        return nwc
    except Exception as e:
        print(f"Error getting NWC: {e}")
        return pd.Series()

def calculate_nwc_increase(ticker):
    try:
        nwc_series = get_nwc(ticker)
        if nwc_series.empty:
            raise ValueError("NWC data is not available.")
        nwc_change = nwc_series.diff()
        return nwc_change
    except Exception as e:
        print(f"Error calculating NWC increase: {e}")
        return pd.Series()

def calculate_ufcf(ticker):
    try:
        nopat_series = get_nopat(ticker)
        da_series = get_depreciation_amortization(ticker)
        capex_series = get_capex(ticker)
        nwc_increase_series = calculate_nwc_increase(ticker)
        if nopat_series.empty or da_series.empty or capex_series.empty or nwc_increase_series.empty:
            raise ValueError("One or more required data series are missing.")
        common_index = nopat_series.index.intersection(da_series.index).intersection(capex_series.index).intersection(nwc_increase_series.index) #merge series values based on common index which are the dates
        #align based on that common index which is date
        nopat_series = nopat_series.loc[common_index]
        da_series = da_series.loc[common_index]
        capex_series = capex_series.loc[common_index]
        nwc_increase_series = nwc_increase_series.loc[common_index]
        # UFCF = NOPAT + Depreciation & Amortization - Increase in NWC - Capital Expenditures
        ufcs = nopat_series + da_series - nwc_increase_series - capex_series #after merging based on index, you can actually calculate
        return ufcs
    except Exception as e:
        print(f"Error calculating UFCF: {e}")
        return pd.Series()

def process_tickers():
    results = []
    tickers = pd.read_csv('AllDatav2.csv')['Ticker']
    try:
        for ticker in tickers:
            try:
                ebitda = get_ebit(ticker)
                depamo = get_depreciation_amortization(ticker)
                capex = get_capex(ticker)
                increase_in_nwc = calculate_nwc_increase(ticker)
                ufcs = calculate_ufcf(ticker)
                result = {
                    'Ticker': ticker,
                    'Ebitda': ebitda,
                    'Depreciation/Amortization': depamo,
                    'Capital Expenditures': capex,
                    'Net Working Capital Increase Per Year': increase_in_nwc,
                    'UFCF': ufcs
                }
                results.append(result)
            except Exception as e:
                print(f"Failed to process {ticker}: {e}")
        results_df = pd.DataFrame(results)
        results_df.to_csv('DCFalgorithm.csv', index=False)
    except Exception as e:
        print(f"Error processing tickers: {e}")

process_tickers()


In [None]:
import yfinance as yf
import pandas as pd

def get_historical_tax_rates(ticker):
    # Fetch financial data
    stock = yf.Ticker(ticker)
    financials = stock.financials.T  

    # Initialize a dictionary to hold tax rates for each period
    tax_rates = {}

    # Calculate tax rate for each period
    for date in financials.index:
        try:
            tax_expense = financials.loc[date, 'Tax Provision']
            pretax_income = financials.loc[date, 'Pretax Income']
            
            # Avoid division by zero
            if pretax_income != 0:
                tax_rate = tax_expense / pretax_income
                tax_rates[date] = tax_rate
            else:
                tax_rates[date] = None
        except KeyError as e:
            print(f"KeyError: {e} for date {date}")
            tax_rates[date] = None
        except ZeroDivisionError:
            tax_rates[date] = None

    # Convert the dictionary to a DataFrame for easier analysis
    tax_rates_df = pd.DataFrame(list(tax_rates.items()), columns=['Date', 'Tax Rate'])
    tax_rates_df.set_index('Date', inplace=True)
    
    return tax_rates_df

# Example usage
ticker = 'AAPL'
tax_rates_df = get_historical_tax_rates(ticker)
print(tax_rates_df)


In [None]:
import pandas as pd

df = pd.read_csv('DCFalgorithm.csv', index_col='Ticker')

print(df.head())
print(df.info())

In [None]:
#file clean up
import pandas as pd
import re

def extract_numbers(cell):
    # Remove the "Name:" and "dtype:" parts
    cell = re.sub(r'Name:.*?dtype:.*', '', str(cell))
    
    # Find all numbers (including negative numbers and NaN), excluding date components
    numbers = re.findall(r'-?\d+(?:\.\d+)?(?:e[-+]?\d+)?|\bNaN\b', cell)
    
    # Filter out numbers that are likely to be years, months, or days
    numbers = [num for num in numbers if not (abs(float(num.replace('NaN', '0'))) <= 31 or 1900 <= abs(float(num.replace('NaN', '0'))) <= 2100)]
    
    # Replace 'NaN' with '0'
    numbers = ['0' if num.lower() == 'nan' else num for num in numbers]
    
    return numbers

# Read the CSV file
df = pd.read_csv('DCFalgorithm.csv')

# Process each column
for column in df.columns:
    # Extract numbers from the column
    extracted = df[column].apply(extract_numbers)
    
    # Find the maximum number of values in any row
    max_values = extracted.apply(len).max()
    
    # Create new columns for each value
    for i in range(max_values):
        df[f'{column}_{i+1}'] = extracted.apply(lambda x: x[i] if i < len(x) else None)
    
    # Drop the original column
    df = df.drop(columns=[column])

# Display the results
print(df)

# If you want to save the results to a new CSV file
df.to_csv('processed_data5.csv', index=False)

In [None]:
import pandas as pd

# Load the data
file_path = 'processed_data6.csv'  
data = pd.read_csv(file_path)

# Columns to reshape
metrics = [
    'Ebitda', 'Depreciation/Amortization', 'Capital Expenditures', 
    'Net Working Capital Increase Per Year', 'UFCF'
]

# Reshape data to long format
data_long = pd.wide_to_long(data, stubnames=metrics, i='Ticker', j='Year', sep='_', suffix='\d+').reset_index()

# Sorting by Ticker and Year
data_long = data_long.sort_values(by=['Ticker', 'Year'])

# Display the transformed data
print(data_long.head())

output_path = 'processed_data_ml.csv'  
data_long.to_csv(output_path, index=False)

import os
if os.path.exists(output_path):
    print(f'Transformed data has been saved to {output_path}')
else:
    print('Error in saving the transformed data')


In [None]:
#neural network trained to predict Unlevered Cash Flows using financial statement data and valuation/stock metrics!!!! Extremeley low error!!!

import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import joblib

# Load valuation metrics and processed data
valuation_metrics_path = 'valuation_metrics_official.csv'
processed_data_path = 'processed_data_ml.csv'
additional_data_path = 'AllDatav2.csv'

valuation_metrics_df = pd.read_csv(valuation_metrics_path)
processed_data_df = pd.read_csv(processed_data_path)
additional_data_df = pd.read_csv(additional_data_path)

# Convert integer Year values to actual years
year_mapping = {1: 2021, 2: 2022, 3: 2023, 4: 2024}
processed_data_df['Year'] = processed_data_df['Year'].map(year_mapping)

# Sort by Ticker and Year to ensure temporal order
processed_data_df = processed_data_df.sort_values(by=['Ticker', 'Year'])

# Label encoding for Ticker
le = LabelEncoder()
processed_data_df['Ticker'] = le.fit_transform(processed_data_df['Ticker'])
valuation_metrics_df['Ticker'] = le.transform(valuation_metrics_df['Ticker'])
additional_data_df['Ticker'] = le.transform(additional_data_df['Ticker'])

# Merge the additional metrics
data_df = pd.merge(processed_data_df, valuation_metrics_df, on='Ticker', how='left')
data_df = pd.merge(data_df, additional_data_df, on='Ticker', how='left')

# Create lag features and rolling means
data_df['Lag_Ebitda'] = data_df.groupby('Ticker')['Ebitda'].shift(1)
data_df['Lag_Depreciation'] = data_df.groupby('Ticker')['Depreciation/Amortization'].shift(1)
data_df['Lag_CapEx'] = data_df.groupby('Ticker')['Capital Expenditures'].shift(1)
data_df['Lag_NWC'] = data_df.groupby('Ticker')['Net Working Capital Increase Per Year'].shift(1)

data_df['Rolling_Mean_Ebitda'] = data_df.groupby('Ticker')['Ebitda'].rolling(window=2).mean().reset_index(0, drop=True)
data_df['Rolling_Mean_Depreciation'] = data_df.groupby('Ticker')['Depreciation/Amortization'].rolling(window=2).mean().reset_index(0, drop=True)
data_df['Rolling_Mean_CapEx'] = data_df.groupby('Ticker')['Capital Expenditures'].rolling(window=2).mean().reset_index(0, drop=True)
data_df['Rolling_Mean_NWC'] = data_df.groupby('Ticker')['Net Working Capital Increase Per Year'].rolling(window=2).mean().reset_index(0, drop=True)

# Drop rows with NaN values resulting from shifting
data_df = data_df.dropna()

# Define features and target
X = data_df.drop(columns=['UFCF'])
y = data_df['UFCF']

# Handle missing values
X = X.fillna(0)
y = y.fillna(0)

# Fetch and process financial statements for additional features
def fetch_financial_data(ticker):
    stock = yf.Ticker(ticker)
    financials = stock.financials
    balance_sheet = stock.balance_sheet
    cashflow = stock.cashflow
    
    data = {
        'Revenue': financials.loc['Total Revenue'].values[0] if 'Total Revenue' in financials.index else 0,
        'COGS': financials.loc['Cost Of Revenue'].values[0] if 'Cost Of Revenue' in financials.index else 0,
        'Operating Expenses': financials.loc['Operating Expense'].values[0] if 'Operating Expense' in financials.index else 0,
        'Depreciation': cashflow.loc['Depreciation And Amortization'].values[0] if 'Depreciation And Amortization' in cashflow.index else 0,
        'CapEx': cashflow.loc['Capital Expenditure'].values[0] if 'Capital Expenditure' in cashflow.index else 0,
        'Gross Profit': financials.loc['Gross Profit'].values[0] if 'Gross Profit' in financials.index else 0,
        'EBIT': financials.loc['EBIT'].values[0] if 'EBIT' in financials.index else 0,
        'EBITDA': financials.loc['EBITDA'].values[0] if 'EBITDA' in financials.index else 0,
        'Total Assets': balance_sheet.loc['Total Assets'].values[0] if 'Total Assets' in balance_sheet.index else 0,
        'Total Liabilities': balance_sheet.loc['Total Liabilities Net Minority Interest'].values[0] if 'Total Liabilities Net Minority Interest' in balance_sheet.index else 0,
        'Total Equity': balance_sheet.loc['Total Equity Gross Minority Interest'].values[0] if 'Total Equity Gross Minority Interest' in balance_sheet.index else 0,
        'Net Income': financials.loc['Net Income'].values[0] if 'Net Income' in financials.index else 0,
        'Operating Cash Flow': cashflow.loc['Cash Flow From Continuing Operating Activities'].values[0] if 'Cash Flow From Continuing Operating Activities' in cashflow.index else 0,
        'Free Cash Flow': cashflow.loc['Free Cash Flow'].values[0] if 'Free Cash Flow' in cashflow.index else 0
    }
    
    return pd.DataFrame([data])

financial_features_list = [fetch_financial_data(ticker) for ticker in le.classes_]

# Concatenate all data
financial_features_df = pd.concat(financial_features_list, ignore_index=True)

# Integrate financial data with main dataset
data_df = pd.concat([data_df.reset_index(drop=True), financial_features_df.reset_index(drop=True)], axis=1)

# Define features and target again with additional financial data
X = data_df.drop(columns=['UFCF'])
y = data_df['UFCF']

# Handle any potential missing values
X = X.fillna(0)
y = y.fillna(0)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

joblib.dump(scaler, 'scaler2.pkl') #save scaler

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(16, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
y_pred = model.predict(X_test).flatten()

# Ensure there are no NaN values in predictions
if np.any(np.isnan(y_pred)):
    raise ValueError("Predictions contain NaN values.")

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_test - y_pred))

print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")

# Plot training history
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# Custom scorer for permutation importance
def custom_scorer(estimator, X, y):
    y_pred = estimator.predict(X).flatten()
    return -mean_squared_error(y, y_pred)

# Compute permutation feature importance
result = permutation_importance(model, X_test, y_test, scoring=custom_scorer, n_repeats=10, random_state=42, n_jobs=-1)

# Feature importance plot
# Create a dataframe for feature importances
feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(12, 8))
plt.barh(feature_importances_df['Feature'], feature_importances_df['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.gca().invert_yaxis()
plt.show()

# Identify features with negative importance scores
negative_importance_features = feature_importances_df[feature_importances_df['Importance'] < 0]['Feature'].tolist()

# Drop these features from the dataset
X_filtered = data_df.drop(columns=['UFCF'] + negative_importance_features)
y_filtered = data_df['UFCF']

# Handle any potential missing values
X_filtered = X_filtered.fillna(0)
y_filtered = y_filtered.fillna(0)

# Scale features
X_filtered_scaled = scaler.fit_transform(X_filtered)

# Split the data
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(X_filtered_scaled, y_filtered, test_size=0.2, random_state=42)

# Define the neural network model again for filtered data
model_filtered = Sequential()
model_filtered.add(Dense(64, activation='relu', input_shape=(X_train_filtered.shape[1],)))
model_filtered.add(Dropout(0.3))
model_filtered.add(Dense(32, activation='relu'))
model_filtered.add(Dropout(0.3))
model_filtered.add(Dense(16, activation='relu'))
model_filtered.add(Dense(1))

# Compile the model
model_filtered.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
history_filtered = model_filtered.fit(X_train_filtered, y_train_filtered, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
y_pred_filtered = model_filtered.predict(X_test_filtered).flatten()

# Ensure there are no NaN values in predictions
if np.any(np.isnan(y_pred_filtered)):
    raise ValueError("Predictions contain NaN values.")

mse_filtered = mean_squared_error(y_test_filtered, y_pred_filtered)
rmse_filtered = np.sqrt(mse_filtered)
mae_filtered = np.mean(np.abs(y_test_filtered - y_pred_filtered))

print(f"Filtered Test MSE: {mse_filtered}")
print(f"Filtered Test RMSE: {rmse_filtered}")
print(f"Filtered Test MAE: {mae_filtered}")

# Plot training history for filtered model
plt.figure(figsize=(12, 6))
plt.plot(history_filtered.history['loss'], label='Filtered Training Loss')
plt.plot(history_filtered.history['val_loss'], label='Filtered Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss for Filtered Model')
plt.legend()
plt.show()

# Save the entire model
model.save('UFCFmodelrevised.h5') 






In [None]:
#testing with random data!!!!!

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

def get_historical_data(ticker, start_date, end_date):
    stock = yf.Ticker(ticker)
    
    # Fetch quarterly financial data
    income_stmt = stock.quarterly_financials
    balance_sheet = stock.quarterly_balance_sheet
    cash_flow = stock.quarterly_cashflow
    
    # Extract relevant metrics with error handling
    def safe_get(df, keys):
        for key in keys:
            if key in df.index:
                return df.loc[key]
        return pd.Series(np.nan, index=df.columns)

    revenue = safe_get(income_stmt, ['Total Revenue', 'Revenue'])
    ebitda = safe_get(income_stmt, ['EBITDA', 'EBIT'])
    capex = safe_get(cash_flow, ['Capital Expenditure', 'Capital Expenditures', 'Property Plant Equipment'])
    
    # Calculate Net Working Capital (Current Assets - Current Liabilities)
    current_assets = safe_get(balance_sheet, ['Total Current Assets', 'Total Assets'])
    current_liabilities = safe_get(balance_sheet, ['Total Current Liabilities', 'Total Liabilities Net Minority Interest'])
    nwc = current_assets - current_liabilities
    
    # Combine all metrics into a single DataFrame
    data = pd.concat([revenue, ebitda, capex, nwc], axis=1)
    data.columns = ['Revenue', 'EBITDA', 'CapEx', 'NWC']
    
    # Calculate UFCF (Unlevered Free Cash Flow)
    data['UFCF'] = data['EBITDA'] - data['CapEx'] - data['NWC'].diff()
    
    return data

# List of tickers you want to analyze
tickers = pd.read_csv('AllDatav2.csv')['Ticker'].tolist()  # Convert to list

# Fetch data for each ticker and combine
all_data = []
for ticker in tickers:
    try:
        data = get_historical_data(ticker, '2010-01-01', '2023-12-31')
        data['Ticker'] = ticker
        all_data.append(data)
        print(f"Successfully processed {ticker}")
    except Exception as e:
        print(f"Error processing {ticker}: {str(e)}")

# Combine all data into a single DataFrame
combined_data = pd.concat(all_data, axis=0)

# Reset index to turn dates into a column
combined_data = combined_data.reset_index()
combined_data.columns = ['Date'] + list(combined_data.columns[1:])

# Save to CSV
combined_data.to_csv('historical_data.csv', index=False)
print("Data saved to 'historical_data.csv'")

#remmebe that it is historical data so 20 quarters =  5years haha

In [None]:
#in historical_data csv, make sure to use backfill/frontfill to NaN values (preserves trend for LSTM model but unfortunately won't be perfect accuracy when using in ML model)

import pandas as pd
import numpy as np

# Load the CSV file
df = pd.read_csv('historical_data.csv', parse_dates=['Date'])

# Set Date as index
df.set_index('Date', inplace=True)

# List of columns to fill
columns_to_fill = ['Revenue', 'EBITDA', 'CapEx', 'NWC', 'UFCF']

# Apply backward fill to each column
for col in columns_to_fill:
    df[col] = df[col].bfill()

# Reset index to make Date a column again
df.reset_index(inplace=True)

print(df)

df.to_csv('historical_data_updated.csv', index=False)


In [1]:
import pandas as pd

# Load the data
df = pd.read_csv('historical_data_final.csv')

# Convert the Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Sort the data by Ticker and Date
df = df.sort_values(by=['Ticker', 'Date'])

# Save the sorted data to a new CSV file
df.to_csv('historical_data_final2.csv', index=False)

# Display the first few rows to verify sorting
print(df.head())


         Date     Revenue      EBITDA      CapEx          NWC        UFCF  \
14 2022-11-30  4816000000  1896000000 -121000000  14838000000  1141000000   
15 2023-02-28  4816000000  1896000000 -121000000  14838000000  1141000000   
16 2023-05-31  4816000000  1896000000 -121000000  14838000000  1141000000   
17 2023-08-31  4890000000  1988000000  -91000000  15776000000  1141000000   
18 2023-11-30  5048000000  2058000000  -47000000  16518000000  1363000000   

   Ticker  
14   ADBE  
15   ADBE  
16   ADBE  
17   ADBE  
18   ADBE  


In [9]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Load the historical revenue data
df = pd.read_csv('historical_data_final2.csv')

# Convert the Date column to datetime and sort by date
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')

# Extract unique tickers
tickers = df['Ticker'].unique()

# Function to fetch and prepare financial data using yfinance
def fetch_financial_data(ticker):
    stock = yf.Ticker(ticker)
    
    # Fetch income statement
    income_statement = stock.financials.T
    income_statement = income_statement.reset_index().rename(columns={'index': 'Date'})
    income_statement['Date'] = pd.to_datetime(income_statement['Date'])
    
    # Fetch balance sheet
    balance_sheet = stock.balance_sheet.T
    balance_sheet = balance_sheet.reset_index().rename(columns={'index': 'Date'})
    balance_sheet['Date'] = pd.to_datetime(balance_sheet['Date'])
    
    # Fetch cash flow statement
    cash_flow = stock.cashflow.T
    cash_flow = cash_flow.reset_index().rename(columns={'index': 'Date'})
    cash_flow['Date'] = pd.to_datetime(cash_flow['Date'])

    # Combine into a single DataFrame
    financial_data = pd.merge(income_statement, balance_sheet, on='Date', how='outer')
    financial_data = pd.merge(financial_data, cash_flow, on='Date', how='outer')
    
    return financial_data

# Prepare data with additional financial features
def prepare_data(df, ticker, n_train):
    # Filter historical revenue data for the specific ticker
    df_ticker = df[df['Ticker'] == ticker]
    df_ticker = df_ticker[['Date', 'Revenue']]

    # Fetch financial data
    financial_data = fetch_financial_data(ticker)
    
    # Merge revenue data with financial data
    df_combined = pd.merge(df_ticker, financial_data, on='Date', how='left').dropna()

    # Ensure there are enough data points
    if len(df_combined) < 2 * n_train:
        raise ValueError(f"Not enough data points for ticker {ticker} to create sequences.")

    # Split the data based on the number of samples
    train = df_combined[:n_train]
    test = df_combined[n_train:n_train*2]

    # Normalize the features
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train.drop(columns=['Date']))
    test_scaled = scaler.transform(test.drop(columns=['Date']))

    return train_scaled, test_scaled, scaler, train, test

# Define the number of training samples
n_train_samples = 3

# Create models for each ticker
models = {}
results = []

for ticker in tickers:
    try:
        train_scaled, test_scaled, scaler, train, test = prepare_data(df, ticker, n_train_samples)
        
        # Determine sequence length based on available data
        sequence_length = min(n_train_samples, len(train_scaled) - 1)
        
        # Create sequences for LSTM
        generator = TimeseriesGenerator(train_scaled, train_scaled[:, 0], length=sequence_length, batch_size=1)
        
        # Build the model
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(sequence_length, train_scaled.shape[1])),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train the model
        model.fit(generator, epochs=10, verbose=1)
        
        # Save the model
        models[ticker] = model

        # Predict the next 5 quarters
        last_sequence = np.array([train_scaled[-sequence_length:]])
        predictions_scaled = []
        
        for _ in range(5):
            # Make prediction
            prediction = model.predict(last_sequence)[0][0]
            predictions_scaled.append(prediction)
            
            # Update last_sequence with new prediction
            new_sequence = np.concatenate((last_sequence[:, 1:, :], [[prediction]]), axis=1)

        # Inverse transform predictions
        predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
        
        # Evaluate the model
        mae = mse = rmse = None
        if len(test) > sequence_length:
            test_sequences = TimeseriesGenerator(test_scaled, test_scaled[:, 0], length=sequence_length, batch_size=1)
            y_true = []
            y_pred = []
            
            for i in range(len(test_sequences)):
                x, y = test_sequences[i]
                y_true.append(y[0])
                y_pred.append(model.predict(x)[0][0])
            
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            
            print(f"Metrics for {ticker}:")
            print(f"  MAE: {mae}")
            print(f"  MSE: {mse}")
            print(f"  RMSE: {rmse}")
        else:
            print(f"Not enough test data for evaluation for ticker {ticker}")
        
        print(f"Predictions for {ticker}: {predictions.flatten()}")

        # Store results
        for i, pred in enumerate(predictions.flatten(), start=1):
            results.append([ticker, f'Q{i}', pred, mae, mse, rmse])

    except ValueError as e:
        print(e)
        print(f"Skipping ticker {ticker} due to insufficient data.")

# Create DataFrame from results
results_df = pd.DataFrame(results, columns=['Ticker', 'Quarter', 'Prediction', 'MAE', 'MSE', 'RMSE'])

# Save to CSV
results_df.to_csv('RevenuePredictions.csv', index=False)


Not enough data points for ticker ADBE to create sequences.
Skipping ticker ADBE due to insufficient data.
Not enough data points for ticker ORCL to create sequences.
Skipping ticker ORCL due to insufficient data.
Not enough data points for ticker KARO to create sequences.
Skipping ticker KARO due to insufficient data.


KeyboardInterrupt: 

In [5]:
#LSTM model to predict revenues based on past financial statements
import yfinance as yf
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

aa = pd.read_csv('AllDatav2.csv')

# Define a list of tickers
tickers = aa['Ticker'].unique().tolist()

# Function to load revenue data using yfinance
def load_financial_data(ticker):
    try:
        # Fetch the financial data
        stock = yf.Ticker(ticker)
        financials = stock.financials.transpose()  # Transpose to have dates as index
        
        # Extract the Revenue data
        if 'Total Revenue' not in financials.columns:
            raise ValueError(f"Revenue data not available for ticker {ticker}")
        
        revenue_data = financials[['Total Revenue']]
        revenue_data = revenue_data.dropna().reset_index()
        revenue_data.columns = ['Date', 'Revenue']
        revenue_data['Date'] = pd.to_datetime(revenue_data['Date'])
        revenue_data['Ticker'] = ticker
        
        return revenue_data
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return None

# Prepare data function (modified to handle yfinance data)
def prepare_data(df, ticker, cutoff_date):
    df_ticker = df[df['Ticker'] == ticker]
    df_ticker = df_ticker[['Date', 'Revenue']]
    
    # Split the data
    train = df_ticker[df_ticker['Date'] < cutoff_date]
    test = df_ticker[df_ticker['Date'] >= cutoff_date]
    
    if len(train) < 3:
        raise ValueError(f"Not enough data points for ticker {ticker} to create sequences.")
    
    # Normalize the revenue
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train[['Revenue']])
    test_scaled = scaler.transform(test[['Revenue']])
    
    return train_scaled, test_scaled, scaler, train, test

# Define the cutoff date
cutoff_date = '2022-06-30'

# Create models for each ticker
models = {}
results = []
all_data = []

# Load data for each ticker
for ticker in tickers:
    data = load_financial_data(ticker)
    if data is not None:
        all_data.append(data)
    time.sleep(2)

# Combine all data into one DataFrame
df = pd.concat(all_data)

# Process data and train models
for ticker in tickers:
    try:
        train_scaled, test_scaled, scaler, train, test = prepare_data(df, ticker, cutoff_date)
        
        # Determine sequence length based on available data
        sequence_length = min(2, len(train_scaled) - 1)
        
        # Create sequences for LSTM
        generator = TimeseriesGenerator(train_scaled, train_scaled, length=sequence_length, batch_size=1)
        
        # Build the model
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train the model
        model.fit(generator, epochs=10, verbose=1)
        
        # Save the model
        models[ticker] = model

        # Predict the next 5 quarters
        last_sequence = np.array([train_scaled[-sequence_length:]])
        predictions_scaled = []
        
        for _ in range(5):
            # Make prediction
            prediction = model.predict(last_sequence)[0][0]
            predictions_scaled.append(prediction)
            
            # Update last_sequence with new prediction
            new_sequence = np.array([[[prediction]]])
            last_sequence = np.concatenate((last_sequence[:, 1:, :], new_sequence), axis=1)

        # Inverse transform predictions
        predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
        
        # Evaluate the model
        mae = mse = rmse = None
        if len(test) > sequence_length:
            test_sequences = TimeseriesGenerator(test_scaled, test_scaled, length=sequence_length, batch_size=1)
            y_true = []
            y_pred = []
            
            for i in range(len(test_sequences)):
                x, y = test_sequences[i]
                y_true.append(y[0, 0])
                y_pred.append(model.predict(x)[0, 0])
            
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            
            print(f"Metrics for {ticker}:")
            print(f"  MAE: {mae}")
            print(f"  MSE: {mse}")
            print(f"  RMSE: {rmse}")
        else:
            print(f"Not enough test data for evaluation for ticker {ticker}")
        
        print(f"Predictions for {ticker}: {predictions.flatten()}")

        # Store results
        for i, pred in enumerate(predictions.flatten(), start=1):
            results.append([ticker, f'Q{i}', pred, mae, mse, rmse])

    except ValueError as e:
        print(e)
        print(f"Skipping ticker {ticker} due to insufficient data.")

# Create DataFrame from results
results_df = pd.DataFrame(results, columns=['Ticker', 'Quarter', 'Prediction', 'MAE', 'MSE', 'RMSE'])

# Save to CSV
results_df.to_csv('Revenue_predictions.csv', index=False)


KeyboardInterrupt: 

In [9]:
import yfinance as yf
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Set up logging to track the progress
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

aa = pd.read_csv('AllDatav2.csv')

# Define a list of tickers
tickers = aa['Ticker'].unique().tolist()

# Function to load revenue data using yfinance with retry logic
def load_financial_data(ticker):
    for attempt in range(3):  # Retry up to 3 times
        try:
            # Fetch the financial data
            stock = yf.Ticker(ticker)
            financials = stock.financials.transpose()  # Transpose to have dates as index
            
            # Extract the Revenue data
            if 'Total Revenue' not in financials.columns:
                raise ValueError(f"Revenue data not available for ticker {ticker}")
            
            revenue_data = financials[['Total Revenue']]
            revenue_data = revenue_data.reset_index()
            revenue_data.columns = ['Date', 'Revenue']
            revenue_data['Date'] = pd.to_datetime(revenue_data['Date'])
            revenue_data['Ticker'] = ticker
            
            logging.info(f"Successfully fetched data for {ticker}")
            return revenue_data
        except Exception as e:
            logging.warning(f"Attempt {attempt + 1} failed for {ticker}: {e}")
            time.sleep(5)  # Wait before retrying
    logging.error(f"Failed to fetch data for {ticker} after 3 attempts")
    return None

# Load data for each ticker in parallel
def load_all_data(tickers):
    all_data = []
    with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust number of workers as needed
        future_to_ticker = {executor.submit(load_financial_data, ticker): ticker for ticker in tickers}
        for future in as_completed(future_to_ticker):
            ticker = future_to_ticker[future]
            try:
                data = future.result()
                if data is not None:
                    all_data.append(data)
            except Exception as e:
                logging.error(f"Error processing ticker {ticker}: {e}")
    return all_data

all_data = load_all_data(tickers)

# Combine all data into one DataFrame
df = pd.concat(all_data)

# Standardize the start date
common_start_date = df.groupby('Ticker')['Date'].min().max()
df = df[df['Date'] >= common_start_date]

# Forward fill missing values and backfill if necessary
df = df.groupby('Ticker').apply(lambda group: group.set_index('Date').asfreq('M').ffill().bfill()).reset_index()

# Prepare data function (modified to handle yfinance data)
def prepare_data(df, ticker, cutoff_date):
    df_ticker = df[df['Ticker'] == ticker]
    df_ticker = df_ticker[['Date', 'Revenue']]
    
    # Split the data
    train = df_ticker[df_ticker['Date'] < cutoff_date]
    test = df_ticker[df_ticker['Date'] >= cutoff_date]
    
    if len(train) < 3:
        raise ValueError(f"Not enough data points for ticker {ticker} to create sequences.")
    
    # Normalize the revenue
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train[['Revenue']])
    test_scaled = scaler.transform(test[['Revenue']])
    
    return train_scaled, test_scaled, scaler, train, test

# Define the cutoff date
cutoff_date = '2022-06-30'

# Create models for each ticker
models = {}
results = []

# Process data and train models
for ticker in tickers:
    try:
        logging.info(f"Processing ticker {ticker}")
        train_scaled, test_scaled, scaler, train, test = prepare_data(df, ticker, cutoff_date)
        
        # Determine sequence length based on available data
        sequence_length = min(10, len(train_scaled) - 1)  # Increased sequence length to 10
        
        # Create sequences for LSTM
        generator = TimeseriesGenerator(train_scaled, train_scaled, length=sequence_length, batch_size=16)
        
        # Build the model
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train the model
        model.fit(generator, epochs=5, verbose=1)  # Reduced epochs for quicker testing
        
        # Save the model
        models[ticker] = model

        # Predict the next 5 quarters
        last_sequence = np.array([train_scaled[-sequence_length:]])
        predictions_scaled = []
        
        for _ in range(5):
            # Make prediction
            prediction = model.predict(last_sequence)[0][0]
            if not np.isfinite(prediction):  # Check if the prediction is a valid number
                raise ValueError(f"Invalid prediction for ticker {ticker}")
            predictions_scaled.append(prediction)
            
            # Update last_sequence with new prediction
            new_sequence = np.array([[[prediction]]])
            last_sequence = np.concatenate((last_sequence[:, 1:, :], new_sequence), axis=1)

        # Inverse transform predictions
        predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
        
        # Evaluate the model
        mae = mse = rmse = None
        if len(test) > sequence_length:
            test_sequences = TimeseriesGenerator(test_scaled, test_scaled, length=sequence_length, batch_size=1)
            y_true = []
            y_pred = []
            
            for i in range(len(test_sequences)):
                x, y = test_sequences[i]
                y_true.append(y[0, 0])
                y_pred.append(model.predict(x)[0, 0])
            
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            
            logging.info(f"Metrics for {ticker} - MAE: {mae}, MSE: {mse}, RMSE: {rmse}")
        else:
            logging.warning(f"Not enough test data for evaluation for ticker {ticker}")
        
        logging.info(f"Predictions for {ticker}: {predictions.flatten()}")

        # Store results
        for i, pred in enumerate(predictions.flatten(), start=1):
            results.append([ticker, f'Q{i}', pred, mae, mse, rmse])

    except ValueError as e:
        logging.error(f"Error for ticker {ticker}: {e}")
        logging.info(f"Skipping ticker {ticker} due to insufficient data.")

# Create DataFrame from results
results_df = pd.DataFrame(results, columns=['Ticker', 'Quarter', 'Prediction', 'MAE', 'MSE', 'RMSE'])

# Save to CSV
results_df.to_csv('Revenue_predictions.csv', index=False)


2024-08-09 13:39:58,308 - INFO - Successfully fetched data for CRM
2024-08-09 13:39:58,373 - INFO - Successfully fetched data for ADBE
2024-08-09 13:39:58,378 - INFO - Successfully fetched data for ORCL
2024-08-09 13:39:58,392 - INFO - Successfully fetched data for INTU
2024-08-09 13:39:58,411 - INFO - Successfully fetched data for NFLX
2024-08-09 13:39:59,481 - INFO - Successfully fetched data for PANW
2024-08-09 13:39:59,491 - INFO - Successfully fetched data for ADP
2024-08-09 13:39:59,529 - INFO - Successfully fetched data for CRWD
2024-08-09 13:39:59,557 - INFO - Successfully fetched data for NOW
2024-08-09 13:39:59,716 - INFO - Successfully fetched data for SHOP
2024-08-09 13:40:00,247 - INFO - Successfully fetched data for WDAY
2024-08-09 13:40:00,515 - INFO - Successfully fetched data for SPOT
2024-08-09 13:40:00,605 - INFO - Successfully fetched data for ADSK
2024-08-09 13:40:00,651 - INFO - Successfully fetched data for PLTR
2024-08-09 13:40:00,721 - INFO - Successfully fetch

ValueError: cannot insert Ticker, already exists

In [2]:
import pandas as pd

# Load the CSV file
file_path = 'historical_data_final2.csv'
df = pd.read_csv(file_path)

# Display the first few rows to understand the structure and content
df.head(), df['Ticker'].value_counts()

(         Date     Revenue      EBITDA      CapEx          NWC        UFCF  \
 0  2022-11-30  4816000000  1896000000 -121000000  14838000000  1141000000   
 1  2023-02-28  4816000000  1896000000 -121000000  14838000000  1141000000   
 2  2023-05-31  4816000000  1896000000 -121000000  14838000000  1141000000   
 3  2023-08-31  4890000000  1988000000  -91000000  15776000000  1141000000   
 4  2023-11-30  5048000000  2058000000  -47000000  16518000000  1363000000   
 
   Ticker  
 0   ADBE  
 1   ADBE  
 2   ADBE  
 3   ADBE  
 4   ADBE  ,
 Ticker
 ADBE    7
 VRNS    7
 CFLT    7
 PEGA    7
 CHKP    7
        ..
 GDDY    6
 GEN     6
 ZUO     6
 IOT     5
 ADSK    5
 Name: count, Length: 173, dtype: int64)

In [None]:
#LSTM model for EBITDA

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Load the data
df = pd.read_csv('historical_data_final.csv')

# Convert the Date column to datetime and sort by date
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')

# Extract unique tickers
tickers = df['Ticker'].unique()

# Prepare data
def prepare_data(df, ticker, cutoff_date):
    df_ticker = df[df['Ticker'] == ticker]
    df_ticker = df_ticker[['Date', 'EBITDA']]
    
    # Split the data
    train = df_ticker[df_ticker['Date'] < cutoff_date]
    test = df_ticker[df_ticker['Date'] >= cutoff_date]
    
    if len(train) < 3:
        raise ValueError(f"Not enough data points for ticker {ticker} to create sequences.")
    
    # Normalize the revenue
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train[['EBITDA']])
    test_scaled = scaler.transform(test[['EBITDA']])
    
    return train_scaled, test_scaled, scaler, train, test

# Define the cutoff date
cutoff_date = '2023-11-30'

# Create models for each ticker
models = {}
results = []

for ticker in tickers:
    try:
        train_scaled, test_scaled, scaler, train, test = prepare_data(df, ticker, cutoff_date)
        
        # Determine sequence length based on available data
        sequence_length = min(2, len(train_scaled) - 1)
        
        # Create sequences for LSTM
        generator = TimeseriesGenerator(train_scaled, train_scaled, length=sequence_length, batch_size=1)
        
        # Build the model
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train the model
        model.fit(generator, epochs=10, verbose=1)
        
        # Save the model
        models[ticker] = model

        # Predict the next 5 quarters
        last_sequence = np.array([train_scaled[-sequence_length:]])
        predictions_scaled = []
        
        for _ in range(5):
            # Make prediction
            prediction = model.predict(last_sequence)[0][0]
            predictions_scaled.append(prediction)
            
            # Update last_sequence with new prediction
            new_sequence = np.array([[[prediction]]])
            last_sequence = np.concatenate((last_sequence[:, 1:, :], new_sequence), axis=1)

        # Inverse transform predictions
        predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
        
        # Evaluate the model
        mae = mse = rmse = None
        if len(test) > sequence_length:
            test_sequences = TimeseriesGenerator(test_scaled, test_scaled, length=sequence_length, batch_size=1)
            y_true = []
            y_pred = []
            
            for i in range(len(test_sequences)):
                x, y = test_sequences[i]
                y_true.append(y[0, 0])
                y_pred.append(model.predict(x)[0, 0])
            
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            
            print(f"Metrics for {ticker}:")
            print(f"  MAE: {mae}")
            print(f"  MSE: {mse}")
            print(f"  RMSE: {rmse}")
        else:
            print(f"Not enough test data for evaluation for ticker {ticker}")
        
        print(f"Predictions for {ticker}: {predictions.flatten()}")

        # Store results
        for i, pred in enumerate(predictions.flatten(), start=1):
            results.append([ticker, f'Q{i}', pred, mae, mse, rmse])

    except ValueError as e:
        print(e)
        print(f"Skipping ticker {ticker} due to insufficient data.")

# Create DataFrame from results
results_df = pd.DataFrame(results, columns=['Ticker', 'Quarter', 'Prediction', 'MAE', 'MSE', 'RMSE'])

# Save to CSV
results_df.to_csv('EBITDApredictions.csv', index=False)


In [None]:
#LSTM model for CapEx

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Load the data
df = pd.read_csv('historical_data_final.csv')

# Convert the Date column to datetime and sort by date
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')

# Extract unique tickers
tickers = df['Ticker'].unique()

# Prepare data
def prepare_data(df, ticker, cutoff_date):
    df_ticker = df[df['Ticker'] == ticker]
    df_ticker = df_ticker[['Date', 'CapEx']]
    
    # Split the data
    train = df_ticker[df_ticker['Date'] < cutoff_date]
    test = df_ticker[df_ticker['Date'] >= cutoff_date]
    
    if len(train) < 3:
        raise ValueError(f"Not enough data points for ticker {ticker} to create sequences.")
    
    # Normalize the revenue
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train[['CapEx']])
    test_scaled = scaler.transform(test[['CapEx']])
    
    return train_scaled, test_scaled, scaler, train, test

# Define the cutoff date
cutoff_date = '2023-11-30'

# Create models for each ticker
models = {}
results = []

for ticker in tickers:
    try:
        train_scaled, test_scaled, scaler, train, test = prepare_data(df, ticker, cutoff_date)
        
        # Determine sequence length based on available data
        sequence_length = min(2, len(train_scaled) - 1)
        
        # Create sequences for LSTM
        generator = TimeseriesGenerator(train_scaled, train_scaled, length=sequence_length, batch_size=1)
        
        # Build the model
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train the model
        model.fit(generator, epochs=10, verbose=1)
        
        # Save the model
        models[ticker] = model

        # Predict the next 5 quarters
        last_sequence = np.array([train_scaled[-sequence_length:]])
        predictions_scaled = []
        
        for _ in range(5):
            # Make prediction
            prediction = model.predict(last_sequence)[0][0]
            predictions_scaled.append(prediction)
            
            # Update last_sequence with new prediction
            new_sequence = np.array([[[prediction]]])
            last_sequence = np.concatenate((last_sequence[:, 1:, :], new_sequence), axis=1)

        # Inverse transform predictions
        predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
        
        # Evaluate the model
        mae = mse = rmse = None
        if len(test) > sequence_length:
            test_sequences = TimeseriesGenerator(test_scaled, test_scaled, length=sequence_length, batch_size=1)
            y_true = []
            y_pred = []
            
            for i in range(len(test_sequences)):
                x, y = test_sequences[i]
                y_true.append(y[0, 0])
                y_pred.append(model.predict(x)[0, 0])
            
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            
            print(f"Metrics for {ticker}:")
            print(f"  MAE: {mae}")
            print(f"  MSE: {mse}")
            print(f"  RMSE: {rmse}")
        else:
            print(f"Not enough test data for evaluation for ticker {ticker}")
        
        print(f"Predictions for {ticker}: {predictions.flatten()}")

        # Store results
        for i, pred in enumerate(predictions.flatten(), start=1):
            results.append([ticker, f'Q{i}', pred, mae, mse, rmse])

    except ValueError as e:
        print(e)
        print(f"Skipping ticker {ticker} due to insufficient data.")

# Create DataFrame from results
results_df = pd.DataFrame(results, columns=['Ticker', 'Quarter', 'Prediction', 'MAE', 'MSE', 'RMSE'])

# Save to CSV
results_df.to_csv('CapExpredictions.csv', index=False)


In [None]:
#LSTM model for NWC

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Load the data
df = pd.read_csv('historical_data_final.csv')

# Convert the Date column to datetime and sort by date
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')

# Extract unique tickers
tickers = df['Ticker'].unique()

# Prepare data
def prepare_data(df, ticker, cutoff_date):
    df_ticker = df[df['Ticker'] == ticker]
    df_ticker = df_ticker[['Date', 'NWC']]
    
    # Split the data
    train = df_ticker[df_ticker['Date'] < cutoff_date]
    test = df_ticker[df_ticker['Date'] >= cutoff_date]
    
    if len(train) < 3:
        raise ValueError(f"Not enough data points for ticker {ticker} to create sequences.")
    
    # Normalize the revenue
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train[['NWC']])
    test_scaled = scaler.transform(test[['NWC']])
    
    return train_scaled, test_scaled, scaler, train, test

# Define the cutoff date
cutoff_date = '2023-11-30'

# Create models for each ticker
models = {}
results = []

for ticker in tickers:
    try:
        train_scaled, test_scaled, scaler, train, test = prepare_data(df, ticker, cutoff_date)
        
        # Determine sequence length based on available data
        sequence_length = min(2, len(train_scaled) - 1)
        
        # Create sequences for LSTM
        generator = TimeseriesGenerator(train_scaled, train_scaled, length=sequence_length, batch_size=1)
        
        # Build the model
        model = Sequential([
            LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train the model
        model.fit(generator, epochs=10, verbose=1)
        
        # Save the model
        models[ticker] = model

        # Predict the next 5 quarters
        last_sequence = np.array([train_scaled[-sequence_length:]])
        predictions_scaled = []
        
        for _ in range(5):
            # Make prediction
            prediction = model.predict(last_sequence)[0][0]
            predictions_scaled.append(prediction)
            
            # Update last_sequence with new prediction
            new_sequence = np.array([[[prediction]]])
            last_sequence = np.concatenate((last_sequence[:, 1:, :], new_sequence), axis=1)

        # Inverse transform predictions
        predictions = scaler.inverse_transform(np.array(predictions_scaled).reshape(-1, 1))
        
        # Evaluate the model
        mae = mse = rmse = None
        if len(test) > sequence_length:
            test_sequences = TimeseriesGenerator(test_scaled, test_scaled, length=sequence_length, batch_size=1)
            y_true = []
            y_pred = []
            
            for i in range(len(test_sequences)):
                x, y = test_sequences[i]
                y_true.append(y[0, 0])
                y_pred.append(model.predict(x)[0, 0])
            
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)
            
            print(f"Metrics for {ticker}:")
            print(f"  MAE: {mae}")
            print(f"  MSE: {mse}")
            print(f"  RMSE: {rmse}")
        else:
            print(f"Not enough test data for evaluation for ticker {ticker}")
        
        print(f"Predictions for {ticker}: {predictions.flatten()}")

        # Store results
        for i, pred in enumerate(predictions.flatten(), start=1):
            results.append([ticker, f'Q{i}', pred, mae, mse, rmse])

    except ValueError as e:
        print(e)
        print(f"Skipping ticker {ticker} due to insufficient data.")

# Create DataFrame from results
results_df = pd.DataFrame(results, columns=['Ticker', 'Quarter', 'Prediction', 'MAE', 'MSE', 'RMSE'])

# Save to CSV
results_df.to_csv('NWCpredictions.csv', index=False)
