In [None]:
import pandas as pd
import yfinance as yf
import os

# Yahoo Finance ticker symbols for the broad market indices
broad_market_indices = [
    "^NSEI",
    "^NSEMDCP50",
]

# Yahoo Finance ticker symbols for the sectoral indices
sectoral_indices = [
    "^CNXAUTO",
    "NIFTY_FIN_SERVICE.NS",
    "^NSEBANK",
    "^CNXFMCG",
    "^CNXIT",
    "^CNXMEDIA",
    "^CNXMETAL",
    "^CNXPSUBANK",
    "^CNXREALTY",
    "^CNXENERGY",
]

# Function to download and save historical data
def download_data(indices, start_date, end_date):
    """Download price history for each ticker and save it as ``<ticker>.csv``.

    Args:
        indices: Iterable of ticker symbols passed one at a time to
            ``yf.download``.
        start_date: Start of the download window, forwarded as ``start``.
        end_date: End of the download window, forwarded as ``end``.

    Side effects:
        Writes one CSV per ticker into the current working directory and
        prints a status line per ticker. Tickers for which nothing was
        returned are reported and skipped instead of being written as an
        empty file.
    """
    for index in indices:
        data = yf.download(index, start=start_date, end=end_date)

        # A download that yields no rows would otherwise be saved as a
        # header-only CSV and silently empty every later date intersection.
        if data is None or data.empty:
            print(f"No data returned for {index}; skipping")
            continue

        # Recent yfinance releases return (field, ticker) MultiIndex columns
        # even for a single ticker. Flatten to the field level so the CSV has
        # a single header row with a "Date" index column.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)

        data.to_csv(f"{index}.csv")
        print(f"Downloaded data for {index}")

# Download the same date window for the broad market indices first,
# then for the sectoral indices
for index_group in (broad_market_indices, sectoral_indices):
    download_data(index_group, "2013-01-01", "2022-12-31")


In [None]:
# Function to clean data and align dates across all files
def clean_and_align_data(indices):
    """Drop incomplete rows and restrict every index CSV to the shared dates.

    For each ticker in ``indices`` this reads ``<ticker>.csv`` (indexed by its
    ``Date`` column), drops rows containing any missing value, keeps only the
    dates present in *every* ticker's data, and writes the result to
    ``<ticker>_cleaned.csv``.

    Args:
        indices: Sequence of ticker symbols whose ``<ticker>.csv`` files exist
            in the current working directory.

    Returns:
        None. Output is written to disk and a summary is printed per ticker.
    """
    # Nothing to align; the original code crashed on indices[0] here.
    if not indices:
        return

    data_frames = {
        index: pd.read_csv(f"{index}.csv", index_col="Date", parse_dates=True).dropna()
        for index in indices
    }

    # Dates shared by every frame, in chronological order
    common_dates = sorted(
        set.intersection(*(set(frame.index) for frame in data_frames.values()))
    )

    # Align each frame to the shared dates and persist it
    for index in indices:
        aligned = data_frames[index].loc[common_dates]
        data_frames[index] = aligned
        aligned.to_csv(f"{index}_cleaned.csv")
        print(f"Cleaned and aligned data for {index}")
        print(f"Size of {index}_cleaned.csv: {aligned.shape[0]} rows, {aligned.shape[1]} columns")

# Run the cleaning/alignment step over every index (broad market + sectoral)
all_indices = broad_market_indices + sectoral_indices
clean_and_align_data(all_indices)

# Load the macroeconomic indicators:
#   1. parse the 'date' column (day-month-year) and use it as the index
#   2. drop incomplete rows
#   3. upsample to daily frequency, forward-filling the last known value
macro_data = (
    pd.read_csv('Macro_indicators.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d-%m-%Y'))
    .set_index('date')
    .dropna()
    .resample('D')
    .ffill()
    .dropna()
)

# Keep only the macro rows whose dates also appear in the cleaned data of
# the first sectoral index
common_dates = macro_data.index.intersection(
    pd.read_csv(
        f"{sectoral_indices[0]}_cleaned.csv",
        index_col="Date",
        parse_dates=True,
    ).index
)
macro_data = macro_data.loc[common_dates]

macro_data.head()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load cleaned sectoral data, keyed by ticker
sectoral_data = {
    index: pd.read_csv(f"{index}_cleaned.csv", index_col="Date", parse_dates=True)
    for index in sectoral_indices
}

# Correlation of each sectoral Close series with each macro variable.
# A float frame starts out as all-NaN, so cells that cannot be computed
# simply stay missing.
correlation_matrix = pd.DataFrame(index=sectoral_indices, columns=macro_data.columns, dtype=float)

for index in sectoral_indices:
    for col in macro_data.columns:
        try:
            correlation = sectoral_data[index].loc[common_dates, 'Close'].corr(macro_data.loc[common_dates, col])
        except Exception as exc:
            # Stay best-effort (one bad pair must not abort the whole matrix),
            # but report the failure instead of hiding it.
            print(f"Could not compute correlation between {index} and {col}: {exc!r}")
            correlation = float("nan")
        correlation_matrix.loc[index, col] = correlation

# Drop macro variables for which no correlation could be computed at all
correlation_matrix = correlation_matrix.dropna(how='all', axis=1)

# Plot correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix.astype(float), annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix between Sectoral Indices and Macroeconomic Variables")
plt.show()

# Overlay the Close series of every sectoral index on a single chart
fig, ax = plt.subplots(figsize=(15, 10))
for index in sectoral_indices:
    ax.plot(sectoral_data[index]['Close'], label=index)
ax.set_title('Sectoral Indices Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
ax.legend()
plt.show()

# Overlay every macroeconomic variable on a single chart
fig, ax = plt.subplots(figsize=(15, 10))
for col in macro_data.columns:
    ax.plot(macro_data[col], label=col)
ax.set_title('Macroeconomic Variables Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
plt.show()


In [None]:
# Create individual plots for each macroeconomic variable, two per row
num_cols = len(macro_data.columns)
# Round up to whole rows; keep at least one row so plt.subplots never
# receives a zero-row grid
num_rows = max((num_cols + 1) // 2, 1)

# squeeze=False always yields a 2-D axes array, even for a single row
fig, axes = plt.subplots(num_rows, 2, figsize=(15, 5*num_rows), squeeze=False)

# Plot each macroeconomic variable in its own subplot
for i, col in enumerate(macro_data.columns):
    ax = axes[i // 2, i % 2]
    ax.plot(macro_data[col])
    ax.set_title(col + ' Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    # Add grid for better readability
    ax.grid(True)

# Hide the leftover axes when the grid has more cells than variables
for unused_ax in axes.ravel()[num_cols:]:
    unused_ax.set_visible(False)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Regressors to evaluate, each paired with the name used in the results
_model_specs = [
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("LinearRegression", LinearRegression()),
    ("SVR", SVR()),
    ("KNeighbors", KNeighborsRegressor()),
    ("XGBRegressor", XGBRegressor()),
]
models = [estimator for _label, estimator in _model_specs]
model_names = [label for label, _estimator in _model_specs]
results = {}

# Cleaned broad market frames, keyed by ticker
broad_market_data = {
    ticker: pd.read_csv(f"{ticker}_cleaned.csv", index_col="Date", parse_dates=True)
    for ticker in broad_market_indices
}

# Restrict the macro data to the dates it shares with the first broad market index
common_dates = macro_data.index.intersection(
    broad_market_data[broad_market_indices[0]].index
)
macro_data = macro_data.loc[common_dates]

# Feature matrix X: one row per common date, one column per macro variable
X = macro_data.values

# Function to train and evaluate model
def train_and_evaluate(index, data, model):
    """Fit ``model`` on the macro features to predict one index's Close price.

    Relies on the module-level ``X`` (feature matrix) and ``common_dates``
    (the dates its rows correspond to).

    Args:
        index: Ticker symbol of the index being modelled. Not used inside the
            function; kept so existing callers keep working.
        data: DataFrame for that index with a ``Close`` column, indexed so
            that ``data.loc[common_dates]`` is valid.
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place.

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` maps "Train RMSE",
        "Test RMSE", "Train MAE", "Test MAE", "Train R2" and "Test R2" to
        built-in floats (safe to pass to ``json.dump``).
    """
    y = data.loc[common_dates, 'Close'].values

    # NOTE(review): this is a shuffled split of date-ordered rows, so test
    # rows are interleaved with training rows in time — confirm that is the
    # intended evaluation protocol.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    def _rmse(actual, predicted):
        # The `squared=False` argument of mean_squared_error was deprecated
        # and later removed from scikit-learn; taking the root explicitly
        # works on every version.
        return float(mean_squared_error(actual, predicted) ** 0.5)

    return model, {
        "Train RMSE": _rmse(y_train, y_pred_train),
        "Test RMSE": _rmse(y_test, y_pred_test),
        "Train MAE": float(mean_absolute_error(y_train, y_pred_train)),
        "Test MAE": float(mean_absolute_error(y_test, y_pred_test)),
        "Train R2": float(r2_score(y_train, y_pred_train)),
        "Test R2": float(r2_score(y_test, y_pred_test)),
    }



# Restrict every broad market frame to the common date range
broad_market_data.update(
    {ticker: broad_market_data[ticker].loc[common_dates] for ticker in broad_market_indices}
)

# Evaluate every model on every index: sectoral first, then broad market
evaluation_targets = sectoral_indices + broad_market_indices

for model, name in zip(models, model_names):
    results[name] = {}
    for index in evaluation_targets:
        if index in sectoral_indices:
            target_frame = sectoral_data[index]
        else:
            target_frame = broad_market_data[index]
        # Only the metrics dict (second element of the returned tuple) is kept
        results[name][index] = train_and_evaluate(index, target_frame, model)[1]

# Display performance results, grouped by model
for name in model_names:
    print(f"Performance of {name}:")
    for index in evaluation_targets:
        print(f"{index}: {results[name][index]}")
    print()



In [None]:
# Function to compare the performance of sectoral indices against NIFTY 50
def compare_performance(results, model_names, sectoral_indices, broad_market_index="^NSEI"):
    """Print, per model, which sectoral indices beat the benchmark on Test R2.

    Args:
        results: Nested mapping ``results[model_name][index]`` -> metrics
            dict containing a "Test R2" entry.
        model_names: Model names to report on, in output order.
        sectoral_indices: Index symbols to compare against the benchmark.
        broad_market_index: Symbol of the benchmark index within ``results``.

    Returns:
        None. The comparison is written to standard output.
    """
    for name in model_names:
        print(f"Comparing performance using {name}:")
        benchmark = results[name][broad_market_index]
        for index in sectoral_indices:
            candidate = results[name][index]
            # "Outperforms" means a strictly higher Test R2 than the benchmark
            if not candidate["Test R2"] > benchmark["Test R2"]:
                print(f"{index} does not outperform {broad_market_index}")
                continue
            print(f"{index} outperforms {broad_market_index} with a Test R2 of {candidate['Test R2']} compared to {benchmark['Test R2']}")
        print()

# Report which sectoral indices beat the default benchmark for each model
compare_performance(
    results=results,
    model_names=model_names,
    sectoral_indices=sectoral_indices,
)

In [None]:
import json

# Save the results to a file.
# Serialise before opening the file: opening with "w" truncates it, so a
# serialisation error part-way through would otherwise leave a corrupt or
# empty model_performance.json behind.
serialized_results = json.dumps(results, indent=4)
with open("model_performance.json", "w") as file:
    file.write(serialized_results)

# Function to display performance in a readable format
def display_performance(results):
    """Print the nested results mapping as an indented, readable report.

    Args:
        results: Mapping ``model name -> {index -> {metric name -> value}}``.

    Returns:
        None. One header line is printed per model, one line per index, one
        indented line per metric, and a blank line after each model.
    """
    for model_label, per_index in results.items():
        print(f"Performance of {model_label}:")
        for index_label, metric_map in per_index.items():
            print(f"{index_label}:")
            for metric_label, metric_value in metric_map.items():
                print(f"  {metric_label}: {metric_value}")
        print()

# Print the full per-model, per-index metric report
display_performance(results=results)
