In [None]:
# pip install yfinance --upgrade

In [None]:
# Importing required modules
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from scipy.linalg import svd
from sklearn.datasets import load_iris
from scipy import linalg


## First Part of Project ##

Q1 & Q2-A&B

In [None]:
# Define the stock symbol and date range
symbol = 'INTC'
start_date = '2010-01-01'
end_date = '2023-07-01'

# Download data from Yahoo Finance
Intel = yf.download(symbol, start=start_date, end=end_date)

# Forward fill missing values in the downloaded rows.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1.
Intel = Intel.ffill()

# Sample the data to a daily (calendar-day) frequency; days without a
# downloaded row become NaN rows.
Intel = Intel.resample('D').last()

# Fill the NaN rows introduced by the resampling with linear interpolation
Intel = Intel.interpolate(method='linear')

# Save the data to a CSV file
# NOTE(review): index=False writes the file without the date index -
# confirm that dropping the dates from the CSV is intended.
file_path1 = 'Intel_data.csv'
Intel.to_csv(file_path1, index=False)

# Print the data
print(Intel)


In [None]:
# Define the stock symbol and date range
symbol = 'AMD'
start_date = '2010-01-01'
end_date = '2023-07-01'

# Download data from Yahoo Finance
AMD = yf.download(symbol, start=start_date, end=end_date)

# Forward fill missing values in the downloaded rows.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1.
AMD = AMD.ffill()

# Sample the data to a daily (calendar-day) frequency; days without a
# downloaded row become NaN rows.
AMD = AMD.resample('D').last()

# Fill the NaN rows introduced by the resampling with linear interpolation
AMD = AMD.interpolate(method='linear')

# Save the data to a CSV file
# NOTE(review): index=False writes the file without the date index -
# confirm that dropping the dates from the CSV is intended.
file_path2 = 'AMD_data.csv'
AMD.to_csv(file_path2, index=False)

# Print the data
print(AMD)


Q2-B (continued)

In [None]:
# Count the NaN entries left in every column of each share
missing_values_Intel = Intel.isnull().sum()
missing_values_AMD = AMD.isnull().sum()

# Show the Intel report, a blank spacer, then the AMD report
for report in (missing_values_Intel, '\n', missing_values_AMD):
    print(report)


Q2-C

In [None]:
def normalize_data(Data):
    """Z-score every column of a DataFrame.

    Each column has its mean (``np.mean`` along axis 0) subtracted and is
    then divided by its standard deviation (``np.std`` along axis 0).

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame whose columns are normalized; ``Data.columns`` is read.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns holding the normalized values.
    """
    column_means = np.mean(Data, axis=0)
    column_stds = np.std(Data, axis=0)

    centered = Data - column_means
    scaled = centered / column_stds

    # Re-wrap the result under the original column labels
    return pd.DataFrame(scaled, columns=Data.columns)


In [None]:
# Z-score both shares with normalize_data
norm_data_Intel, norm_data_AMD = normalize_data(Intel), normalize_data(AMD)

# Print each normalized frame under its heading, with a blank spacer between
print(
    'Intel Normalized',
    norm_data_Intel,
    '\n',
    'AMD Normalized',
    norm_data_AMD,
    sep='\n',
)

# norm_data_Intel.describe()
# norm_data_AMD.describe()


Q2-D

In [None]:
# Perform stationarity transformation (first-order differencing).
# diff() is applied to EVERY column of the normalized frames (not only the
# close prices, despite the variable names); rows containing NaN - at least
# the first row, which has no predecessor - are dropped.
Close_diff_Intel = norm_data_Intel.diff().dropna()

print(Close_diff_Intel)

print('\n')

Close_diff_AMD = norm_data_AMD.diff().dropna()

print(Close_diff_AMD)

# Seeing result on plot for close prices as an example
plt.style.use('bmh')

for close_diff in (Close_diff_Intel['Close'], Close_diff_AMD['Close']):
    fig, ax = plt.subplots(figsize=(15, 6), dpi=500)
    close_diff.plot(ax=ax, linewidth=0.7)
    ax.set_title('Close', fontsize=20, fontweight='bold')
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Price", fontsize=12)
    ax.grid(color='white', linestyle='--', linewidth=0.8)
    # Format the date tick labels of THIS figure. Calling
    # plt.gcf().autofmt_xdate() before any figure exists (as the cell used
    # to do) only creates and formats a stray empty figure.
    fig.autofmt_xdate()


Q3-A

In [None]:
column_names = norm_data_Intel.columns

plt.style.use('bmh')

# One figure per normalized Intel column
for column_name in column_names:
    fig, ax = plt.subplots(figsize=(15, 6), dpi=500)
    ax.plot(norm_data_Intel[column_name], linewidth=0.7)
    ax.set_title(column_name, fontsize=20, fontweight='bold')
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Price", fontsize=12)
    ax.grid(color='white', linestyle='--', linewidth=0.8)
    # Format the date tick labels of the figure just drawn. The previous
    # plt.gcf().autofmt_xdate() ran before the loop, so it created and
    # formatted an extra empty figure instead of these plots.
    fig.autofmt_xdate()

plt.show()


In [None]:
column_names = norm_data_AMD.columns

plt.style.use('bmh')

# One figure per normalized AMD column
for column_name in column_names:
    fig, ax = plt.subplots(figsize=(15, 6), dpi=500)
    ax.plot(norm_data_AMD[column_name], linewidth=0.7)
    ax.set_title(column_name, fontsize=20, fontweight='bold')
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Price", fontsize=12)
    ax.grid(color='white', linestyle='--', linewidth=0.8)
    # Format the date tick labels of the figure just drawn. The previous
    # plt.gcf().autofmt_xdate() ran before the loop, so it created and
    # formatted an extra empty figure instead of these plots.
    fig.autofmt_xdate()

plt.show()


Q3-B

Intel Share Stats

In [None]:
# Tag every row with its calendar year (adds a 'Year' column to Intel)
Intel['Year'] = Intel.index.year

# Per-year summary statistics of the close, adjusted close and volume columns
pivot_table_intel = pd.pivot_table(
    Intel,
    index='Year',
    values=['Close', 'Adj Close', 'Volume'],
    aggfunc=['mean', 'median', 'sum', 'max', 'min', 'std', 'var'],
)

# Cell output: the table rendered with a blue-green background gradient
pivot_table_intel.style.background_gradient(cmap='BuGn')

# pivot_table_intel


AMD Share Stats

In [None]:
# Tag every row with its calendar year (adds a 'Year' column to AMD)
AMD['Year'] = AMD.index.year
# print(AMD)

# Per-year summary statistics of the adjusted close, close and volume columns
pivot_table_AMD = AMD.pivot_table(values=['Adj Close','Close','Volume'], index='Year', aggfunc=['mean','median','sum','max','min','std','var'])

# Attach the title through the Styler caption. Assigning
# `pivot_table_AMD.title = ...` only stored a plain Python attribute on the
# DataFrame, which is never rendered with the table.
pivot_table_AMD.style.set_caption("AMD Share").bar(color='seagreen')

# pivot_table_AMD


Q3-C

In [None]:
column_names = norm_data_Intel.columns
name1 = []
name2 = []

# Visit every unordered pair of distinct columns exactly once: each column
# is paired only with the columns that come after it. name1/name2 record the
# first/second member of every reported pair.
for position, column_name1 in enumerate(column_names):
    for column_name2 in column_names[position + 1:]:
        name1.append(column_name1)
        name2.append(column_name2)
        first_series = norm_data_Intel[column_name1]
        second_series = norm_data_Intel[column_name2]
        correlation = first_series.corr(second_series)
        print(f'Correlation between {column_name1} and {column_name2}:', correlation)
    

In [None]:
column_names = norm_data_AMD.columns
name1 = []
name2 = []

# Visit every unordered pair of distinct columns exactly once: each column
# is paired only with the columns that come after it. name1/name2 record the
# first/second member of every reported pair.
for position, column_name1 in enumerate(column_names):
    for column_name2 in column_names[position + 1:]:
        name1.append(column_name1)
        name2.append(column_name2)
        first_series = norm_data_AMD[column_name1]
        second_series = norm_data_AMD[column_name2]
        correlation = first_series.corr(second_series)
        print(f'Correlation between {column_name1} and {column_name2}:', correlation)
    

Q4-A-Part1

In [None]:
# 7-day rolling mean of every normalized Intel column; rows that contain
# NaN (the incomplete leading windows) are dropped.
Moving_intel_weekly = norm_data_Intel.rolling(window=7).mean().dropna()
column_names = Moving_intel_weekly.columns

Moving_intel_weekly


In [None]:
# 90-day rolling mean of every normalized Intel column; rows that contain
# NaN (the incomplete leading windows) are dropped.
Moving_intel_seasonly = norm_data_Intel.rolling(window=90).mean().dropna()
column_names = Moving_intel_seasonly.columns

Moving_intel_seasonly


In [None]:
# 7-day rolling mean of every normalized AMD column; rows that contain
# NaN (the incomplete leading windows) are dropped.
Moving_AMD_weekly = norm_data_AMD.rolling(window=7).mean().dropna()
column_names = Moving_AMD_weekly.columns

Moving_AMD_weekly


In [None]:
# 90-day rolling mean of every normalized AMD column; rows that contain
# NaN (the incomplete leading windows) are dropped.
Moving_AMD_seasonly = norm_data_AMD.rolling(window=90).mean().dropna()
column_names = Moving_AMD_seasonly.columns

Moving_AMD_seasonly


Q4-A-Part2

In [None]:
# 30-day rolling mean of every normalized Intel column (leading NaNs kept)
Moving_intel_monthly = norm_data_Intel.rolling(window=30).mean()
column_names = Moving_intel_monthly.columns

# One figure per column: the normalized series against its 30-day average
for column_name in column_names:
    raw_series = norm_data_Intel[column_name]
    smoothed_series = Moving_intel_monthly[column_name]

    plt.figure(figsize=(15, 6), dpi=500)
    plt.plot(raw_series.index, raw_series, label='Original', linewidth=0.7)
    plt.plot(
        smoothed_series.index,
        smoothed_series,
        label='30-day Moving Average',
        linewidth=0.9,
        color='orange',
    )
    plt.title(f'Intel Share {column_name} with 30-day Moving Average', fontsize=20, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Price', fontsize=12)
    plt.grid(color='white', linestyle='--', linewidth=0.8)
    plt.legend()
    plt.show()
    

In [None]:
# 30-day rolling mean of every normalized AMD column (leading NaNs kept)
Moving_AMD_monthly = norm_data_AMD.rolling(window=30).mean()
column_names = Moving_AMD_monthly.columns

# One figure per column: the normalized series against its 30-day average
for column_name in column_names:
    raw_series = norm_data_AMD[column_name]
    smoothed_series = Moving_AMD_monthly[column_name]

    plt.figure(figsize=(15, 6), dpi=500)
    plt.plot(raw_series.index, raw_series, label='Original', linewidth=0.7)
    plt.plot(
        smoothed_series.index,
        smoothed_series,
        label='30-day Moving Average',
        linewidth=0.9,
        color='orange',
    )
    plt.title(f'AMD Share {column_name} with 30-day Moving Average', fontsize=20, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Price', fontsize=12)
    plt.grid(color='white', linestyle='--', linewidth=0.8)
    plt.legend()
    plt.show()
    

Q4-B

In [None]:
sample_Intel = norm_data_Intel.copy()
sample_Intel['Year'] = sample_Intel.index.year
sample_Intel['Month'] = sample_Intel.index.month

column_names = sample_Intel.columns

# 'Year' and 'Month' are grouping keys, not data: exclude them by label
# rather than by position so the selection does not depend on column order.
for column_name in column_names.drop(['Year', 'Month']):

    # Mean of the column for every (Year, Month); unstack() moves the months
    # to the columns, so each plotted line is one calendar month followed
    # across the years.
    trend = sample_Intel.groupby(['Year', 'Month'])[column_name].mean().unstack()

    ax = trend.plot(figsize=(25, 10))
    # The figure shows monthly means per year; the former title
    # ("90-day Moving Average") and x label ("Date") did not match the data.
    ax.set_title(f'Intel Share {column_name} Monthly Mean by Year', fontsize=20, fontweight='bold')
    ax.set_xlabel('Year', fontsize=15)
    ax.set_ylabel('Price', fontsize=15)
    ax.grid(color='white', linestyle='--', linewidth=0.8)
    ax.legend(title='Month')
    plt.show()
    

In [None]:
sample_AMD = norm_data_AMD.copy()
sample_AMD['Year'] = sample_AMD.index.year
sample_AMD['Month'] = sample_AMD.index.month

column_names = sample_AMD.columns

# 'Year' and 'Month' are grouping keys, not data: exclude them by label
# rather than by position so the selection does not depend on column order.
for column_name in column_names.drop(['Year', 'Month']):

    # Mean of the column for every (Year, Month); unstack() moves the months
    # to the columns, so each plotted line is one calendar month followed
    # across the years.
    trend = sample_AMD.groupby(['Year', 'Month'])[column_name].mean().unstack()

    ax = trend.plot(figsize=(25, 10))
    # The figure shows monthly means per year; the former title
    # ("90-day Moving Average") and x label ("Date") did not match the data.
    ax.set_title(f'AMD Share {column_name} Monthly Mean by Year', fontsize=20, fontweight='bold')
    ax.set_xlabel('Year', fontsize=15)
    ax.set_ylabel('Price', fontsize=15)
    ax.grid(color='white', linestyle='--', linewidth=0.8)
    ax.legend(title='Month')
    plt.show()
    

Q5-A

In [None]:
def normalize(Data):
    """Return ``Data`` with every column standardized.

    The per-column mean (``np.mean``, axis 0) is subtracted and the result
    is divided by the per-column standard deviation (``np.std``, axis 0).

    Parameters
    ----------
    Data : pandas.DataFrame
        Input frame; its column labels are reused for the result.

    Returns
    -------
    pandas.DataFrame
        The standardized values under the original column labels.
    """
    offset = np.mean(Data, axis=0)
    spread = np.std(Data, axis=0)
    standardized = (Data - offset) / spread

    return pd.DataFrame(standardized, columns=Data.columns)


Q5-B

In [None]:
start_date = '2013-01-01'
end_date = '2023-07-01'


def get(import_list):
    """Download, gap-fill and print the daily history of each ticker.

    For every symbol in ``import_list`` the data between the module-level
    ``start_date`` and ``end_date`` is downloaded with ``yf.download``,
    forward-filled, resampled to calendar-day frequency and linearly
    interpolated. A ``Volume`` column whose sum is zero is dropped before
    the frame is printed.

    Parameters
    ----------
    import_list : iterable of str
        Ticker symbols passed one by one to ``yf.download``.

    Returns
    -------
    None
        The prepared frames are only printed.
    """
    for item in import_list:

        print('\n')
        print(item)
        slot = yf.download(item, start=start_date, end=end_date)

        # DataFrame.ffill() replaces fillna(method='ffill'), which is
        # deprecated since pandas 2.1.
        slot = slot.ffill()

        # Sample the data to a daily frequency
        slot = slot.resample('D').last()

        # Fill missing values with linear interpolation
        slot = slot.interpolate(method='linear')

        # NOTE(review): presumably a zero-sum Volume means the instrument
        # reports no traded volume - confirm.
        if slot['Volume'].sum() == 0:
            slot = slot.drop('Volume', axis=1)

        print(slot)


import_list = ['EURUSD=X', 'USDSAR=X', 'USDCNY=X', '^IRX', 'BTC-USD', 'GC=F', 'HG=F', 'ZW=F']
get(import_list)


In [None]:
# Date range read by the download helpers of this cell
start_date = '2013-01-01'
end_date = '2023-07-01'


def normalize(data):
    """Standardize each column of ``data`` to a z-score.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to standardize; its column labels are reused.

    Returns
    -------
    pandas.DataFrame
        ``(data - mean) / std`` computed per column with ``np.mean`` and
        ``np.std`` along axis 0.
    """
    mean_per_column = np.mean(data, axis=0)
    std_per_column = np.std(data, axis=0)
    z_scores = (data - mean_per_column) / std_per_column
    return pd.DataFrame(z_scores, columns=data.columns)

def apply_pca(datas, n_components):
    """Project ``datas`` onto its leading right singular vectors.

    Rows containing NaN are dropped, the remaining matrix is decomposed with
    SVD and projected onto the first ``n_components`` right singular
    vectors. No centering is done here; the input is used as given.

    Parameters
    ----------
    datas : pandas.DataFrame
        Observations in rows, features in columns.
    n_components : int
        Number of components to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1`` .. ``PC<n_components>``, indexed like the rows of
        ``datas`` that survived ``dropna``.

    Raises
    ------
    ValueError
        If ``n_components`` exceeds the number of available singular vectors.
    """
    # Remove rows with missing values
    datas = datas.dropna()

    # scipy returns the right singular vectors as ROWS (V transposed);
    # U and the singular values are not needed for the projection.
    _, _, Vt = svd(datas, full_matrices=False)

    if n_components > Vt.shape[0]:
        raise ValueError(
            f"n_components={n_components} exceeds the {Vt.shape[0]} available components"
        )

    # Select the top 'n_components' singular vectors and project onto them
    reduced_data = datas.to_numpy() @ Vt[:n_components].T
    columns = [f"PC{i+1}" for i in range(n_components)]

    # Keep the input's row index so every score row can be traced back to
    # its observation (previously the result fell back to a 0..n-1 index).
    return pd.DataFrame(reduced_data, columns=columns, index=datas.index)

def PCA(import_list):
    """Print the two-component PCA scores of each ticker's daily history.

    For every symbol in ``import_list`` the data between the module-level
    ``start_date`` and ``end_date`` is downloaded with ``yf.download``,
    forward-filled, resampled to calendar-day frequency and linearly
    interpolated. A ``Volume`` column whose sum is zero is dropped. The
    frame is then passed through ``normalize`` and ``apply_pca`` (two
    components) and the result is printed.

    Parameters
    ----------
    import_list : iterable of str
        Ticker symbols passed one by one to ``yf.download``.

    Returns
    -------
    None
    """
    for item in import_list:

        print('\n')
        print(item)
        slot = yf.download(item, start=start_date, end=end_date)

        # DataFrame.ffill() replaces fillna(method='ffill'), which is
        # deprecated since pandas 2.1.
        slot = slot.ffill()

        # Sample the data to a daily frequency
        slot = slot.resample('D').last()

        # Fill missing values with linear interpolation
        slot = slot.interpolate(method='linear')

        if slot['Volume'].sum() == 0:
            slot = slot.drop('Volume', axis=1)

        normalized_data = normalize(slot)

        # Reduce the normalized columns to two components
        n_components = 2  # Number of dimensions/components after PCA
        reduced_data = apply_pca(normalized_data, n_components)

        # Print the reduced data
        print(f'PCA for {item}')
        print(reduced_data)


import_list = ['EURUSD=X', 'USDSAR=X', 'USDCNY=X', '^IRX', 'BTC-USD', 'GC=F', 'HG=F', 'ZW=F']
PCA(import_list)


Q5-C

In [None]:
# Date range read by the download helpers of this cell
start_date = '2013-01-01'
end_date = '2023-07-01'


def normalize(data):
    """Center and scale every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalize; its column labels are reused.

    Returns
    -------
    pandas.DataFrame
        Column-wise ``(data - np.mean) / np.std`` (both along axis 0).
    """
    centered = data - np.mean(data, axis=0)
    scaled = centered / np.std(data, axis=0)
    return pd.DataFrame(scaled, columns=data.columns)

def apply_pca(datas, n_components):
    """Return the explained-variance ratio of every singular direction.

    Rows containing NaN are dropped and the singular values ``S`` of the
    remaining matrix are computed; the ratio is ``S**2 / sum(S**2)``.

    Parameters
    ----------
    datas : pandas.DataFrame
        Observations in rows, features in columns.
    n_components : int
        Kept for call compatibility; the ratio is returned for ALL
        components, so this value does not affect the result.

    Returns
    -------
    numpy.ndarray
        One ratio per singular value, in the order scipy returns them.
    """
    # Remove rows with missing values
    datas = datas.dropna()

    # Only the singular values are needed, so skip computing U and V.
    # (The projection and PC-score frame built here before were never used.)
    S = svd(datas, compute_uv=False)

    return (S ** 2) / np.sum(S ** 2)

def PCA(import_list):
    """Print the explained-variance ratios of each ticker's daily history.

    For every symbol in ``import_list`` the data between the module-level
    ``start_date`` and ``end_date`` is downloaded with ``yf.download``,
    forward-filled, resampled to calendar-day frequency and linearly
    interpolated. A ``Volume`` column whose sum is zero is dropped. The
    frame is passed through ``normalize`` and ``apply_pca``, and the
    returned ratios are printed with three decimals.

    Parameters
    ----------
    import_list : iterable of str
        Ticker symbols passed one by one to ``yf.download``.

    Returns
    -------
    None
    """
    for item in import_list:

        print('\n')
        print(item)
        slot = yf.download(item, start=start_date, end=end_date)

        # DataFrame.ffill() replaces fillna(method='ffill'), which is
        # deprecated since pandas 2.1.
        slot = slot.ffill()

        # Sample the data to a daily frequency
        slot = slot.resample('D').last()

        # Fill missing values with linear interpolation
        slot = slot.interpolate(method='linear')

        if slot['Volume'].sum() == 0:
            slot = slot.drop('Volume', axis=1)

        normalized_data = normalize(slot)

        n_components = 2  # Number of dimensions/components after PCA
        explained_variance_ratio = apply_pca(normalized_data, n_components)

        print(f"Explained Variance Ratio for {item}:")
        explained_variance_str = ", ".join(f"{ratio:.3f}" for ratio in explained_variance_ratio)
        print(f"PCA: {explained_variance_str}")



import_list = ['EURUSD=X', 'USDSAR=X', 'USDCNY=X', '^IRX', 'BTC-USD', 'GC=F', 'HG=F', 'ZW=F']
PCA(import_list)


In [None]:
# Date range read by the download helpers of this cell
start_date = '2013-01-01'
end_date = '2023-07-01'

def normalize(data):
    """Z-score ``data`` column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalize; its column labels are reused.

    Returns
    -------
    pandas.DataFrame
        ``(data - np.mean(data, axis=0)) / np.std(data, axis=0)``.
    """
    location = np.mean(data, axis=0)
    scale = np.std(data, axis=0)
    return pd.DataFrame((data - location) / scale, columns=data.columns)

def apply_pca(datas, n_components):
    """Return explained-variance ratios and the right singular vectors.

    Rows containing NaN are dropped and the remaining matrix is decomposed
    with SVD.

    Parameters
    ----------
    datas : pandas.DataFrame
        Observations in rows, features in columns.
    n_components : int
        Kept for call compatibility; it does not affect either return value.

    Returns
    -------
    explained_variance_ratio : numpy.ndarray
        ``S**2 / sum(S**2)`` for every singular value ``S``.
    Vt : numpy.ndarray
        Third output of ``scipy.linalg.svd``: the right singular vectors
        stored as ROWS (i.e. V transposed).
    """
    # Remove rows with missing values
    datas = datas.dropna()

    # Perform Singular Value Decomposition (SVD); U is not needed.
    # (The projection and PC-score frame built here before were never used.)
    _, S, Vt = svd(datas, full_matrices=False)

    explained_variance_ratio = (S ** 2) / np.sum(S ** 2)

    return explained_variance_ratio, Vt


def PCA(import_list):
    """Print explained-variance ratios and draw a scree plot per ticker.

    For every symbol in ``import_list`` the data between the module-level
    ``start_date`` and ``end_date`` is downloaded with ``yf.download``,
    forward-filled, resampled to calendar-day frequency and linearly
    interpolated. A ``Volume`` column whose sum is zero is dropped. The
    frame is passed through ``normalize`` and ``apply_pca``; the returned
    ratios are printed and plotted against the component number.

    Parameters
    ----------
    import_list : iterable of str
        Ticker symbols passed one by one to ``yf.download``.

    Returns
    -------
    None
    """
    for item in import_list:

        print('\n')
        print(item)
        slot = yf.download(item, start=start_date, end=end_date)

        # DataFrame.ffill() replaces fillna(method='ffill'), which is
        # deprecated since pandas 2.1.
        slot = slot.ffill()

        # Sample the data to a daily frequency
        slot = slot.resample('D').last()

        # Fill missing values with linear interpolation
        slot = slot.interpolate(method='linear')

        if slot['Volume'].sum() == 0:
            slot = slot.drop('Volume', axis=1)

        normalized_data = normalize(slot)

        n_components = 2  # Number of dimensions/components after PCA
        explained_variance_ratio, principal_components = apply_pca(normalized_data, n_components)

        print(f"Explained Variance Ratio for {item}:")
        explained_variance_str = ", ".join(f"{ratio:.3f}" for ratio in explained_variance_ratio)
        print(f"PCA1: {explained_variance_str}")

        # Create a scree plot: one point per component, in returned order
        component_numbers = range(1, len(explained_variance_ratio) + 1)
        plt.figure(figsize=(15, 6), dpi=500)
        plt.plot(component_numbers, explained_variance_ratio, marker='o', linewidth=0.9)
        plt.xlabel('Principal Component', fontsize=12)
        plt.ylabel('Variance Explained Ratio', fontsize=12)
        plt.title(f"Scree Plot for {item}", fontsize=20, fontweight='bold')
        plt.grid(color='white', linestyle='--', linewidth=0.8)
        plt.show()

        # # Create a biplot
        # feature_names = normalized_data.columns
        # plt.figure(figsize=(15, 6), dpi=500)
        # plt.grid(color = 'white', linestyle = '--', linewidth = 0.8, alpha=0.8)
        
        # from itertools import cycle
        # cycol = cycle('bgrcmk')

        # color_legend = []
        # position = ['left', 'right', 'center', 'right', 'center', 'left']
        # j = 0
        
        # for i, feature in enumerate(feature_names):
            
        #     color = plt.plot(0, 0, color = next(cycol), alpha=0.5, linewidth=0.9, label=feature)
        #     color_legend.append(color[0])
        #     plt.arrow(0, 0, principal_components[0, i], principal_components[1, i], color=color[0].get_color(), alpha=0.5, linewidth=0.9)
        #     plt.text(principal_components[0, i]*1.1, principal_components[1, i]*1.1, feature, color=color[0].get_color(), rotation = 20, rotation_mode='anchor', ha = position[j])
        #     # plt.text(principal_components[0, i]*1.1, principal_components[1, i]*1.1, feature, color=color[0].get_color())
        #     j += 1
        
        # plt.legend(handles=color_legend)
            
        # plt.xlim(-1.25, 1.25)
        # plt.ylim(-1.25, 1.25)
        # plt.xlabel('PC1', fontsize=12)
        # plt.ylabel('PC2', fontsize=12)
        # plt.title(f"Biplot for {item}", fontsize=20, fontweight='bold')
        # plt.grid(color = 'white', linestyle = '--', linewidth = 0.8, alpha=0.5)
        # plt.show()
        


# Tickers to analyse, then run PCA() over all of them
import_list = [
    'EURUSD=X', 'USDSAR=X', 'USDCNY=X', '^IRX',
    'BTC-USD', 'GC=F', 'HG=F', 'ZW=F',
]
PCA(import_list)


In [None]:
# Tickers to analyse and the date range to download
import_list = [
    'EURUSD=X', 'USDSAR=X', 'USDCNY=X', '^IRX',
    'BTC-USD', 'GC=F', 'HG=F', 'ZW=F',
]
start_date = '2013-01-01'
end_date = '2023-07-01'

def normalize(data):
    """Z-score ``data`` along axis 0 and return the result unchanged in type.

    The mean (``np.mean``) is subtracted and the result is divided by the
    standard deviation (``np.std``), both taken along axis 0. Unlike a
    DataFrame-wrapping variant, the arithmetic result is returned directly.
    """
    column_mean = np.mean(data, axis=0)
    column_std = np.std(data, axis=0)
    return (data - column_mean) / column_std

def apply_pca(datas, n_components):
    """PCA through the eigen-decomposition of the covariance matrix.

    The columns of ``datas`` are centered, their covariance matrix is
    decomposed, and the centered data is projected onto the eigenvectors
    belonging to the ``n_components`` largest eigenvalues.

    Parameters
    ----------
    datas : numpy.ndarray
        2-D array, observations in rows and features in columns.
    n_components : int
        Number of leading eigenvectors to project onto.

    Returns
    -------
    transformed_data : numpy.ndarray
        Centered data projected onto the selected eigenvectors.
    selected_eigenvectors : numpy.ndarray
        The selected eigenvectors, one per column.
    eigenvalues_sorted : numpy.ndarray
        ALL eigenvalues in descending order, as real numbers.
    """
    datas_centered = datas - np.mean(datas, axis=0)
    covariance_matrix = np.atleast_2d(np.cov(datas_centered, rowvar=False))

    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and eigenvectors. linalg.eig returns complex eigenvalues
    # even when the imaginary part is zero, which leaks "+0.00j" into any
    # formatted output.
    eigenvalues, eigenvectors = linalg.eigh(covariance_matrix)

    # eigh sorts ascending; PCA wants the largest variance first
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues_sorted = eigenvalues[sorted_indices]
    eigenvectors_sorted = eigenvectors[:, sorted_indices]

    selected_eigenvectors = eigenvectors_sorted[:, :n_components]
    transformed_data = np.dot(datas_centered, selected_eigenvectors)
    return transformed_data, selected_eigenvectors, eigenvalues_sorted

# For every ticker: download, gap-fill, normalize, run PCA and draw the
# two-component scatter plot coloured by row position.
for item in import_list:
    print('\n')
    print(item)

    # Download data from Yahoo Finance
    slot = yf.download(item, start=start_date, end=end_date)

    # DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
    # since pandas 2.1.
    slot = slot.ffill()

    # Sample the data to a daily frequency
    slot = slot.resample('D').last()

    # Fill missing values with linear interpolation
    slot = slot.interpolate(method='linear')

    if slot['Volume'].sum() == 0:
        slot = slot.drop('Volume', axis=1)

    # Normalize the data (plain ndarray in, ndarray out)
    normalized_data = normalize(slot.values)

    # Apply PCA
    num_components = 2  # Number of components after PCA
    transformed_data, eigenvectors, eigenvalues = apply_pca(normalized_data, num_components)

    # Plot the PCA transformed data
    fig, ax = plt.subplots(figsize=(28, 15))
    sc = ax.scatter(transformed_data[:, 0], transformed_data[:, 1], c=np.arange(len(transformed_data)), cmap='viridis')
    ax.set_xlabel('Principal Component 1', fontsize=10)
    ax.set_ylabel('Principal Component 2', fontsize=10)
    ax.set_title(f'PCA - {item}', fontsize=17, fontweight='bold')

    # Plot the eigenvectors as arrows with eigenvalue labels
    # NOTE(review): the arrow tips use the first two FEATURE coordinates of
    # each eigenvector while the axes are principal-component scores -
    # confirm this is the intended geometry.
    origin = np.mean(transformed_data, axis=0)

    for i in range(num_components):
        tip_dx, tip_dy = eigenvectors[0, i], eigenvectors[1, i]
        ax.arrow(origin[0], origin[1], tip_dx, tip_dy, color='r',
                 width=0.1, head_width=0.3)
        # np.real keeps the label free of a "+0.00j" suffix when the
        # eigenvalues arrive with a complex dtype.
        ax.text(origin[0] + tip_dx, origin[1] + tip_dy,
                f"λ{i+1} = {np.real(eigenvalues[i]):.2f}", fontsize=12)

    # Add a colorbar
    cbar = fig.colorbar(sc, ax=ax)
    cbar.set_label('Data Point Index')
    ax.grid(color='white', linestyle='--', linewidth=0.8, alpha=0.8)
    plt.show()
