Stocks Data Analysis and Visualization in Python

In [None]:
!pip install yfinance
import yfinance as yf
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
from copy import copy
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [None]:
# Load the price table. The code below treats every column after the first
# one as a stock/index and expects a column named 'Date'.
stocks_df = pd.read_csv('stock.csv')
print(stocks_df)

# How many stock/index columns there are.
ticker_columns = stocks_df.columns[1:]
print('Total Number of stocks: {}'.format(len(ticker_columns)))

# A copy of the table ordered by the 'Date' column (stocks_df is not modified).
stock = stocks_df.sort_values(by = ['Date'])
print(stock)

# Names of the stock/index columns, one per line.
print('Stocks under consideration are:')
for ticker in ticker_columns:
  print(ticker)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Summary statistics used to answer:
#   - the average value of sp500,
#   - which stock/index has the minimum dispersion from its mean (in dollars),
#   - the maximum price of AMZN over the period.
# numeric_only = True keeps the non-numeric 'Date' column out of the
# reductions; with the pandas >= 2.0 default (numeric_only = False) a frame
# that mixes strings and numbers makes mean()/std() raise a TypeError.
print(stocks_df.mean(numeric_only = True),
      stocks_df.std(numeric_only = True),
      stocks_df.describe())

In [None]:
# Count the missing values in every column.
print(stocks_df.isnull().sum())

# DataFrame.info() writes its report itself and returns None, so it is called
# as a statement; wrapping it in print() would add a stray "None" line.
stocks_df.info()

# Static matplotlib line chart of a price table.
def show_plot(df, fig_title):
  """Plot the columns of ``df`` as lines against its 'Date' column.

  df: DataFrame containing a 'Date' column (used for the x axis).
  fig_title: title shown above the chart.
  """
  plot_options = dict(x = 'Date', figsize = (15, 7), linewidth = 3, title = fig_title)
  df.plot(**plot_options)
  plt.grid()
  plt.show()

show_plot(df = stocks_df, fig_title = 'Raw Stock Prices (Without Normalization)')

def normalize(df):
  """Return a copy of ``df`` with every price column divided by its first value.

  The first column is skipped (the notebook keeps 'Date' there); every other
  column is rescaled so that its first row becomes 1.

  df: DataFrame whose columns after the first are numeric.
  Returns: a new DataFrame; ``df`` itself is not modified.
  """
  x = df.copy()
  for i in x.columns[1:]:
    # .iloc[0] takes the first row by position. A plain [0] is a lookup of the
    # index *label* 0, which raises or picks another row once the frame has
    # been sorted, sliced or otherwise re-indexed.
    x[i] = x[i] / x[i].iloc[0]
  return x

# Rescale every stock by its first price so the series are comparable, then
# plot. The normalized frame is computed once and reused.
stocks_normalized = normalize(stocks_df)
show_plot(stocks_normalized, 'Normalized Stock Prices')

In [None]:
def interactive_plot(df, title):
  """Show an interactive Plotly chart with one trace per column after the first.

  df: DataFrame with a 'Date' column, which supplies the x values.
  title: chart title.
  """
  figure = px.line(title = title)
  for column in df.columns[1:]:
    trace = dict(x = df['Date'], y = df[column], name = column)
    figure.add_scatter(**trace)
  figure.show()

# Interactive chart of the raw prices, then of the normalized prices.
interactive_plot(df = stocks_df, title = 'Prices')
interactive_plot(df = normalize(stocks_df), title = 'Normalize Prices')

In [None]:
# Daily % return of the sp500 column.
df = stocks_df['sp500']
print(df)

# Vectorised form of ((p[t] - p[t-1]) / p[t-1]) * 100: shift(1) lines every
# price up with the price of the previous row, so no Python-level loop and no
# label-based df[i] lookups are needed, and the result is always float.
previous_price = df.shift(1)
df_daily_return = (df - previous_price) / previous_price * 100

# The first row has no previous price; its return is defined as 0.
df_daily_return.iloc[0] = 0
print(df_daily_return)

In [None]:
# Define a function to calculate stock daily returns (for all stocks)
def daily_return(df):
  """Return a copy of ``df`` whose price columns hold daily percentage returns.

  For every column after the first, row t becomes
  ((price[t] - price[t-1]) / price[t-1]) * 100 and the first row becomes 0.
  The first column (the notebook keeps 'Date' there) is copied unchanged.

  df: DataFrame whose columns after the first are numeric, rows in time order.
  Returns: a new DataFrame with the same index and columns; ``df`` is not
  modified.
  """
  df_daily_return = df.copy()
  price_columns = df.columns[1:]
  if len(df) == 0 or len(price_columns) == 0:
    return df_daily_return

  prices = df[price_columns]
  # shift(1) pairs each row with the previous row by position, so the result
  # does not depend on the index labels. Assigning whole columns also avoids
  # chained assignment (frame[col][row] = value), which does not write through
  # when pandas copy-on-write is active.
  previous = prices.shift(1)
  returns = (prices - previous) / previous * 100
  returns.iloc[0] = 0  # no previous day for the first row
  df_daily_return[price_columns] = returns
  return df_daily_return

stocks_daily_return = daily_return(stocks_df)
print(stocks_daily_return)

# Both plotting helpers draw as a side effect and return None, so they are
# called as statements instead of being passed to print().
show_plot(stocks_daily_return, 'Stocks Daily Returns')
interactive_plot(stocks_daily_return, 'Stocks Daily Return')

In [None]:
# Correlation matrix of the daily returns, drawn as an annotated heatmap.
returns_only = stocks_daily_return.drop(columns = ['Date'])
cm = returns_only.corr()
plt.figure(figsize = (10,10))
sns.heatmap(data = cm, annot = True)

In [None]:
# Distribution of the daily returns: one matplotlib histogram per column,
# followed by a single Plotly distribution plot.
# Author's reading of the charts: returns look normally distributed around
# zero, and Tesla's larger standard deviation marks it as the more volatile stock.
stocks_daily_return.hist(bins = 40, figsize = (10,10))

df_hist = stocks_daily_return.copy().drop(columns = ['Date'])

# One array of return values per stock, in column order.
data = [stocks_daily_return[name].values for name in df_hist.columns]

print(data)

fig = ff.create_distplot(data, df_hist.columns)
fig.show()

Asset Allocation and Statistical Data Analysis

In [None]:
# Order the rows by 'Date' and renumber the index 0..n-1. Later cells address
# rows by integer label ([i] and [i-1]); that only means "this row" and "the
# previous row" when the labels follow the sorted row order.
stocks_df = stocks_df.sort_values(by = ['Date']).reset_index(drop = True)
print(stocks_df)

def normalize(df):
  """Return a copy of ``df`` with every price column divided by its first value.

  The first column is skipped (the notebook keeps 'Date' there); every other
  column is rescaled so that its first row becomes 1.

  df: DataFrame whose columns after the first are numeric.
  Returns: a new DataFrame; ``df`` itself is not modified.
  """
  x = df.copy()
  for i in x.columns[1:]:
    # Divide the whole column by its first value. Dividing the first value by
    # itself would overwrite the column with the constant 1. .iloc[0] takes
    # the first row by position, independent of the index labels.
    x[i] = x[i] / x[i].iloc[0]
  return x

def interactive_plot(df, title):
  """Render an interactive Plotly line chart of ``df``.

  Each column after the first is added as a separate trace, plotted against
  the 'Date' column.

  df: DataFrame with a 'Date' column.
  title: chart title.
  """
  fig = px.line(title = title)
  series_names = df.columns[1:]
  for series_name in series_names:
    fig.add_scatter(x = df['Date'], y = df[series_name], name = series_name)
  fig.show()

# Plot the raw and the normalized prices. interactive_plot() shows the figure
# itself and returns None, so its result is not printed.
interactive_plot(stocks_df, 'Prices')
interactive_plot(normalize(stocks_df), 'Normalized Prices')

In [None]:
# Create random portfolio weights: one per stock/index column, scaled to sum to 1.
np.random.seed(101)
n_assets = len(stocks_df.columns[1:])
weights = np.array(np.random.random(n_assets))
weights = weights/np.sum(weights)
print(weights)

df_portfolio = normalize(stocks_df)
print(df_portfolio)
asset_columns = df_portfolio.columns[1:]
print(asset_columns)

# Dollar value held in each asset per day: normalized price x weight x the
# $1,000,000 initial investment.
for counter, ticker in enumerate(asset_columns):
  df_portfolio[ticker] = df_portfolio[ticker] * weights[counter] * 1000000

# Total portfolio value per day. The asset columns are selected by name so
# that 'Date' never enters the sum. (Comparing the frame with the string
# 'Date' builds a mask over cell *values* and does not remove that column.)
df_portfolio['portfolio daily worth in $'] = df_portfolio[asset_columns].sum(axis = 1)
print(df_portfolio)

# Daily % return of the whole portfolio: change in worth relative to the
# previous day's worth, times 100. The first day has no previous day -> 0.
worth = df_portfolio['portfolio daily worth in $']
previous_worth = worth.shift(1)
daily_pct_return = (worth - previous_worth) / previous_worth * 100
daily_pct_return.iloc[0] = 0.0
df_portfolio['portfolio daily % return'] = daily_pct_return

print(df_portfolio)

In [None]:
def portfolio_allocation(df, weights):
  """Simulate a $1,000,000 portfolio split across the columns of ``df``.

  df: price DataFrame; the first column is skipped (the notebook keeps 'Date'
      there) and every other column is treated as one asset.
  weights: position-indexable sequence with one weight per asset column, in
      column order.
  Returns: a new DataFrame holding, per row, the dollar value of each asset
  plus the columns 'portfolio daily worth in $' and 'portfolio daily % return'
  (0 for the first row). ``df`` is not modified.
  """
  df_portfolio = normalize(df.copy())
  asset_columns = df_portfolio.columns[1:]

  # Normalized price x weight x initial investment = dollars held in the asset.
  for counter, stock in enumerate(asset_columns):
    df_portfolio[stock] = df_portfolio[stock] * weights[counter] * 1000000

  # Sum only the asset columns so that 'Date' never enters the reduction.
  df_portfolio['portfolio daily worth in $'] = df_portfolio[asset_columns].sum(axis = 1)

  # Percentage change of the portfolio worth versus the previous row. Computed
  # on the function's own frame (not a global one) and assigned as a whole
  # column, which avoids chained element-by-element assignment.
  worth = df_portfolio['portfolio daily worth in $']
  previous_worth = worth.shift(1)
  daily_pct_return = (worth - previous_worth) / previous_worth * 100
  if len(daily_pct_return) > 0:
    daily_pct_return.iloc[0] = 0.0  # no previous day for the first row
  df_portfolio['portfolio daily % return'] = daily_pct_return
  return df_portfolio

df_portfolio = portfolio_allocation(stocks_df, weights)
print(df_portfolio)

In [None]:
# 1) Portfolio daily % return over time.
fig = px.line(title = 'Portfolio Daily % Return', x = df_portfolio['Date'], y = df_portfolio['portfolio daily % return'])
fig.show()

# 2) Dollar value held in each individual stock (the two summary columns are
#    removed first so that only the per-stock columns are drawn).
stocks_only = df_portfolio.drop(columns = ['portfolio daily worth in $', 'portfolio daily % return'])
interactive_plot(stocks_only, 'Portfolio Individual stocks worth in $ overtime')

# 3) Histogram of the portfolio daily % return.
fig = px.histogram(data_frame = df_portfolio, x = 'portfolio daily % return')
fig.show()

# 4) Overall portfolio value over time.
fig = px.line(title = 'Portfolio Overall Value in $', x = df_portfolio['Date'], y = df_portfolio['portfolio daily worth in $'])
fig.show()

Capital Asset Pricing Model (CAPM)

In [None]:
# Create a function to calculate daily returns
def daily_returns(df):
  """Return a copy of ``df`` whose price columns hold daily percentage returns.

  For every column after the first, row t becomes
  ((price[t] - price[t-1]) / price[t-1]) * 100 and the first row becomes 0.
  The first column (the notebook keeps 'Date' there) is copied unchanged.

  df: DataFrame whose columns after the first are numeric, rows in time order.
  Returns: a new DataFrame with the same index and columns; ``df`` is not
  modified.
  """
  df_daily_return = df.copy()
  price_columns = df.columns[1:]
  if len(df) == 0 or len(price_columns) == 0:
    return df_daily_return

  prices = df[price_columns]
  # shift(1) pairs each row with the previous row by position, independent of
  # the index labels. Whole-column assignment replaces the chained
  # frame[col][row] = value writes, which do not write through when pandas
  # copy-on-write is active.
  previous = prices.shift(1)
  returns = (prices - previous) / previous * 100
  returns.iloc[0] = 0  # no previous day for the first row
  df_daily_return[price_columns] = returns
  return df_daily_return

stocks_daily_return = daily_returns(stocks_df)
print(stocks_daily_return)

# Calculate Beta for a single stock: AAPL daily returns against sp500 daily returns.
market_returns = stocks_daily_return['sp500']
stock_returns = stocks_daily_return['AAPL']
stocks_daily_return.plot(x = 'sp500', y = 'AAPL', kind = 'scatter')

# Degree-1 least-squares fit: the slope is beta, the intercept is alpha.
beta, alpha = np.polyfit(market_returns, stock_returns, 1)
print('Beta for {} stock is {} and alpha is {}'.format('AAPL', beta, alpha))

# Draw the fitted line over the scatter plot.
fitted_line = beta * market_returns + alpha
plt.plot(market_returns, fitted_line, '-', color = 'r')
plt.show()

Predict Stocks Future Prices Using Machine and Deep Learning

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tensorflow import keras

# Stock price table (read from 'stock.csv').
stocks_price = pd.read_csv(filepath_or_buffer = 'stock.csv')
print(stocks_price)

# Stock volume table (read from 'stock_volume.csv').
stock_vol = pd.read_csv(filepath_or_buffer = 'stock_volume.csv')
print(stock_vol)

In [None]:
# Function to normalize stock prices based on the initial price
def normalize(df):
  """Return a copy of ``df`` with every column after the first divided by its first value.

  df: DataFrame whose first column is skipped (the notebook keeps 'Date'
      there) and whose remaining columns are numeric.
  Returns: a new DataFrame; ``df`` itself is not modified.
  """
  x = df.copy()
  for i in x.columns[1:]:
    # .iloc[0] takes the first row by position. A plain [0] is a lookup of the
    # index *label* 0, which raises or picks another row once the frame has
    # been sorted, sliced or otherwise re-indexed.
    x[i] = x[i] / x[i].iloc[0]
  return x

# Function to plot interactive plots using Plotly Express
def interactive_plot(df, title):
  """Display an interactive Plotly figure of ``df``.

  The figure gets one scatter trace per column after the first, each plotted
  against the 'Date' column.

  df: DataFrame with a 'Date' column.
  title: chart title.
  """
  chart = px.line(title = title)
  for name in list(df.columns)[1:]:
    chart.add_scatter(x = df['Date'], y = df[name], name = name)
  chart.show()

interactive_plot(stocks_price, 'Stock Prices')

# Plotting the volume dataset for all stocks
interactive_plot(stock_vol, 'Stock Volume')

# Plot the normalized stock prices and the normalized volume dataset.
# The title is the second argument of interactive_plot(); normalize() takes
# only the frame.
interactive_plot(normalize(stocks_price), 'Norm Stock Prices')
interactive_plot(normalize(stock_vol), 'Norm Stock Volume')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Function to concatenate the date, stock price and volume in one dataframe
def individual_stock(price_df, vol_df, name):
  """Build a per-stock frame with the columns 'Date', 'Close' and 'Volume'.

  price_df: price table with a 'Date' column and a column called ``name``.
  vol_df: volume table with a column called ``name``.
  name: column (ticker) to extract from both tables.
  Returns: a new DataFrame; 'Close' comes from ``price_df[name]`` and 'Volume'
  from ``vol_df[name]``.
  """
  # Dictionary keys must be hashable, so the column name is the string
  # 'Close', not a list containing it.
  return pd.DataFrame({'Date': price_df['Date'], 'Close': price_df[name], 'Volume': vol_df[name]})

# Function to return the input/output (target) data for AI/ML Model
def trading_window(data, n = 1):
  """Add a 'Target' column holding the 'Close' value ``n`` rows ahead.

  data: DataFrame with a 'Close' column. It is modified in place (the 'Target'
        column is added) and also returned.
  n: how many rows ahead the target lies (default 1, the next row).
  Returns: ``data`` itself. The last ``n`` rows have no future value, so their
  'Target' is NaN.
  """
  # shift(-n) moves the values up by n rows: row t receives Close[t + n].
  data['Target'] = data['Close'].shift(-n)
  return data

price_vol = individual_stock(stocks_price, stock_vol, 'AAPL')
print(price_vol)

price_target = trading_window(price_vol)
print(price_target)

# 'Target' is the close of the following row, so only the LAST row has no
# target (NaN). Drop that row; dropping the first row instead would leave the
# NaN in the data that is scaled and fed to the model.
price_target = price_target[:-1]
print(price_target)

# Scale Close, Volume and Target to the [0, 1] range ('Date' is not numeric).
sc = MinMaxScaler(feature_range = (0,1))
price_target_sd = sc.fit_transform(price_target.drop(columns = ['Date']))
print(price_target_sd)

# Creating a feature and target: the first two scaled columns (Close, Volume)
# are the features, the remaining column (Target) is the output.
x = price_target_sd[:, :2]
y = price_target_sd[:, 2:]

# Define a data plotting function
def show_plot(data, title):
  """Draw ``data`` as a thick line chart with a grid on a new 13x5 figure.

  data: values accepted by matplotlib's plot().
  title: title shown above the chart.
  """
  axes = plt.figure(figsize = (13, 5)).gca()
  axes.plot(data, linewidth = 3)
  axes.set_title(title)
  axes.grid()

In [None]:
from sklearn.linear_model import Ridge
# Create and train the ridge linear regression model.
# The split is positional (first 65% train, last 35% test) with no shuffling.
split = int(0.65 * len(x))
xTrain, xTest = x[:split], x[split:]
yTrain, yTest = y[:split], y[split:]

reg_model = Ridge(alpha = 2)
reg_model.fit(xTrain, yTrain)

# Test the model: score() returns R^2 on the held-out rows.
lr_acc = reg_model.score(xTest, yTest)
print('Ridge Regression Score:', lr_acc)

# Make predictions for every row (train and test) so they can be plotted.
pred_price = reg_model.predict(x)
print(pred_price)

# y is 2-D (one column), so each prediction is a one-element row; keep its value.
predicted = [row[0] for row in pred_price]

# The scaled close price is the first column of the scaled matrix.
close = [row[0] for row in price_target_sd]

# Create a dataframe from the dates of the individual stock data. .copy()
# makes it an independent frame, so adding columns below does not write into
# a slice of price_target (which triggers SettingWithCopyWarning).
df_pred = price_target[['Date']].copy()
print(df_pred)

# Add the close values to the dataframe
df_pred['Close'] = close
print(df_pred)

# Add the predicted values to the dataframe
df_pred['Prediction'] = predicted
print(df_pred)

# Plot the results
interactive_plot(df_pred, 'Original vs Predictions')

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Train an LSTM Time Series Model
price_vdf = individual_stock(stocks_price, stock_vol, 'sp500')
print(price_vdf)

# Columns 1 and 2 are 'Close' and 'Volume'. Column 0 ('Date') is left out
# because it is not numeric and cannot be scaled.
train_data = price_vdf.iloc[:, 1:3].values
print(train_data)

sc = MinMaxScaler(feature_range = (0,1))
training_set = sc.fit_transform(train_data)

# Creating the training and testing data: each sample is the previous row's
# scaled close (a window of length 1), the target is the current scaled close.
x = []
y = []
for i in range(1, len(price_vdf)):
  x.append(training_set[i-1:i, 0])
  y.append(training_set[i, 0])

# Convert the data into array format: x is (samples, 1), y is (samples,).
x = np.asarray(x)
y = np.asarray(y)

# Split the data by position (first 70% train, last 30% test), no shuffling.
split = int(0.7 * len(x))
xTrain, xTest = x[:split], x[split:]
yTrain, yTest = y[:split], y[split:]

# Reshape to (samples, timesteps, features), the layout LSTM layers take.
xTrain = np.reshape(xTrain, (xTrain.shape[0], xTrain.shape[1], 1))
xTest = np.reshape(xTest, (xTest.shape[0], xTest.shape[1], 1))
print(xTrain.shape, xTest.shape)

# Create and train the model. The layer tensors get their own names so that
# the feature array `x` is still available for predict() below.
inputs = keras.layers.Input(shape = (xTrain.shape[1], xTrain.shape[2]))
hidden = keras.layers.LSTM(150, return_sequences = True)(inputs)
hidden = keras.layers.LSTM(150, return_sequences = True)(hidden)
# The last LSTM returns only its final state, so the model emits one value per sample.
hidden = keras.layers.LSTM(150)(hidden)

outputs = keras.layers.Dense(1, activation = 'linear')(hidden)
model = keras.Model(inputs = inputs, outputs = outputs)
model.compile(optimizer = 'adam', loss = 'mse')
model.summary()

history = model.fit(xTrain, yTrain, epochs = 2, batch_size = 32, validation_split = 0.2)

# Make predictions for every sample (train and test) and collect them in a list.
x_all = np.reshape(x, (x.shape[0], x.shape[1], 1))
pred = model.predict(x_all)
test_pred = [row[0] for row in pred]
print(test_pred)

# Scaled close values (first column of the scaled matrix).
close = [row[0] for row in training_set]
print(close)

# Sample k predicts row k + 1, so the first row has no prediction and is left
# out. .copy() makes df_pred independent of price_vdf before columns are set.
df_pred = price_vdf[1:][['Date', 'Close']].copy()
df_pred['Close'] = close[1:]
df_pred['predictions'] = test_pred
print(df_pred)

interactive_plot(df_pred, 'Original Price vs LSTM Predictions')

Perform Bank Market Segmentation Using Unsupervised Machine Learning Techniques

In [None]:
creditcard = pd.read_csv('/content/4.+Marketing_data.csv')
print(creditcard)
# info() writes its report itself and returns None, so it is not passed to print().
creditcard.info()
print(creditcard.describe())

# Row(s) with a one-off purchase of $40761.25. np.isclose() is used instead of
# == because exact equality on floats read from a CSV is fragile.
print(creditcard[np.isclose(creditcard['ONEOFF_PURCHASES'], 40761.25)])

# Row(s) with a cash advance of about $47137.21.
# NOTE(review): the original note here says this customer "never paid credit
# card in full"; this filter alone does not show that - confirm against the
# printed row.
print(creditcard[np.isclose(creditcard['CASH_ADVANCE'], 47137.21176)])

In [None]:
# Finding any missing data (cbar = False hides the colour bar).
sns.heatmap(creditcard.isnull(), yticklabels = False, cbar = False, cmap = 'Blues')
plt.show()
print(creditcard.isnull().sum())

# Fill up missing MINIMUM_PAYMENTS values with the column mean
creditcard['MINIMUM_PAYMENTS'] = creditcard['MINIMUM_PAYMENTS'].fillna(creditcard['MINIMUM_PAYMENTS'].mean())

# Fill up missing elements with mean of the credit limit
creditcard['CREDIT_LIMIT'] = creditcard['CREDIT_LIMIT'].fillna(creditcard['CREDIT_LIMIT'].mean())
print(creditcard.isnull().sum())

# Find duplicated entries in the data
print(creditcard.duplicated().sum())

# Dropping customer IDs. errors = 'ignore' keeps the cell re-runnable after
# the column is already gone.
creditcard = creditcard.drop(columns = ['CUST_ID'], errors = 'ignore')
print(creditcard.head())

n = len(creditcard.columns)
print(n)

# One distribution plot per column, stacked vertically (n rows, 1 column).
# NOTE: sns.distplot is deprecated in seaborn; kept here to preserve the
# separate histogram/KDE colours.
plt.figure(figsize = (10,50))
for i in range(n):
  plt.subplot(n, 1, i+1)
  sns.distplot(creditcard[creditcard.columns[i]], kde_kws = {'color': 'b', 'lw': 3, 'label': 'KDE'}, hist_kws = {'color': 'g'})
  plt.title(creditcard.columns[i])

plt.tight_layout()

# Trend between 'PURCHASES' and 'CREDIT_LIMIT' & 'PAYMENTS'
correlations = creditcard.corr()

# plt.subplots() (plural) returns the (figure, axes) pair unpacked here.
f, ax = plt.subplots(figsize = (20,20))
sns.heatmap(correlations, annot = True, ax = ax)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
# Find the optimal number of clusters using the elbow method
scaler = StandardScaler()
creditcard_sd = scaler.fit_transform(creditcard)
print(creditcard_sd.shape)

# Within-cluster sum of squares (KMeans.inertia_) for 1..19 clusters.
score_1 = []
range_values = range(1,20)

for i in range_values:
  kmeans = KMeans(n_clusters = i)
  kmeans.fit(creditcard_sd)
  score_1.append(kmeans.inertia_)

# Plot against the actual cluster counts so the x axis reads 1..19 rather
# than the list positions 0..18.
plt.plot(range_values, score_1, 'bx-')
plt.title("Finding the right number of clusters")
plt.xlabel('Clusters')
plt.ylabel('Scores WCSS')
plt.show()

In [None]:
# Applying K-Means Method
num_clusters = 8
kmeans = KMeans(n_clusters = num_clusters)
kmeans.fit(creditcard_sd)
labels = kmeans.labels_

print(labels)

print(kmeans.cluster_centers_.shape)
# Pass the column Index itself; wrapping it in a list would build a
# multi-level column header instead of plain column names.
cluster_centers = pd.DataFrame(data = kmeans.cluster_centers_, columns = creditcard.columns)

# Concatenate the cluster labels to our original dataframe. The labels frame
# reuses creditcard's index so the rows line up whatever the index looks like.
creditcard_clust = pd.concat([creditcard, pd.DataFrame({'cluster': labels}, index = creditcard.index)], axis = 1)
print(creditcard_clust.head())

# For every feature: one figure with a histogram per cluster.
for i in creditcard.columns:
  plt.figure(figsize = (35,5))
  for j in range(num_clusters):
    plt.subplot(1, num_clusters, j+1)
    # The 'cluster' column lives on creditcard_clust, not on creditcard.
    cluster = creditcard_clust[creditcard_clust['cluster'] == j]
    cluster[i].hist(bins = 20)
    plt.title('{} \nCluster {}'.format(i, j))
  plt.show()

In [None]:
from sklearn.decomposition import PCA
# Obtain the principal components
pca = PCA(n_components = 2)
princ_comp = pca.fit_transform(creditcard_sd)
print(princ_comp)

# Create a dataframe with the two components
pca_df = pd.DataFrame(data = princ_comp, columns = ['pca1', 'pca2'])

# Concatenate the cluster labels to the dataframe
pca_df = pd.concat([pca_df, pd.DataFrame({'cluster': labels})], axis = 1)
print(pca_df.head())

# One colour per cluster: the list is cut to the number of distinct labels so
# its length always matches what seaborn needs.
cluster_colors = ['red', 'green', 'blue', 'gray', 'pink', 'yellow', 'black', 'purple']
cluster_colors = cluster_colors[:pca_df['cluster'].nunique()]

plt.figure(figsize = (10,10))
ax = sns.scatterplot(x = 'pca1', y = 'pca2', hue = 'cluster', data = pca_df, palette = cluster_colors)

In [None]:
# Apply Autoencoders (Perform Dimensionality Reduction using Autoencoders)
from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform
from keras.optimizers import SGD

# The input and output width follow the scaled data (one unit per feature).
n_features = creditcard_sd.shape[1]

# Encoder: n_features -> 7 -> 500 -> 500 -> 2000 -> 10 (bottleneck).
input_df = Input(shape = (n_features, ))
h = Dense(7, activation = 'relu')(input_df)
h = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(h)
h = Dense(500, activation = 'relu', kernel_initializer = 'glorot_uniform')(h)
h = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(h)

encoded = Dense(10, activation = 'relu', kernel_initializer = 'glorot_uniform')(h)

# Decoder: 10 -> 2000 -> 2000 -> n_features.
h = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(encoded)
h = Dense(2000, activation = 'relu', kernel_initializer = 'glorot_uniform')(h)

decoded = Dense(n_features, kernel_initializer = 'glorot_uniform')(h)

# Autoencoder maps input -> reconstruction; encoder maps input -> bottleneck.
autoencoder = Model(input_df, decoded)
encoder = Model(input_df, encoded)

autoencoder.compile(optimizer = 'adam', loss = 'mean_squared_error')
print(creditcard_sd.shape)

# The network is trained to reproduce its own input.
autoencoder.fit(creditcard_sd, creditcard_sd, batch_size = 128, epochs = 25, verbose = 1)
autoencoder.summary()

In [None]:
# Compressed (bottleneck) representation of every customer.
pred = encoder.predict(creditcard_sd)
print(pred, pred.shape)

# Optimal number of clusters on the encoded data (elbow method).
score_2 = []
range_values = range(1,20)

for i in range_values:
  kmeans = KMeans(n_clusters = i)
  kmeans.fit(pred)
  score_2.append(kmeans.inertia_)

plt.plot(range_values, score_2, 'bx-')
plt.title("Finding the right number of clusters")
plt.xlabel('Clusters')
plt.ylabel('Scores')
plt.show()

# Compare the elbow curve of the scaled data (score_1, red) with the one of
# the encoded data (score_2, green). The colour is given once, via `color`.
plt.plot(range_values, score_1, 'x-', color = 'r')
plt.plot(range_values, score_2, 'x-', color = 'g')
plt.show()

# K-Means on the encoded data.
kmeans = KMeans(n_clusters = 4)
kmeans.fit(pred)
labels = kmeans.labels_

df_cluster_dr = pd.concat([creditcard, pd.DataFrame({'cluster': labels}, index = creditcard.index)], axis = 1)
print(df_cluster_dr.head())

# PCA of the encoded data down to two components for plotting.
pca = PCA(n_components = 2)
prin_comp = pca.fit_transform(pred)
pca_df = pd.DataFrame(data = prin_comp, columns = ['pca1', 'pca2'])
pca_df = pd.concat([pca_df, pd.DataFrame({'cluster': labels})], axis = 1)
print(pca_df.head())

# One colour per cluster: the list is cut to the number of distinct labels so
# its length matches what seaborn needs.
cluster_colors = ['red', 'green', 'blue', 'gray', 'pink', 'yellow', 'black']
cluster_colors = cluster_colors[:pca_df['cluster'].nunique()]

plt.figure(figsize = (10,10))
ax = sns.scatterplot(x = 'pca1', y = 'pca2', hue = 'cluster', data = pca_df, palette = cluster_colors)