# Stock market clustering using cuML and K-Means



Here is the [dataset](https://www.kaggle.com/rohitjain454/all-stocks-5yr)

In [118]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [119]:
# Install the RAPIDS 21.06 environment shipped as a Kaggle input dataset.
import sys
# Copy the archive next to the other conda environments and unpack it there
# (tar's verbose file listing is discarded).
!cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment's directories to sys.path so they are
# searched before the paths already on it.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into the base conda lib directory
# (presumably so the shared library can be found at load time — TODO confirm).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [6]:
import cuml
import cudf

In [7]:
# Both import methods supported
from cuml import KMeans
from cuml.cluster import KMeans



In [8]:
import numpy as np
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
from pandas import read_csv

In [9]:
# Location of the five-year stock CSV inside the Kaggle input directory.
filename = '../input/all-stocks-5yr/all_stocks_5yr.csv'

# Load it into a pandas DataFrame and preview the first rows.
stock = pd.read_csv(filename)
print("***Structure of data with all its features***")
stock.head()

In [10]:

# Load the same CSV again, this time with cuDF instead of pandas.
df = cudf.read_csv(filename)


In [11]:
# Preview the last three rows of the cuDF frame.
df.tail(3)

In [12]:
# Count missing values per column before cleaning.
df.isnull().sum()

In [13]:
# Drop rows that contain missing values; `df` is rebound to the cleaned frame.
df = df.dropna()

In [14]:
# Re-count missing values per column after dropna (expected to be all zeros).
df.isnull().sum()

There is a positive correlation between the opening and closing prices.

In [117]:
# Scatter plot of open vs. close. The cuDF frame is first copied from GPU
# memory to a host-side pandas frame, which provides the plotting API.
host_df = df.to_pandas()
host_df.plot(kind='scatter', x='open', y='close', cmap='rainbow', sharex=True)

In [98]:
# Drop the 'date' and 'Name' columns; the remaining columns are the features
# that get scaled and clustered in the following cells.
x_train = df.drop(['date', 'Name'], axis = 1)

In [99]:
# Standardise the feature columns before PCA and clustering.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Convert the cuDF frame to a GPU matrix, then fit the scaler and transform in one step.
feature_matrix = x_train.as_gpu_matrix()
df3 = sc.fit_transform(feature_matrix)

In [100]:
# Reduce the scaled features to two principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
projected = pca.fit_transform(df3)

# Show the data shape before and after the reduction.
print(df3.shape)
print(projected.shape)

Find k using the elbow method.

In [101]:
def kmean_score(nclust):
    """Fit K-Means on the global ``projected`` data and return its negated score.

    Parameters
    ----------
    nclust : int
        Number of clusters passed to ``KMeans(n_clusters=...)``.

    Returns
    -------
    The negation of ``KMeans.score(projected)`` for the fitted model. A later
    cell plots this value as "RSS" against K for the elbow method.
    """
    model = KMeans(n_clusters=nclust)
    model.fit(projected)
    return -model.score(projected)

In [91]:
# Evaluate the elbow score for K = 1 .. 7 (one K-Means fit per value).
scores = list(map(kmean_score, range(1, 8)))

In [92]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(range(1, 8), scores)
plt.xlabel('K')
plt.ylabel('RSS')
plt.title('RSS versus K')

In [108]:
# Final model: K-Means with 3 clusters, fitted on the 2-D PCA projection.
from cuml.cluster import KMeans as KMeans

kmeans = KMeans(
    n_clusters=3,
    max_iter=300,
    init='k-means||',
)

# Fit the model and keep the predicted cluster labels for `projected`.
kme = kmeans.fit_predict(projected)


In [109]:
# Plot the clusters: scatter of the 2-D PCA projection, coloured by K-Means label.

fig, ax = plt.subplots(figsize=(8, 6))

# NOTE(review): the colormap has 5 levels and the colorbar below lists ticks
# 0-4, while the model is fitted with n_clusters=3 — confirm this is intended.
points = ax.scatter(projected[:, 0], projected[:, 1],
                    c=kme,
                    edgecolor="none",
                    cmap=plt.cm.get_cmap("Spectral_r", 5),
                    alpha=0.5)

# Hide all four spines for a frameless look.
for side in ("top", "right", "bottom", "left"):
    ax.spines[side].set_visible(False)

ax.tick_params(axis="both", labelsize=12)

ax.set_xlabel("Component 1", size=14, labelpad=10)
ax.set_ylabel("Component 2", size=14, labelpad=10)

ax.set_title('clusters', size=16)

fig.colorbar(points, ax=ax, ticks=[0, 1, 2, 3, 4])

# Save BEFORE showing: with the inline backend plt.show() finalises the
# current figure, so a pyplot-level savefig issued afterwards writes an empty
# image. Saving through `fig` also guarantees the intended figure is written.
fig.savefig('cluster.png')
plt.show()

Add the K-Means cluster labels to the dataframe.

In [114]:
# Work on a copy so the cleaned frame `df` is not modified.
results_df = df.copy()

# Attach the cluster label of the fitted model as a new column.
# assumes kmeans.labels_ has one entry per row of `df`, in the same row order — TODO confirm
results_df['predicted'] = kmeans.labels_

# Preview the last 1005 rows together with their cluster label.
results_df.tail(1005)

In [115]:
# Number of rows assigned to each cluster.
results_df['predicted'].value_counts()

In [116]:
# Evaluation metric.
#
# No ground-truth labels are used anywhere in this notebook, so an external
# index such as the adjusted Rand score cannot be computed meaningfully:
# results_df['predicted'] is a copy of kmeans.labels_, and comparing the
# labels with themselves always yields 1.0. An internal index is used instead.
# The Davies-Bouldin index needs only the features and the assigned labels;
# lower values indicate more compact, better-separated clusters.

from cuml.metrics import adjusted_rand_score  # kept in scope; not used below
from sklearn.metrics import davies_bouldin_score

# assumes `projected` and `kmeans.labels_` are host (NumPy) arrays, since
# `projected` is produced by scikit-learn's PCA — TODO confirm
score = davies_bouldin_score(projected, kmeans.labels_)


score

In [120]:
# Show the 'Name' column next to the assigned cluster label.
results_df[['Name', 'predicted']]