In [1]:
%load_ext nb_black
from collections import Counter
from umap import UMAP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from factor_analyzer import FactorAnalyzer
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

In [2]:
# useful functions
def pca_loadings(pca):
    return pca.components_.T * np.sqrt(pca.explained_variance_)


def plot_dendrogram(model, **kwargs):
    """
    A function for plotting a dendrogram. Sourced from the following link:
    https://github.com/scikit-learn/scikit-learn/blob/70cf4a676caa2d2dad2e3f6e4478d64bcb0506f7/examples/cluster/plot_hierarchical_clustering_dendrogram.py
    
    Parameters:
        model (object of class sklearn.cluster.hierarchical.AgglomerativeClustering): a fitted scikit-learn hierarchical clustering model.
    
    Output: a dendrogram based on the model based in the parameters.
    
    Returns: None   
    """
    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0] + 2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(
        float
    )

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


def hide_current_axis(*args, **kwds):
    plt.gca().set_visible(False)

<IPython.core.display.Javascript object>

In [3]:
df = pd.read_csv(r"C:\Users\jonat\Desktop\Stocks\prices.csv")
df.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05 00:00:00,WLTW,123.43,125.839996,122.309998,126.25,2163600.0
1,2016-01-06 00:00:00,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07 00:00:00,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0
3,2016-01-08 00:00:00,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0
4,2016-01-11 00:00:00,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0


<IPython.core.display.Javascript object>

In [4]:
# That is pretty... pretty cluttered. We should prob try to group stocks
# px.scatter(x="date", y="open", color="symbol", data_frame=df)

<IPython.core.display.Javascript object>

In [5]:
# Let's make date date-time and separate based on month, day, and year.
df["date"] = df["date"].str.replace("00:00:00", "")
df["date"] = pd.to_datetime(df["date"])
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["day"] = df["date"].dt.day
# now we drop date since it's kinda useless to us
df = df.drop(columns="date")

<IPython.core.display.Javascript object>

In [6]:
df

Unnamed: 0,symbol,open,close,low,high,volume,year,month,day
0,WLTW,123.430000,125.839996,122.309998,126.250000,2163600.0,2016,1,5
1,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0,2016,1,6
2,WLTW,116.379997,114.949997,114.930000,119.739998,2489500.0,2016,1,7
3,WLTW,115.480003,116.620003,113.500000,117.440002,2006300.0,2016,1,8
4,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0,2016,1,11
...,...,...,...,...,...,...,...,...,...
851259,ZBH,103.309998,103.199997,102.849998,103.930000,973800.0,2016,12,30
851260,ZION,43.070000,43.040001,42.689999,43.310001,1938100.0,2016,12,30
851261,ZTS,53.639999,53.529999,53.270000,53.740002,1701200.0,2016,12,30
851262,AIV,44.730000,45.450001,44.410000,45.590000,1380900.0,2016,12,30


<IPython.core.display.Javascript object>

In [7]:
# I don't want the date to be an object. Datetime is easier to work with for groupings
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851264 entries, 0 to 851263
Data columns (total 9 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   symbol  851264 non-null  object 
 1   open    851264 non-null  float64
 2   close   851264 non-null  float64
 3   low     851264 non-null  float64
 4   high    851264 non-null  float64
 5   volume  851264 non-null  float64
 6   year    851264 non-null  int64  
 7   month   851264 non-null  int64  
 8   day     851264 non-null  int64  
dtypes: float64(5), int64(3), object(1)
memory usage: 58.5+ MB


<IPython.core.display.Javascript object>

In [8]:
# Are there any NA?
df.isna().mean()

symbol    0.0
open      0.0
close     0.0
low       0.0
high      0.0
volume    0.0
year      0.0
month     0.0
day       0.0
dtype: float64

<IPython.core.display.Javascript object>

In [9]:
low_stocks = df.symbol.value_counts()
Counter(low_stocks)

Counter({1762: 467,
         1761: 1,
         1683: 1,
         1540: 1,
         1008: 13,
         987: 1,
         896: 1,
         894: 1,
         892: 2,
         853: 1,
         786: 1,
         504: 4,
         385: 1,
         378: 2,
         304: 1,
         284: 1,
         251: 1,
         126: 1})

<IPython.core.display.Javascript object>

In [10]:
# I want to see how many full counts we have. Will look at full then set everything to minimum dates.
# For now, dropping those who don't have 1762
count = 0
for i in low_stocks:
    if i == 1762:
        count += 1
    else:
        count = count

print(count)

467


<IPython.core.display.Javascript object>

In [11]:
# keeping CHTR because it's only missing one
to_drop = low_stocks.tail(34)
pls_work = to_drop.reset_index()
drop = pls_work["index"].values
df["symbol"] = df["symbol"].replace(drop, "NaN")
df["symbol"]

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
851259     ZBH
851260    ZION
851261     NaN
851262     AIV
851263     NaN
Name: symbol, Length: 851264, dtype: object

<IPython.core.display.Javascript object>

In [12]:
df_full = df[~df["symbol"].str.contains("NaN")]
df_full["symbol"].value_counts()

APD     1762
D       1762
ROP     1762
FLS     1762
AON     1762
        ... 
SRCL    1762
RTN     1762
MSI     1762
GRMN    1762
LB      1762
Name: symbol, Length: 467, dtype: int64

<IPython.core.display.Javascript object>

In [13]:
df_full

Unnamed: 0,symbol,open,close,low,high,volume,year,month,day
251,A,31.389999,31.300001,31.130000,31.630001,3815500.0,2010,1,4
252,AAL,4.840000,4.770000,4.660000,4.940000,9837300.0,2010,1,4
253,AAP,40.700001,40.380001,40.360001,41.040001,1701700.0,2010,1,4
254,AAPL,213.429998,214.009998,212.380001,214.499996,123432400.0,2010,1,4
255,ABC,26.290001,26.629999,26.139999,26.690001,2455900.0,2010,1,4
...,...,...,...,...,...,...,...,...,...
851257,YHOO,38.720001,38.669998,38.430000,39.000000,6431600.0,2016,12,30
851258,YUM,63.930000,63.330002,63.160000,63.939999,1887100.0,2016,12,30
851259,ZBH,103.309998,103.199997,102.849998,103.930000,973800.0,2016,12,30
851260,ZION,43.070000,43.040001,42.689999,43.310001,1938100.0,2016,12,30


<IPython.core.display.Javascript object>

In [15]:
df_full.columns

Index(['symbol', 'open', 'close', 'low', 'high', 'volume', 'year', 'month',
       'day'],
      dtype='object')

<IPython.core.display.Javascript object>

In [16]:
# df_full["label"] = number["label"]
# px.scatter(x="date", y="open", color="label", data_frame=df_full)

<IPython.core.display.Javascript object>

In [17]:
df_full["co_dif"] = df_full["close"] - df_full["open"]
df_full["hl_dif"] = df_full["high"] - df_full["low"]
df_full = df_full.reset_index().drop(columns="index")
# df_full

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<IPython.core.display.Javascript object>

In [18]:
# There's decent relation between open, close, low, and high. Not too big of a shock
number = df_full.drop(columns="symbol")
# sns.pairplot(number)

<IPython.core.display.Javascript object>

In [19]:
# number

<IPython.core.display.Javascript object>

# We should probably get some more features based on amount change and change per day. These could be valuable to traders. If a stock consistantly has a big high low split, it'd be a good stock to day-trade/scalp. If the open is usually bigger than close, it'd be a good stock to hold longterm, etc.

In [20]:
scaler = StandardScaler()
scaled = scaler.fit_transform(number)
k_means = KMeans(n_clusters=6, random_state=69)
k_means.fit(scaled)
y_pred = k_means.predict(scaled)

<IPython.core.display.Javascript object>

In [21]:
columns = number.columns
centroids = scaler.inverse_transform(k_means.cluster_centers_)
centroids_df = pd.DataFrame(centroids, columns=number.columns)
centroids_df

Unnamed: 0,open,close,low,high,volume,year,month,day,co_dif,hl_dif
0,47.343245,47.364891,46.848297,47.838414,5152452.0,2011.618664,9.673676,15.79021,0.021646,0.990117
1,668.249481,668.119931,661.466763,674.545982,7626819.0,2013.678208,6.592173,15.732347,-0.12955,13.079219
2,47.091435,47.113269,46.613983,47.565536,5504463.0,2011.687505,3.335649,15.517443,0.021833,0.951553
3,57.129551,57.072445,56.228476,57.897231,117790900.0,2011.956101,6.237213,15.835481,-0.057106,1.668756
4,66.43791,66.454266,65.829629,67.038986,4070163.0,2015.030012,6.622313,15.793045,0.016357,1.209357
5,196.773724,196.836492,194.58639,198.931015,2599974.0,2013.978857,6.749978,15.667754,0.062769,4.344625


<IPython.core.display.Javascript object>

In [22]:
number["label"] = k_means.labels_
# sns.pairplot(number, hue="label")
# plt.show()

<IPython.core.display.Javascript object>

In [23]:
k_means_4 = KMeans(n_clusters=4, random_state=69)
k_means_4.fit(scaled)
y_pred = k_means_4.predict(scaled)
centroids_4 = scaler.inverse_transform(k_means_4.cluster_centers_)
number["label4"] = k_means_4.labels_

<IPython.core.display.Javascript object>

In [24]:
# # Now let's look at 4 to see if those two need to be in there
number4 = number.drop(columns="label")
number4.columns
# sns.pairplot(number4, hue="label4").map_upper(hide_current_axis)
# plt.show()

Index(['open', 'close', 'low', 'high', 'volume', 'year', 'month', 'day',
       'co_dif', 'hl_dif', 'label4'],
      dtype='object')

<IPython.core.display.Javascript object>

In [25]:
sampled = number.copy()
sample = sampled.sample(n=12345, random_state=69)
sample

Unnamed: 0,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label,label4
356109,22.440001,22.360001,22.180000,22.719999,1201400.0,2013,1,14,-0.080000,0.539999,2,1
801647,60.529999,60.549999,60.110001,60.759998,1552000.0,2016,10,26,0.020000,0.649997,4,0
600505,40.709999,40.230000,40.049999,40.880001,2928400.0,2015,2,11,-0.479999,0.830002,4,0
506341,132.529999,128.699997,128.020004,133.080002,1224500.0,2014,4,25,-3.830002,5.059998,5,3
663406,45.720001,43.150002,43.009998,45.720001,4786500.0,2015,8,25,-2.569999,2.710003,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
385512,18.240000,18.139999,17.910000,18.340000,5709500.0,2013,4,16,-0.100001,0.430000,2,1
343432,12.220000,12.220000,12.100000,12.380000,19057200.0,2012,12,4,0.000000,0.280000,0,1
549130,54.560001,54.160000,54.119999,54.680000,473400.0,2014,9,4,-0.400001,0.560001,4,0
752250,62.849998,62.669998,62.369999,63.360001,978500.0,2016,5,26,-0.180000,0.990002,4,0


<IPython.core.display.Javascript object>

In [26]:
# Trying hierarchical for comparison
dist_mat = squareform(pdist(sample, metric="euclid"))
clst = AgglomerativeClustering(n_clusters=5, linkage="ward", affinity="euclidean")
clst.fit(sample)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=5)

<IPython.core.display.Javascript object>

In [27]:
# That's pretty useless imo
# plt.figure(figsize=(20, 10))
# plot_dendrogram(clst)

<IPython.core.display.Javascript object>

In [28]:
# Well... These clusters seem like they're not grouped well. Let's check value counts
sample["label"] = clst.labels_
sample.groupby("label").mean()

Unnamed: 0_level_0,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label4
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,80.959999,80.327647,79.824705,81.830588,217605500.0,2012.0,5.294118,13.705882,-0.632352,2.005883,1.235294
1,85.68627,85.904206,84.911667,86.742699,76027100.0,2012.047619,6.753968,15.674603,0.217936,1.831031,1.071429
2,52.747389,52.75488,52.098039,53.373354,7984637.0,2012.75602,6.370177,15.612997,0.007491,1.275315,0.667537
3,37.781529,37.746588,37.214682,38.302164,29393420.0,2012.536471,5.962353,15.764706,-0.034941,1.087482,0.748235
4,79.523441,79.55833,78.782334,80.257003,1787606.0,2013.106723,6.608163,15.961945,0.034888,1.474669,0.720408


<IPython.core.display.Javascript object>

In [29]:
sample["label"].value_counts()

4    8330
2    3447
3     425
1     126
0      17
Name: label, dtype: int64

<IPython.core.display.Javascript object>

In [30]:
number["label"].value_counts()

4    283830
2    235817
0    232880
5     55816
1      9531
3      4980
Name: label, dtype: int64

<IPython.core.display.Javascript object>

In [31]:
number["label4"].value_counts()

1    384719
0    368167
3     60195
2      9773
Name: label4, dtype: int64

<IPython.core.display.Javascript object>

In [32]:
sample.groupby("label").mean()

Unnamed: 0_level_0,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label4
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,80.959999,80.327647,79.824705,81.830588,217605500.0,2012.0,5.294118,13.705882,-0.632352,2.005883,1.235294
1,85.68627,85.904206,84.911667,86.742699,76027100.0,2012.047619,6.753968,15.674603,0.217936,1.831031,1.071429
2,52.747389,52.75488,52.098039,53.373354,7984637.0,2012.75602,6.370177,15.612997,0.007491,1.275315,0.667537
3,37.781529,37.746588,37.214682,38.302164,29393420.0,2012.536471,5.962353,15.764706,-0.034941,1.087482,0.748235
4,79.523441,79.55833,78.782334,80.257003,1787606.0,2013.106723,6.608163,15.961945,0.034888,1.474669,0.720408


<IPython.core.display.Javascript object>

In [33]:
number.groupby("label").mean()

Unnamed: 0_level_0,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label4
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,47.345127,47.366777,46.850179,47.840323,5150625.0,2011.618769,9.673742,15.789926,0.021649,0.990144,0.812715
1,668.249481,668.119931,661.466763,674.545982,7626819.0,2013.678208,6.592173,15.732347,-0.12955,13.079219,2.0
2,47.092356,47.114194,46.614895,47.566464,5503233.0,2011.68752,3.335667,15.517456,0.021838,0.951569,0.835232
3,57.027689,56.969843,56.127177,57.793777,117635300.0,2011.956024,6.237349,15.830321,-0.057845,1.6666,1.09759
4,66.436585,66.45296,65.828334,67.037649,4069846.0,2015.030067,6.622119,15.793341,0.016375,1.209315,0.023838
5,196.768452,196.831169,194.581168,198.925613,2599921.0,2013.978949,6.750233,15.667819,0.062717,4.344445,2.997868


<IPython.core.display.Javascript object>

In [34]:
number.groupby("label4").mean()

Unnamed: 0_level_0,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label
label4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,64.314257,64.335741,63.737975,64.891458,4162396.0,2014.666708,6.650889,15.788593,0.021483,1.153483,3.283654
1,43.875131,43.892635,43.396406,44.345747,6771744.0,2011.246286,6.421045,15.631799,0.017504,0.94934,1.041196
2,662.496977,662.35058,655.709169,668.776824,10059700.0,2013.659368,6.575668,15.738156,-0.146396,13.067655,1.073058
3,192.556726,192.616602,190.413142,194.66689,3655142.0,2013.922834,6.752172,15.646798,0.059876,4.253748,4.826265


<IPython.core.display.Javascript object>

# For clustering, PCA seems to be much much better for clustering and having groups

# Now let's check dimension reduction techniques to see if they are helpful at all for better distinguishing clusters

In [35]:
pca = PCA(n_components=6)
pca_df_full = pca.fit_transform(scaled)
loadings = pca_loadings(pca)
loadings = pd.DataFrame(loadings)
loadings.index = columns
loadings.columns = [f"component_{i}" for i in range(loadings.shape[1])]

<IPython.core.display.Javascript object>

In [36]:
pca5 = PCA(n_components=5)
pca5_df = pca5.fit_transform(scaled)
loadings5 = pca_loadings(pca5)
loadings5 = pd.DataFrame(loadings5)
loadings5.index = columns
loadings5.columns = [f"component_{i}" for i in range(loadings5.shape[1])]

<IPython.core.display.Javascript object>

In [37]:
# Interesting. Stocks seem to be more expensive at vertain parts of the year. The month can also cheapen a stock
loadings

Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5
open,0.993322,0.018758,-0.000954,0.002098,-0.012343,-0.022728
close,0.993163,0.015289,-0.013385,0.000648,-0.023737,-0.021566
low,0.99243,0.015509,-0.007507,0.001136,-0.018557,-0.022027
high,0.993982,0.018206,-0.007092,0.001535,-0.018021,-0.022185
volume,-0.066847,0.781176,-0.036316,0.021095,-0.121912,0.605955
year,0.206211,-0.588689,-0.00253,-0.165417,0.272198,0.713057
month,0.018161,-0.21804,0.386848,0.76991,-0.429552,0.158582
day,0.00017,-0.057197,0.585912,-0.610525,-0.52943,0.017614
co_dif,-0.01376,-0.200327,-0.7176,-0.083681,-0.657671,0.067185
hl_dif,0.852792,0.126082,0.011532,0.017719,0.007887,-0.02414


<IPython.core.display.Javascript object>

In [38]:
loadings5

Unnamed: 0,component_0,component_1,component_2,component_3,component_4
open,0.993322,0.018758,-0.000954,0.002098,-0.012343
close,0.993163,0.015289,-0.013385,0.000648,-0.023737
low,0.99243,0.015509,-0.007507,0.001136,-0.018557
high,0.993982,0.018206,-0.007092,0.001535,-0.018021
volume,-0.066847,0.781176,-0.036316,0.021095,-0.121912
year,0.206211,-0.588689,-0.00253,-0.165417,0.272198
month,0.018161,-0.21804,0.386848,0.76991,-0.429552
day,0.00017,-0.057197,0.585912,-0.610525,-0.52943
co_dif,-0.01376,-0.200327,-0.7176,-0.083681,-0.657671
hl_dif,0.852792,0.126082,0.011532,0.017719,0.007887


<IPython.core.display.Javascript object>

# Let's check on UMAP to see if it does anything PCA doesn't

In [39]:
# Let's see if another method would be better for dimensionality reduction. UMAP TIME!
# if i put in all of scaled, it can't deal with it. Time to random sample
umap = UMAP(n_neighbors=6, min_dist=0.3, metric="euclidean")
feeling_like_will = umap.fit_transform(sample)

# feeling_like_will



<IPython.core.display.Javascript object>

In [40]:
feeling_like_will

array([[ 7.3293424 , 10.837978  ],
       [16.422075  , 10.868607  ],
       [ 0.84369963, 10.012195  ],
       ...,
       [ 8.348716  ,  3.730916  ],
       [-7.9088182 ,  7.6053987 ],
       [10.599534  , -2.4902887 ]], dtype=float32)

<IPython.core.display.Javascript object>

In [41]:
umap_df = pd.DataFrame(feeling_like_will, index=sample.index)
# px.scatter(umap_df, x=feeling_like_will[:, 0], y=feeling_like_will[:, 1])

<IPython.core.display.Javascript object>

In [42]:
umap = UMAP(n_neighbors=6, min_dist=0.6, metric="euclidean")
feeling_like_will = umap.fit_transform(sample)
umap_df = pd.DataFrame(feeling_like_will, index=sample.index)
# px.scatter(umap_df, x=feeling_like_will[:, 0], y=feeling_like_will[:, 1])



<IPython.core.display.Javascript object>

In [43]:
umap = UMAP(n_neighbors=5, min_dist=0.6, metric="euclidean")
feeling_like_will = umap.fit_transform(sample)
umap_df = pd.DataFrame(feeling_like_will, index=sample.index)
# px.scatter(umap_df, x=feeling_like_will[:, 0], y=feeling_like_will[:, 1])



<IPython.core.display.Javascript object>

# These don't really look like they add much information so I'm just going to look at the clustered data. It seems like 6 neighbors has better information for stock traders as I will explain below. As a reminder, here is the mean of that group and the scatterplot

In [44]:
number["label"].value_counts()

4    283830
2    235817
0    232880
5     55816
1      9531
3      4980
Name: label, dtype: int64

<IPython.core.display.Javascript object>

In [45]:
number.groupby("label").mean()

Unnamed: 0_level_0,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label4
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,47.345127,47.366777,46.850179,47.840323,5150625.0,2011.618769,9.673742,15.789926,0.021649,0.990144,0.812715
1,668.249481,668.119931,661.466763,674.545982,7626819.0,2013.678208,6.592173,15.732347,-0.12955,13.079219,2.0
2,47.092356,47.114194,46.614895,47.566464,5503233.0,2011.68752,3.335667,15.517456,0.021838,0.951569,0.835232
3,57.027689,56.969843,56.127177,57.793777,117635300.0,2011.956024,6.237349,15.830321,-0.057845,1.6666,1.09759
4,66.436585,66.45296,65.828334,67.037649,4069846.0,2015.030067,6.622119,15.793341,0.016375,1.209315,0.023838
5,196.768452,196.831169,194.581168,198.925613,2599921.0,2013.978949,6.750233,15.667819,0.062717,4.344445,2.997868


<IPython.core.display.Javascript object>

In [46]:
number.columns

Index(['open', 'close', 'low', 'high', 'volume', 'year', 'month', 'day',
       'co_dif', 'hl_dif', 'label', 'label4'],
      dtype='object')

<IPython.core.display.Javascript object>

In [47]:
# number = number.drop(columns="label4")
# sns.pairplot(number, hue="label")
# plt.show()

<IPython.core.display.Javascript object>

In [48]:
df_full["label"] = number["label"]

<IPython.core.display.Javascript object>

In [49]:
df_full[df_full["label"] == 1]

Unnamed: 0,symbol,open,close,low,high,volume,year,month,day,co_dif,hl_dif,label
188,GOOG,626.951088,626.751061,624.241073,629.511067,3927000.0,2010,1,4,-0.200027,5.269994,1
189,GOOGL,626.950006,626.750011,624.240011,629.510005,3908400.0,2010,1,4,-0.199995,5.269994,1
655,GOOG,627.181073,623.991055,621.541045,627.841071,6031900.0,2010,1,5,-3.190018,6.300026,1
656,GOOGL,627.180001,623.990017,621.540016,627.839984,6003300.0,2010,1,5,-3.189984,6.299968,1
1122,GOOG,625.861078,608.261023,606.361042,625.861078,7987100.0,2010,1,6,-17.600055,19.500036,1
...,...,...,...,...,...,...,...,...,...,...,...,...
822436,AZO,800.349976,789.789978,787.190002,800.500000,246400.0,2016,12,30,-10.559998,13.309998,1
822575,GOOG,782.750000,771.820007,770.409973,782.780029,1760200.0,2016,12,30,-10.929993,12.370056,1
822576,GOOGL,803.210022,792.450012,789.619995,803.289978,1728300.0,2016,12,30,-10.760010,13.669983,1
822617,ISRG,638.320007,634.169983,632.250000,638.390015,267300.0,2016,12,30,-4.150024,6.140015,1


<IPython.core.display.Javascript object>

# Well some conclusions based on the above data:

# General: Groups 0 , and 2 seem to be similar in terms of open, close, low and high but differ on the difference in those. Ideally, it seems group 3 would be the one to day trade among all of these


- open: Group 3 seems to be expensive stocks and group 5 are cheaper
- close: look below to co_dif
- low: not too different from open
- high: look below to hl_dif
- volume: This is interesting, it seems most trading is done on expensive stocks(group 3) by far. Looking to hl_dif might explain
- year: This column has no real meaningful information. Could combine with GDP in second or third passthrough to see
- month: this column also has no real meaningful information. 0 and 2 seem to be early and later in the year
- day: this column also has no real meaningful information due to similarity
- hl_dif: This is interesting. The biggest difference on group 3 which could explain the reason for trading volume. Given the open amount of the stock, it makes sense there's a bigger dollar value would have a bigger high/low split. The second highest doesn't seem to be as expensive though. Depending on the trading fees, those would each be interesting to delve further into. 



In [50]:
# format is open close low high volume year month day co_dif hl_dif
chegg = [[62.34, 62.93, 61.67, 63.34, 2725077, 2020, 5, 20, -0.59, 1.67]]
apple = [[316.68, 319.16, 316.2, 319.52, 50980000, 2020, 5, 20, 2.48, 3.32]]
cars = [[1116.37, 1114.49, 1105.02, 1142.99, 289208, 2020, 5, 20, -1.88, 37.97]]
search = [[1389.16, 1409.18, 1365, 1420, 1864967, 2020, 5, 20, 20.02, 55]]
clust_1 = [
    [
        1483.489990,
        1466.060059,
        1462.339966,
        1483.489990,
        405100,
        2016,
        12,
        30,
        -17.429931,
        21.150024,
    ]
]
thinkful = k_means.predict(scaler.transform(chegg))
pomme = k_means.predict(scaler.transform(apple))
auto_zone = k_means.predict(scaler.transform(cars))
google = k_means.predict(scaler.transform(search))
we_know = k_means.predict(scaler.transform(clust_1))

<IPython.core.display.Javascript object>

In [51]:
print(thinkful)

[4]


<IPython.core.display.Javascript object>

In [52]:
print(pomme)

[5]


<IPython.core.display.Javascript object>

In [53]:
print(auto_zone)

[1]


<IPython.core.display.Javascript object>

In [54]:
print(google)

[1]


<IPython.core.display.Javascript object>

In [55]:
# making sure that the predictor is working
print(we_know)

[1]


<IPython.core.display.Javascript object>