In [1]:
# Import the required libraries and dependencies
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the stock summary data, using the Ticker column as the row index
stocks_csv_path = "Resources/stock_data.csv"
df_stocks = pd.read_csv(stocks_csv_path, index_col="Ticker")

# Preview the first five rows
df_stocks.head(5)

Unnamed: 0_level_0,Company_Name,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,MeanPercentReturn,Sector
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,Alcoa,16.515833,16.95,16.039167,16.549167,142021800.0,0.181467,Industrials
AXP,American Express,44.6025,45.683333,43.615833,44.821667,38490820.0,0.401822,Financials
BA,Boeing,70.768333,72.375,69.235,71.258333,26141440.0,0.589636,Industrials
BAC,Bank of America,14.3025,14.66,13.8675,14.27,828393000.0,0.070159,Financials
CAT,Caterpillar,99.521667,101.933333,97.073333,100.364167,30933520.0,0.439549,Industrials


In [3]:
# Summarize each column's dtype and non-null count to see which columns are
# numeric (float64) and which are text (object)
df_stocks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, AA to XOM
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company_Name       30 non-null     object 
 1   MeanOpen           30 non-null     float64
 2   MeanHigh           30 non-null     float64
 3   MeanLow            30 non-null     float64
 4   MeanClose          30 non-null     float64
 5   MeanVolume         30 non-null     float64
 6   MeanPercentReturn  30 non-null     float64
 7   Sector             30 non-null     object 
dtypes: float64(6), object(2)
memory usage: 2.1+ KB


## Preprocess and Scale the Data

In [4]:
# Numeric feature columns to standardize (Company_Name and Sector are text
# columns and are left out)
stock_feature_columns = [
    "MeanOpen",
    "MeanHigh",
    "MeanLow",
    "MeanClose",
    "MeanVolume",
    "MeanPercentReturn",
]

# Fit a StandardScaler on those columns and transform them in one step;
# the result is a NumPy array, not a DataFrame
stock_scaler = StandardScaler()
stock_data_scaled = stock_scaler.fit_transform(df_stocks[stock_feature_columns])

# Display the first five rows of the scaled data
stock_data_scaled[:5]

array([[-1.15888638, -1.15321656, -1.15962398, -1.15341696,  0.10318422,
        -1.61091732],
       [-0.26115681, -0.25439297, -0.26573893, -0.25818533, -0.53919946,
        -0.91904186],
       [ 0.57517748,  0.58056409,  0.56469466,  0.57891575, -0.61582422,
        -0.32934221],
       [-1.2296308 , -1.22485134, -1.23001756, -1.2255854 ,  4.36194457,
        -1.96040239],
       [ 1.49421562,  1.50519497,  1.46706151,  1.50053439, -0.58609062,
        -0.80058637]])

In [5]:
# Wrap the scaled array in a DataFrame.
# The column names match those passed to the StandardScaler step, and the
# rows are labelled with the tickers from the original df_stocks index
# (index named "Ticker").
df_stocks_scaled = pd.DataFrame(
    stock_data_scaled,
    index=pd.Index(df_stocks.index, name="Ticker"),
    columns=[
        "MeanOpen",
        "MeanHigh",
        "MeanLow",
        "MeanClose",
        "MeanVolume",
        "MeanPercentReturn",
    ],
)

# Review the DataFrame
df_stocks_scaled.head()

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,MeanPercentReturn
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AA,-1.158886,-1.153217,-1.159624,-1.153417,0.103184,-1.610917
AXP,-0.261157,-0.254393,-0.265739,-0.258185,-0.539199,-0.919042
BA,0.575177,0.580564,0.564695,0.578916,-0.615824,-0.329342
BAC,-1.229631,-1.224851,-1.230018,-1.225585,4.361945,-1.960402
CAT,1.494216,1.505195,1.467062,1.500534,-0.586091,-0.800586


In [6]:
# One-hot encode the Sector column (one indicator column per sector).
# dtype=int pins the indicator values to 0/1 integers; without it, pandas 2.0+
# returns boolean (True/False) columns, so the result would depend on the
# installed pandas version.
sector_encoded_df = pd.get_dummies(df_stocks["Sector"], dtype=int)

# Review the DataFrame
sector_encoded_df.head()

Unnamed: 0_level_0,Communication,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,0,0,0,0,0,0,1,0
AXP,0,0,0,0,1,0,0,0
BA,0,0,0,0,0,0,1,0
BAC,0,0,0,0,1,0,0,0
CAT,0,0,0,0,0,0,1,0


In [7]:
# Concatenate the Sector encoded data with the scaled data DataFrame.
# This cell reassigns df_stocks_scaled, so any sector columns left over from a
# previous run of the cell are dropped first; re-running the cell therefore
# does not append a duplicate set of sector columns. On a first run nothing
# is dropped (errors="ignore") and the result is a plain column-wise concat.
df_stocks_scaled = pd.concat(
    [
        df_stocks_scaled.drop(columns=sector_encoded_df.columns, errors="ignore"),
        sector_encoded_df,
    ],
    axis=1,
)

# Display the concatenated DataFrame
df_stocks_scaled.head()

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,MeanPercentReturn,Communication,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AA,-1.158886,-1.153217,-1.159624,-1.153417,0.103184,-1.610917,0,0,0,0,0,0,1,0
AXP,-0.261157,-0.254393,-0.265739,-0.258185,-0.539199,-0.919042,0,0,0,0,1,0,0,0
BA,0.575177,0.580564,0.564695,0.578916,-0.615824,-0.329342,0,0,0,0,0,0,1,0
BAC,-1.229631,-1.224851,-1.230018,-1.225585,4.361945,-1.960402,0,0,0,0,1,0,0,0
CAT,1.494216,1.505195,1.467062,1.500534,-0.586091,-0.800586,0,0,0,0,0,0,1,0


In [8]:
# Build the K-Means estimator: three clusters, library-chosen number of
# initializations, and a fixed random_state so repeated runs use the same seed
model = KMeans(
    n_clusters=3,
    n_init='auto',
    random_state=1,
)

# Train on the scaled numeric features plus the encoded sector columns
model.fit(df_stocks_scaled)

In [9]:
# Use the fitted model to get a cluster label for every row of
# df_stocks_scaled (the same data the model was fitted on)
stock_clusters = model.predict(df_stocks_scaled)

# Print the array of cluster labels, one per stock in row order
print(stock_clusters)

[0 1 1 0 2 0 2 1 1 0 1 1 2 0 1 1 1 1 1 2 1 0 0 1 1 1 2 1 1 2]


In [10]:
# Attach the predicted cluster labels as a new StockCluster column.
# assign() returns a new DataFrame, so df_stocks_scaled itself is left unchanged.
df_stocks_scaled_predictions = df_stocks_scaled.assign(StockCluster=stock_clusters)

# Review the DataFrame
df_stocks_scaled_predictions

Unnamed: 0_level_0,MeanOpen,MeanHigh,MeanLow,MeanClose,MeanVolume,MeanPercentReturn,Communication,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology,StockCluster
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AA,-1.158886,-1.153217,-1.159624,-1.153417,0.103184,-1.610917,0,0,0,0,0,0,1,0,0
AXP,-0.261157,-0.254393,-0.265739,-0.258185,-0.539199,-0.919042,0,0,0,0,1,0,0,0,1
BA,0.575177,0.580564,0.564695,0.578916,-0.615824,-0.329342,0,0,0,0,0,0,1,0,1
BAC,-1.229631,-1.224851,-1.230018,-1.225585,4.361945,-1.960402,0,0,0,0,1,0,0,0,0
CAT,1.494216,1.505195,1.467062,1.500534,-0.586091,-0.800586,0,0,0,0,0,0,1,0,2
CSCO,-1.057671,-1.05773,-1.061867,-1.063041,1.379066,-1.20315,0,0,0,0,0,0,0,1,0
CVX,1.415534,1.427278,1.425949,1.433089,-0.522031,0.141288,0,0,0,1,0,0,0,0,2
DD,-0.025298,-0.021919,-0.032543,-0.017114,-0.578096,0.278991,0,0,0,0,0,0,1,0,1
DIS,-0.376383,-0.365547,-0.372005,-0.361675,-0.461793,0.847408,1,0,0,0,0,0,0,0,1
GE,-1.04675,-1.043496,-1.045849,-1.041325,1.094627,0.011832,0,0,0,0,0,0,1,0,0
