### Machine Learning Models

#### Changes in this version
* Removed the FD Growth Column as a FEATURE
* Increased TEST Size (357) to 40% from 30%

In [None]:
%matplotlib inline

import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Variables Definition


In [None]:
# Source workbook and worksheet that hold the data used for our Machine Learning
filename = "data/NEW-v6-ml-ELECTRONICS-Rev-Data-03-30-30.xlsx"
worksheet = "ELECTRONICS"

# Quantile cutoff (as a decimal) used when labeling the dataset.
# Example - 0.90 labels the TOP 10% of stocks as 1 (High Growth); 0.80 labels the TOP 20%.
label_selection = 0.80

In [None]:
# Load the configured worksheet into a DataFrame, then list its column headings
ml_data_df = pd.read_excel(filename, sheet_name=worksheet)

print("Column headings:", ml_data_df.columns, sep="\n")

In [None]:
# (rows, columns) of the freshly loaded worksheet
ml_data_df.shape

In [None]:
# Pairwise correlation of the numeric columns only.
# At this point the frame still carries descriptive columns (comp_name, sector_desc,
# ind_desc - presumably text). pandas >= 2.0 no longer drops non-numeric columns in
# DataFrame.corr() by default and raises on them, so select the numeric data explicitly.
ml_data_df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Preview the first rows to sanity-check the load
ml_data_df.head()

#### This section is for finding the relative Price performers and labeling the Data 

In [None]:
# SECTION to SELECT TOP Price performers in the training DataSet

# The last column of the frame holds the Price Change; keep the Series so that later
# cells can refer to the column by name (price_column.name).
# NOTE: run this before the hi_growth column is appended - afterwards the last column
# is hi_growth, not the Price Change.
price_column = ml_data_df.iloc[:, -1]

# Price Change value at the label_selection quantile: stocks at or above it are the
# top xx percent Stock Price performers for the year within the industry data.
top_price_perf_pct = price_column.quantile(label_selection)
print("We will label stocks as 1 that have at least " + str(top_price_perf_pct) + " % Price Change ")

#### Create and populate the hi_growth column for our ML Label

In [None]:
# Flag stocks whose Price Change reaches the cutoff as 1 (high growth), all others as 0
ml_data_df['hi_growth'] = (ml_data_df[price_column.name] >= top_price_perf_pct).astype(int)
ml_data_df.head()

#### Remove unneeded columns


In [None]:
# Drop the columns the model does not use. Positions refer to the frame as it stands
# now, i.e. with hi_growth appended as the last column:
#   1-7 : comp_name, sector_desc, ind_desc, mkt_val, prev_2_year, prev_year, curr_year
#   -2  : Price Change, already encoded in hi_growth (position -1 is hi_growth itself)
# To keep the annual earnings data columns, change 1,2,3,4,5,6,7,-2 below to 1,2,3,4,-2
# NOTE: positional drops make this cell non-idempotent - running it twice removes
# further columns. Re-run from the load cell instead of re-running this cell alone.

drop_cols = [1, 2, 3, 4, 5, 6, 7, -2]
ml_data_df = ml_data_df.drop(columns=ml_data_df.columns[drop_cols])
ml_data_df.head()

### VISUALIZE THE DATA

In [None]:
def plot_corr(df, size=8):
    '''
    Plot a graphical correlation matrix for each pair of numeric columns in the dataframe.

    Input:
        df: pandas DataFrame; non-numeric columns are left out of the matrix
        size: vertical and horizontal size of the plot

    Displays:
        matrix of correlation between columns, color coded by correlation value using
        matplotlib's default colormap (no colormap is set here).
        Expect the strongest color (correlation of 1) on the diagonal running from
        top left to bottom right.
    '''

    # Keep numeric data only: pandas >= 2.0 raises if corr() meets text columns
    # instead of silently dropping them.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)   # color code the rectangles by correlation value

    # Label both axes on the Axes object itself rather than through pyplot's
    # "current axes" state; x labels are rotated so long column names do not overlap.
    ticks = list(range(len(corr.columns)))
    ax.set_xticks(ticks)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(ticks)
    ax.set_yticklabels(corr.columns)

# Correlation matrix of the frame that remains after the column drop
plot_corr(ml_data_df)

In [None]:
# Pairwise plots of the remaining columns, colored by the hi_growth label.
# THIS IS GOING TO BE A VERY LARGE PLOT
sns.pairplot(data=ml_data_df, hue='hi_growth', palette='coolwarm')

## ML Section 1 - Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the 0/1 hi_growth flag
labels = ml_data_df.loc[:, 'hi_growth']

# Features: the columns at positions 1 and 2 of the trimmed frame.
# Below sets features to JUST Annual Simple Growth of Revenue
features = ml_data_df.iloc[:, 1:3]
# Below sets features to Annual FD Growth + Annual Simple Growth of Revenue
# features = ml_data_df.iloc[:,1:3]
# NOTE(review): the alternative above is the very same slice (1:3) as the active line -
# confirm which column positions each feature set is supposed to cover.

features.head()

In [None]:
# Hold out 40% of the rows for testing.
# random_state pins the shuffle so a Restart & Run All reproduces the same split and
# therefore the same metrics. stratify keeps the share of hi_growth = 1 rows equal in
# train and test; only the top quantile of stocks carries the positive label, so an
# unlucky random split could otherwise leave one side short of positives.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.40, random_state=42, stratify=labels
)

### ML Section 2 - Instantiate KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline classifier with K=1; a better K is searched for further down the notebook
knn = KNeighborsClassifier(n_neighbors=1)

Fit this KNN model to the training data.

In [None]:
# Train on the training split only; the test split stays unseen until evaluation
knn.fit(X_train,y_train)

### ML Section 3 - Model Evaluation

In [None]:
# Predicted hi_growth labels for the held-out test rows
pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Confusion matrix of true vs. predicted labels on the test split
print(confusion_matrix(y_test,pred))

In [None]:
# Text summary of the classification metrics for the same test-split predictions
print(classification_report(y_test,pred))

### Choose a Better K Value

In [None]:
# Imported here, next to its only use, like the other scikit-learn imports in this notebook
from sklearn.model_selection import cross_val_score

error_rate = []

# Score every candidate K with 5-fold cross-validation on the TRAINING split only.
# Choosing K from the test-set error would leak the test set into model selection and
# make the test metrics reported afterwards look better than they really are.
# error_rate[i - 1] holds the mean cross-validated error (1 - accuracy) for K = i.
# Will take some time
for i in range(1, 40):

    candidate = KNeighborsClassifier(n_neighbors=i)
    fold_accuracy = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    error_rate.append(1 - fold_accuracy.mean())

In [None]:
# The K axis is derived from error_rate itself (its first entry belongs to K=1), so the
# plot cannot fall out of step with the upper bound used by the search loop.
k_values = range(1, len(error_rate) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

In [None]:
# Final model with the chosen K.
# best_k is the single source of truth: the banner printed below is built from it, so
# the text can never disagree with the model that is actually fitted.
# NOTE(review): 26 is the value the model was fitted with - confirm it is the K picked
# from the error-rate plot.
best_k = 26
# weights='distance' weights each neighbor's vote by the inverse of its distance
knn = KNeighborsClassifier(n_neighbors=best_k, weights='distance')

knn.fit(X_train,y_train)
pred = knn.predict(X_test)

print('WITH K=' + str(best_k))
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))