In [3]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style='darkgrid')

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# to scale the data using z-score
from sklearn.preprocessing import StandardScaler

# to compute distances
from scipy.spatial.distance import cdist, pdist

# to perform k-means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# to visualize the elbow curve and silhouette scores
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

# to suppress warnings
import warnings
warnings.filterwarnings("ignore")
# ----------------------------------------------------------------------------------------------
# ChatGPT3 recommended libraries:
from scipy.spatial.distance import pdist, squareform

# https://docs.xlwings.org/en/latest/jupyternotebooks.html
from xlwings import view
from xlwings import load
# ----------------------------------------------------------------------------------------------
# Read the stock dataset from '../data/stock_data.csv' into a DataFrame
data = pd.read_csv('../data/stock_data.csv')
# ----------------------------------------------------------------------------------------------
# Names of the columns whose dtype is 'object'
object_list = [col for col in data.columns if data[col].dtype == 'object']
object_list
# ----------------------------------------------------------------------------------------------
# Keep `data` as loaded and do all further work on a separate copy
df = data.copy()

In [4]:
# function to create labeled barplots


def labeled_barplot(df, feature, perc=False, n=None):
    """
    Draw a count plot of one column and label every bar with its count or percentage.

    df: dataframe holding the data to plot
    feature: name of the dataframe column whose levels are counted
    perc: whether to display percentages instead of counts (default is False)
    n: displays the top n category levels by count (default is None, i.e., display all levels)

    The selected levels are drawn in sorted order of their labels. The figure is
    displayed with plt.show(); nothing is returned.
    """

    total = len(df[feature])  # number of rows, used as the percentage denominator

    # value_counts() orders levels by descending count, so [:n] keeps the n most
    # frequent ones (all of them when n is None); they are then sorted by label.
    shown_levels = df[feature].value_counts().index[:n].sort_values()

    # Size the figure by the number of bars actually drawn, so asking for more
    # levels than exist does not produce a needlessly wide figure.
    plt.figure(figsize=(len(shown_levels) + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=df,
        x=feature,
        palette="Paired",
        order=shown_levels,
    )

    for bar in ax.patches:
        height = bar.get_height()  # bar height = count of this level
        if perc:
            # share of all rows that fall into this level
            label = "{:.1f}%".format(100 * height / total)
        else:
            # the height may come back as a float; format it without decimals
            # so the label reads "5" rather than "5.0"
            label = "{:.0f}".format(height)

        x = bar.get_x() + bar.get_width() / 2  # horizontal centre of the bar

        ax.annotate(
            label,
            (x, height),  # anchor at the top centre of the bar
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge the text 5 points above the bar
            textcoords="offset points",
        )

    plt.show()  # show the plot

ChatGPT3: Question:

    step 0: for this request I only want to change the step 2 labeled_barplot function (and how it is called) so that it
    gets the count of each GICS Sub Industry and plots the bars in descending order of count rather than in alphabetical order. step 1: import packages, load data.csv, get the features that are of dtype == object, copy data to df as follows: ```# Libraries to help with reading and manipulating data
    import numpy as np
    import pandas as pd

    # Libraries to help with data visualization
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_theme(style='darkgrid')

    # Removes the limit for the number of displayed columns
    pd.set_option("display.max_columns", None)
    # Sets the limit for the number of displayed rows
    pd.set_option("display.max_rows", 200)

    # to scale the data using z-score
    from sklearn.preprocessing import StandardScaler

    # to compute distances
    from scipy.spatial.distance import cdist, pdist

    # to perform k-means clustering and compute silhouette scores
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # to visualize the elbow curve and silhouette scores
    from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

    # to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
    from sklearn.cluster import AgglomerativeClustering
    from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

    # to suppress warnings
    import warnings
    warnings.filterwarnings("ignore")
    # ----------------------------------------------------------------------------------------------
    # ChatGPT3 recommended libraries:
    from scipy.spatial.distance import pdist, squareform

    # https://docs.xlwings.org/en/latest/jupyternotebooks.html
    from xlwings import view
    from xlwings import load
    # ----------------------------------------------------------------------------------------------
    ## Complete the code to import the data
    data = pd.read_csv('../data/stock_data.csv')
    # ----------------------------------------------------------------------------------------------
    object_list=[]
    for col in data.columns:
        if data[col].dtype == 'object':
            object_list.append(col)
    object_list
    # ----------------------------------------------------------------------------------------------
    # copying the data to another variable to avoid any changes to original data
    df = data.copy()``` step 2: define labeled barplot based on sns.countplot as follows: ```# function to create labeled barplots


    def labeled_barplot(df, feature, perc=False, n=None):
        """
        Barplot with percentage at the top

        data: dataframe
        feature: dataframe column
        perc: whether to display percentages instead of count (default is False)
        n: displays the top n category levels (default is None, i.e., display all levels)
        """

        total = len(df[feature])  # length of the column
        count = df[feature].nunique()
        if n is None:
            plt.figure(figsize=(count + 1, 5))
        else:
            plt.figure(figsize=(n + 1, 5))

        plt.xticks(rotation=90, fontsize=15)
        ax = sns.countplot(
            data=df,
            x=feature,
            palette="Paired",
            order=df[feature].value_counts().index[:n].sort_values(),
        )

        for p in ax.patches:
            if perc == True:
                label = "{:.1f}%".format(
                    100 * p.get_height() / total
                )  # percentage of each class of the category
            else:
                label = p.get_height()  # count of each level of the category

            x = p.get_x() + p.get_width() / 2  # width of the plot
            y = p.get_height()  # height of the plot

            ax.annotate(
                label,
                (x, y),
                ha="center",
                va="center",
                size=12,
                xytext=(0, 5),
                textcoords="offset points",
            )  # annotate the percentage

        plt.show()  # show the plot``` step 3 (I should have stated this first): data.csv is fake, simulated stock data; here are the first baker's dozen of rows: ```Ticker Symbol,Security,GICS Sector,GICS Sub Industry,Current Price,Price Change,Volatility,ROE,Cash Ratio,Net Cash Flow,Net Income,Earnings Per Share,Estimated Shares Outstanding,P/E Ratio,P/B Ratio
    AAL,American Airlines Group,Industrials,Airlines,42.349998,9.999994805,1.687151059,135,51,-604000000,7610000000,11.39,668129938.5,3.718173661,-8.784219448
    ABBV,AbbVie,Health Care,Pharmaceuticals,59.240002,8.339433065,2.19788722,130,77,51000000,5144000000,3.15,1633015873,18.80634984,-8.75006804
    ABT,Abbott Laboratories,Health Care,Health Care Equipment,44.91,11.30112076,1.273646006,21,67,938000000,4423000000,2.94,1504421769,15.2755102,-0.394171377
    ADBE,Adobe Systems Inc,Information Technology,Application Software,93.940002,13.97719519,1.357678918,9,180,-240840000,629551000,1.26,499643650.8,74.55555714,4.199651085
    ADI,"Analog Devices, Inc.",Information Technology,Semiconductors,55.32,-1.827858095,1.701168792,14,272,315120000,696878000,0.31,2247993548,178.4516129,1.059809981
    ADM,Archer-Daniels-Midland Co,Consumer Staples,Agricultural Products,36.68,-12.01726822,1.516492636,10,49,-189000000,1849000000,2.99,618394648.8,12.26755853,7.496830719
    ADS,Alliance Data Systems,Information Technology,Data Processing & Outsourced Services,276.570007,6.189285566,1.116976328,30,25,90885000,596541000,8.91,66951851.85,31.04040483,129.0645854
    AEE,Ameren Corp,Utilities,MultiUtilities,43.23,2.174424435,1.124186428,9,14,287000000,636000000,2.6,244615384.6,16.62692308,-0.719496855
    AEP,American Electric Power,Utilities,Electric Utilities,58.27,2.371753421,1.06848509,11,9,13900000,2052300000,3.13,421897810.2,18.45654341,-3.022648785
    AFL,AFLAC Inc,Financials,Life & Health Insurance,59.900002,3.027180995,1.04829468,14,99,-308000000,2533000000,5.88,430782312.9,10.18707517,-1.883912008
    AIG,"American International Group, Inc.",Financials,Property & Casualty Insurance,61.970001,8.358108207,1.106965388,2,99,-129000000,2196000000,1.69,1299408284,36.66863964,-4.327138294
    AIV,Apartment Investment & Mgmt,Real Estate,REITs,40.029999,7.578608105,1.163333638,15,47,21818000,248710000,1.52,163625000,26.33552566,-1.269332159
    ```

ChatGPT3 response (adapted):

step 0: labeled_barplot will be modified so that it can receive a parameter 'sort_by_count' to change the way the countplot is ordered

- If sort_by_count is True, then the countplot should be ordered by the descending count of each category. Otherwise, the countplot should be ordered by alphabetical order.



step 1: Import packages, load data.csv, get the features that are of dtype == object, copy data to df as follows:

    # Libraries to help with reading and manipulating data
    import numpy as np
    import pandas as pd

    # Libraries to help with data visualization
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_theme(style='darkgrid')

    # Removes the limit for the number of displayed columns
    pd.set_option("display.max_columns", None)
    # Sets the limit for the number of displayed rows
    pd.set_option("display.max_rows", 200)

    # to scale the data using z-score
    from sklearn.preprocessing import StandardScaler

    # to compute distances
    from scipy.spatial.distance import cdist, pdist

    # to perform k-means clustering and compute silhouette scores
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # to visualize the elbow curve and silhouette scores
    from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

    # to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
    from sklearn.cluster import AgglomerativeClustering
    from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

    # to suppress warnings
    import warnings
    warnings.filterwarnings("ignore")

    # -----------------------------------------------------------------------------------------------------------
    # ChatGPT3 recommended libraries:
    from scipy.spatial.distance import pdist, squareform
    from xlwings import view
    from xlwings import load
    # -----------------------------------------------------------------------------------------------------------
    ## Complete the code to import the data
    data = pd.read_csv('../data/stock_data.csv')
    # -----------------------------------------------------------------------------------------------------------
    object_list = [col for col in data.columns if data[col].dtype == 'object']
    # -----------------------------------------------------------------------------------------------------------
    # copying the data to another variable to avoid any changes to original data
    df = data.copy()


The code from the markdown cell above, as an executable code cell:

In [5]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style='darkgrid')

# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# to scale the data using z-score
from sklearn.preprocessing import StandardScaler

# to compute distances
from scipy.spatial.distance import cdist, pdist

# to perform k-means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# to visualize the elbow curve and silhouette scores
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

# to suppress warnings
import warnings
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------------------------------------
# ChatGPT3 recommended libraries:
from scipy.spatial.distance import pdist, squareform
from xlwings import view
from xlwings import load
# -----------------------------------------------------------------------------------------------------------
# Read the stock dataset from '../data/stock_data.csv' into a DataFrame
data = pd.read_csv('../data/stock_data.csv')
# -----------------------------------------------------------------------------------------------------------
# Names of the columns whose dtype is 'object'
object_list = [name for name, col_dtype in data.dtypes.items() if col_dtype == 'object']
# -----------------------------------------------------------------------------------------------------------
# Keep `data` as loaded and do all further work on a separate copy
df = data.copy()


step 2: The modified function labeled_barplot based on sns.countplot is as follows:

In [6]:
def labeled_barplot(df, feature, perc=False, n=None, sort_by_count=False):
    """
    Barplot with percentage at the top

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    sort_by_count: whether to sort the categories by count (default is False)
    """

    total = len(df[feature])  # length of the column
    count = df[feature].nunique()
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    if sort_by_count:
        ax = sns.countplot(
            data=df,
            x=feature,
            palette="Paired",
            order=df[feature].value_counts().sort_values(ascending=False).index[:n],
        )
    else:
        ax = sns.countplot(
            data=df,
            x=feature,
            palette="Paired",
            order=df[feature].sort_values().unique()[:n],
        )

    for p in ax.patches:
        if perc == True:
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category