* Source : https://medium.com/plotly/nlp-visualisations-for-clear-immediate-insights-into-text-data-and-outputs-9ebfab168d5b

In [1]:
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE


In [2]:
from google.colab import drive
isMount=!df |grep /content/drive
if len(isMount) < 1 : drive.mount('/content/drive')


In [3]:
# Folder on the mounted Drive that holds every CSV read by this notebook.
base_path = "/content/drive/MyDrive/Lec_Capture/_dataNLP/dash_nlp"


In [4]:
# Widen pandas' printed output: allow up to 20 columns and 320 characters per row.
desired_width = 320
pd.set_option("display.width", desired_width)
pd.set_option("display.max_columns", 20)

# Load the complaint-narrative sample; the CSV's first column becomes the index.
comp_df = pd.read_csv(
    f"{base_path}/customer_complaints_narrative_sample.csv",
    index_col=0,
)



# ========== EXPLORATORY DATA ANALYSIS ==========


# Complaints by date


In [5]:

# Histogram of complaint counts over the `datetime` column.
fig = px.histogram(
    data_frame=comp_df,
    x="datetime",
    title="Complaint counts by date",
    template="plotly_white",
)
# NOTE(review): `categoryorder` only applies to category-type axes; if plotly
# infers a date axis here it has no effect — confirm which is intended.
fig.update_xaxes(title="Date", categoryorder="category descending")
fig.update_yaxes(title="Number of complaints")
fig.update_layout(height=500, width=1200)
fig.show()



# Complaints by word count


In [6]:
# Histogram of complaint lengths, using the `Words_clipped` column
# (word counts clipped at 1000, per the axis label below).
fig = px.histogram(
    comp_df,
    x="Words_clipped",
    template="plotly_white",
    # Title wording matches the other "Complaint counts by ..." charts.
    title="Complaint counts by length",
)
# NOTE(review): `categoryorder` only affects category-type axes; if
# `Words_clipped` is numeric this setting is a no-op — confirm.
fig.update_xaxes(
    categoryorder="total descending",
    title="Number of words (clipped at 1000 words)",
).update_yaxes(title="Number of complaints")
fig.update_layout(width=1200, height=500)
fig.show()

# There are too many companies to display - let's just show a few companies


# ========== Top n companies only ==========


# Pre-processing data


In [7]:

# Rank companies by their number of non-null "Date received" entries and
# keep the names of the ten with the most complaints.
top_comps = (
    comp_df.groupby("Company")["Date received"]
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)

# Restrict the complaints table to rows belonging to those companies.
top_comps_df = comp_df.loc[comp_df["Company"].isin(top_comps)]



# Top companies by complaints


In [8]:

# Complaint counts for the selected top companies, tallest bar first.
fig = px.histogram(
    data_frame=top_comps_df,
    x="Company",
    title="Complaint counts by company",
    template="plotly_white",
)
fig.update_xaxes(categoryorder="total descending")
fig.update_yaxes(title="Number of complaints")
fig.update_layout(height=500, width=1200)
fig.show()



# Complaints by company & date


In [9]:

# Complaint counts over time for the top companies: one colour per company,
# bars grouped side by side, about six date bins, log-scaled y axis.
fig = px.histogram(
    data_frame=top_comps_df,
    x="datetime",
    color="Company",
    barmode="group",
    nbins=6,
    log_y=True,
    title="Complaint counts by date & company",
    template="plotly_white",
)
# NOTE(review): `categoryorder` only applies to category-type axes; if plotly
# infers a date axis here it has no effect — confirm which is intended.
fig.update_xaxes(title="Date", categoryorder="category descending")
fig.update_yaxes(title="Number of complaints")
fig.update_layout(height=500, width=1200)
fig.show()



# ========== PLOT N-GRAM RELATED DATA HERE ==========


In [10]:

# Load the bigram counts; the CSV's first column becomes the index.
bigram_df = pd.read_csv(f"{base_path}/bigram_data.csv", index_col=0)

# Bar chart of the first 20 rows.
# NOTE(review): presumably the CSV is already sorted by count, so these are
# the "top" bigrams — verify against the file.
fig = px.bar(
    data_frame=bigram_df.head(20),
    x="ngram",
    y="count",
    labels={"ngram": "Bigram", "count": "Count"},
    title="Counts of top bigrams",
    template="plotly_white",
)
fig.update_layout(height=500, width=1200)
fig.show()



# Hierarchical Treemap

In [11]:
# Toy hierarchy for the treemap demo, written as child -> parent.
# An empty parent marks the root node.
family_tree = {
    "Eve": "",
    "Cain": "Eve",
    "Seth": "Eve",
    "Enos": "Seth",
    "Noam": "Seth",
    "Abel": "Eve",
    "Awan": "Eve",
    "Enoch": "Awan",
    "Azura": "Eve",
}
fig = px.treemap(
    names=list(family_tree.keys()),
    parents=list(family_tree.values()),
)
fig.update_layout(height=600, width=1200)
fig.show()



# Visualising proportions


In [12]:
# Load the per-company bigram data; the CSV's first column becomes the index.
comp_grp_df = pd.read_csv(f"{base_path}/comp_bigram_data.csv", index_col=0)

# Alternative view (disabled): bubble chart of bigram vs. company.
# fig = px.scatter(comp_grp_df, x='bigram', y='company', size='count', color='Words', template='plotly_white',
#                  labels={'Words':'Length<BR>(words)', 'bigram': 'Bigram', 'company': 'Company'},
#                  category_orders=top_comps, range_color=[150, 450], color_continuous_scale=px.colors.sequential.YlOrRd)
# fig.update_traces(marker=dict(line=dict(width=1, color='Gray')))
# fig.update_layout(width=1200, height=500)
# fig.show()

# Horizontal bars: one row per company, coloured by bigram,
# with bar length taken from the `portion` column.
fig = px.bar(
    data_frame=comp_grp_df,
    x="portion",
    y="company",
    color="bigram",
    orientation="h",
    labels={"portion": "% of Complaints", "bigram": "Bigram", "company": "Company"},
    color_discrete_sequence=px.colors.qualitative.Safe,
    template="plotly_white",
)
fig.update_layout(
    font_size=10,
    font_color="DarkSlateGray",
    width=1200,
    height=500,
)
fig.show()



# ========== PLOT BIGRAM EMBEDDING (t-SNE) DATA HERE ==========


In [13]:

# Bigram vectors that are fed to t-SNE further down.
vects_df = pd.read_csv(
    f"{base_path}/bigram_vectors.csv",
    index_col=0,
)

# Bigram embedding dataframe, with placeholder tsne values (at perplexity=3).
embed_df = pd.read_csv(
    f"{base_path}/tsne_bigram_data.csv",
    index_col=0,
)



# Try different t-SNE values here


In [14]:
# Project the bigram vectors to two dimensions with t-SNE; change `perplexity`
# here to experiment.
# NOTE(review): no random_state is set, so the layout may differ between runs.
X_embedded = TSNE(perplexity=3, n_components=2).fit_transform(vects_df)

# Store the two embedding coordinates as columns of the plotting dataframe.
embed_df["tsne_1"], embed_df["tsne_2"] = X_embedded[:, 0], X_embedded[:, 1]



# Plot t-SNE graph


In [15]:

# Bubble chart of the t-SNE embedding: each point is a bigram, labelled with
# its text, sized by `count` and coloured by `words`.
fig = px.scatter(
    data_frame=embed_df,
    x="tsne_1",
    y="tsne_2",
    text="bigram",
    hover_name="bigram",
    size="count",
    size_max=45,
    color="words",
    color_continuous_scale=px.colors.sequential.Sunsetdark,
    labels={"words": "Avg. Length<BR>(words)"},
    title="Bigram similarity and frequency",
    template="plotly_white",
)
# Thin gray outline so overlapping bubbles stay distinguishable.
fig.update_traces(marker_line_width=1, marker_line_color="Gray")
# Hide both axes.
fig.update_yaxes(visible=False)
fig.update_xaxes(visible=False)
fig.update_layout(height=500, width=1200)
fig.show()


