In [None]:
import numpy as np
import pandas as pd


"""
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

https://medium.com/analytics-vidhya/a-simple-yet-effective-way-of-text-cleaning-using-nltk-4f90a8ff21d4
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

"""

In [None]:
from typing import List, Dict
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
stopwords.words('english')

def drop_stop_words(df: pd.DataFrame) -> pd.DataFrame:
    print(df.shape)
    words_to_drop = [col for col in df.columns if col in stopwords.words('english')]
    df.drop(words_to_drop, axis=1, inplace=True)
    print(df.shape)
    return df

def get_correlation_pairs(tfidf, corr_type):
    correlations = tfidf.corr(corr_type)
    correlations_stacked = correlations.abs().unstack().sort_values(ascending=False)
    correlations_stacked = correlations_stacked[correlations_stacked < 1]
    return correlations_stacked

def combone_pairs(X, strong_corr_pairs):
    droped = []
    for pair in strong_corr_pairs:
        pair_splited = pair.split("_")
        if all([False if word in droped else True for word in pair_splited]):
            X[pair] = X[pair_splited].mean(axis=1)
            X.drop(pair_splited, axis=1, inplace=True)
            droped.extend(pair_splited)
    print(X.shape)
    return X

def run_transformer(X):
    transformer = TfidfTransformer()
    tfidf_sk = pd.DataFrame(
        transformer.fit_transform(X.values).toarray(),
        columns=X.columns,
        index=X.index
    )
    return transformer, tfidf_sk

In [None]:
df

In [None]:
from scipy.stats import chi2_contingency
  
# defining the table
data = [[207, 282, 241], [234, 242, 232]]
stat, p, dof, expected = chi2_contingency(data)
  

In [None]:
df = pd.read_csv('text_training.csv', index_col="ID") # rollout_X

In [None]:
df = pd.read_csv('text_training.csv', index_col="ID") # rollout_X
df = df[df.sum().sort_values(ascending=False).index]
df = df.loc[df.sum(axis=1).sort_values(ascending=False).index]
df = drop_stop_words(df)

In [None]:
label_freq = df.groupby(["rating"]).mean().T.rename(columns={1:"label_freq"})
label_freq["diff"] = label_freq["label_freq"].sub(label_freq[0]).abs()
label_freq = label_freq.sort_values("diff", ascending=False).reset_index()

In [None]:
fig = px.scatter(
    label_freq, x="diff", y="index", #color="rating_freq_log_abs", 
#                  hover_data=["count_review", "rating_freq_log_abs"]
)
fig.show()

In [None]:

from scipy.stats import chi2_contingency
  

stat, p, dof, expected = chi2_contingency(df[["rating", "love"]].values)
  
# interpret p-value
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
    print('Dependent (reject H0)')
else:
    print('Independent (H0 holds true)')

In [None]:
X, y = df.drop("rating", axis=1).copy(), df["rating"].copy()

transformer = TfidfTransformer()
tfidf_sk = pd.DataFrame(
    transformer.fit_transform(X.values).toarray(),
    columns=X.columns,
    index=X.index
)

In [None]:
correlations_pearson = get_correlation_pairs(tfidf_sk, "pearson")
correlations_spearman = get_correlation_pairs(tfidf_sk, "spearman")

correlations_stacked = (correlations_pearson.to_frame()
 .rename(columns={0:"pearson"})
 .join(correlations_spearman.to_frame()
       .rename(columns={0:"spearman"})))

correlations_stacked = correlations_stacked.iloc[range(0, len(correlations_stacked), 2)]
correlations_stacked.index = ["_".join(pair) for pair in correlations_stacked.index.to_list()]

In [None]:
strong_corr = []
for cor in ["pearson", "spearman"]:
    strong_corr.extend(correlations_stacked["pearson"].sort_values(ascending=False).head(60).index.to_list())
strong_corr_pairs = list(set(strong_corr))

In [None]:
import plotly.graph_objects as go

correlations_stacked_plot = correlations_stacked.loc[strong_corr].reset_index().sort_values("pearson")
fig = go.Figure()

for cor in ["pearson", "spearman"]:

# Add traces
    fig.add_trace(
        go.Scatter(
            y=correlations_stacked_plot["index"], 
            x=correlations_stacked_plot[cor],
            mode='markers',
            text=correlations_stacked_plot[["pearson", "spearman"]],
            name=cor)
    )


fig.show()

In [None]:
X, y = combone_pairs(df.drop("rating", axis=1), strong_corr_pairs), df["rating"]
_, tfidf_sk = run_transformer(X)

In [None]:
import plotly.graph_objects as go

tfidf_sk_plot = (tfidf_sk
#  .replace({0:None})
 .mean(axis=0).to_frame().rename(columns={0:"tfidf"})
 .join(
     X.replace({0:None})
     .notnull().sum(axis=0)
     .to_frame()
     .rename(columns={0:"count_review"}))
 .sort_values("tfidf", ascending=False)
 .join(
     X
     .join(y.to_frame())
     .groupby("rating", as_index=False)
     .mean().T[1]
     .to_frame()
     .rename(
         columns={1:"rating_freq"}
     )       
)).reset_index().sort_values("tfidf", ascending=False)

tfidf_sk_plot["rating_freq_log"] = np.log(tfidf_sk_plot["rating_freq"])
tfidf_sk_plot["rating_freq_log_add"] = np.log(tfidf_sk_plot["rating_freq"].add(1e-06))

tfidf_sk_plot["rating_freq_log_abs"] = tfidf_sk_plot["rating_freq_log"].abs()
tfidf_sk_plot.loc[tfidf_sk_plot["rating_freq_log"]==-np.inf, "rating_freq_log_abs"] = 0

In [None]:
fig = px.scatter(
    tfidf_sk_plot, x="tfidf", y="index", color="rating_freq_log_abs", 
                 hover_data=["count_review", "rating_freq_log_abs"]
)
fig.show()

In [None]:
high_tfidf_value: list = tfidf_sk_plot[tfidf_sk_plot["tfidf"] < 5e-03]["index"].to_list()
_, tfidf_sk = run_transformer(X.filter(high_tfidf_value, axis=1))

In [None]:
tfidf_sk

In [253]:
import plotly.graph_objects as go

correlations_stacked_plot = correlations_stacked.loc[strong_corr].reset_index().sort_values("pearson")
fig = go.Figure()

for cor in ["pearson", "spearman"]:

# Add traces
    fig.add_trace(
        go.Scatter(
            y=correlations_stacked_plot["index"], 
            x=correlations_stacked_plot[cor],
            mode='markers',
            text=correlations_stacked_plot[["pearson", "spearman"]],
            name=cor)
    )


fig.show()

In [256]:
X, y = combone_pairs(df.drop("rating", axis=1), strong_corr_pairs), df["rating"]
_, tfidf_sk = run_transformer(X)

In [408]:
import plotly.graph_objects as go

tfidf_sk_plot = (tfidf_sk
#  .replace({0:None})
 .mean(axis=0).to_frame().rename(columns={0:"tfidf"})
 .join(
     X.replace({0:None})
     .notnull().sum(axis=0)
     .to_frame()
     .rename(columns={0:"count_review"}))
 .sort_values("tfidf", ascending=False)
 .join(
     X
     .join(y.to_frame())
     .groupby("rating", as_index=False)
     .mean().T[1]
     .to_frame()
     .rename(
         columns={1:"rating_freq"}
     )       
)).reset_index().sort_values("tfidf", ascending=False)

tfidf_sk_plot["rating_freq_log"] = np.log(tfidf_sk_plot["rating_freq"])
tfidf_sk_plot["rating_freq_log_add"] = np.log(tfidf_sk_plot["rating_freq"].add(1e-06))

tfidf_sk_plot["rating_freq_log_abs"] = tfidf_sk_plot["rating_freq_log"].abs()
tfidf_sk_plot.loc[tfidf_sk_plot["rating_freq_log"]==-np.inf, "rating_freq_log_abs"] = 0


divide by zero encountered in log



In [412]:
fig = px.scatter(
    tfidf_sk_plot, x="tfidf", y="index", color="rating_freq_log_abs", 
                 hover_data=["count_review", "rating_freq_log_abs"]
)
fig.show()

In [449]:
high_tfidf_value: list = tfidf_sk_plot[tfidf_sk_plot["tfidf"] < 5e-03]["index"].to_list()
_, tfidf_sk = run_transformer(X.filter(high_tfidf_value, axis=1))

In [450]:
tfidf_sk

Unnamed: 0_level_0,varieti,hope,bake,couldnt,abl,chang,cream_ice,pet,wish,gave,...,toxic,refil,trap_mebr,push,carbohydr,steep,canin,report,muffin,lie
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
716,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.073343,0.000000,0.0,0.0,0.0,0.0,0.077051
1478,0.0,0.0,0.0,0.0,0.0,0.100365,0.0,0.0,0.0,0.0345,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.107827
579,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.081082,0.0,0.0,0.0,0.0,0.000000
1282,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.081082,0.0,0.0,0.0,0.0,0.000000
1244,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.081082,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
490,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
873,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000
1758,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0000,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000


grey_earl          8.535696e-01
jack_link          8.533437e-01
site_web           6.418496e-01
pill_pocket        6.267510e-01
custom_servic      6.193593e-01
                       ...     
combin_descript    1.313117e-07
final_act          6.832005e-08
true_half          6.098256e-08
huge_wish          4.934614e-08
improv_long        1.910294e-08
Length: 1122751, dtype: float64

In [12]:
correlations_stacked_plot

Unnamed: 0,index,corr
0,grey_earl,0.853570
1,jack_link,0.853344
2,site_web,0.641850
3,pill_pocket,0.626751
4,custom_servic,0.619359
...,...,...
1053,imagin_lbs,0.200221
1054,chip_these,0.200165
1055,shell_occasion,0.200153
1056,berri_question,0.200086


In [14]:
import plotly.express as px


In [17]:
help(px.bar)

Help on function bar in module plotly.express._chart_types:

bar(data_frame=None, x=None, y=None, color=None, pattern_shape=None, facet_row=None, facet_col=None, facet_col_wrap=0, facet_row_spacing=None, facet_col_spacing=None, hover_name=None, hover_data=None, custom_data=None, text=None, base=None, error_x=None, error_x_minus=None, error_y=None, error_y_minus=None, animation_frame=None, animation_group=None, category_orders=None, labels=None, color_discrete_sequence=None, color_discrete_map=None, color_continuous_scale=None, pattern_shape_sequence=None, pattern_shape_map=None, range_color=None, color_continuous_midpoint=None, opacity=None, orientation=None, barmode='relative', log_x=False, log_y=False, range_x=None, range_y=None, text_auto=False, title=None, template=None, width=None, height=None)
        In a bar plot, each row of `data_frame` is represented as a rectangular
        mark.
        
    Parameters
    ----------
    data_frame: DataFrame or array-like or dict
        

In [None]:
correlations.loc[correlations[correlations > 0.01].count() > 2]

In [None]:
correlations.abs().apply(lambda x: x if x[x > 0.15].count() > 2 else None , axis=1)

In [None]:
fig = px.imshow(tfidf_sk.corr(), text_auto=True, aspect="auto")
fig.show()


In [None]:


"""
instaed of droping 'nobr' non exsting word will add 1 to all the words
"""
tfidf_local: pd.DataFrame = calc_tfidf(X.drop("nobr", axis=1))
tfidf_local.loc[tfidf_local.sum(axis=1).sort_values(ascending=False).index].head(5)
