In [1]:
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import seaborn as sns

# Global figure defaults: large square canvas, constrained layout enabled.
matplotlib.rcParams.update({
    'figure.figsize': (15, 15),
    'figure.constrained_layout.use': True,
})

In [31]:
# Read the stratified 50k-row sample of cleaned listing text into a DataFrame.
sample_csv = "../../data/random_samples/stratified_sampling_clean_text_data_by_price_whigh_sz50000_1619835594.csv"
df = pd.read_csv(sample_csv)

In [3]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

Unnamed: 0,train_id,clean_item_description,item_description_bef_word_count,item_description_bef_char_count,item_description_bef_avg_word_len,item_description_upper_word_count,item_description_upper_char_count,item_description_stopword_count,item_description_punctuation_count,item_description_number_count,...,item_name_after_avg_word_len,item_condition_id,category_name,brand_name,shipping,price,c1,c2,c3,price_bin
0,806824,new tags,3.0,13.0,4.333333,0.0,1.0,1.0,0.0,0.0,...,5.25,1,Women/Athletic Apparel/Shirts & Tops,Nike,1,15.0,women,athletic apparel,shirts & tops,"(10, 15]"
1,772820,nastasya every hills lipstick fashion,6.0,42.0,7.0,0.0,4.0,1.0,0.0,0.0,...,10.0,1,Beauty/Makeup/Lips,Anastasia Beverly Hills,0,22.0,beauty,makeup,lips,"(20, 25]"


In [6]:
# (rows, columns) of the loaded sample.
df.shape

(50000, 34)

## Vectorize the Pre-processed Item_Description column using tfidf
- Purpose: to quantify the importance of a particular word relative to a collection of documents

$$tfidf = tf(w) * idf(w)$$

where:

$$tf(w) = \frac{(number  \,\, of   \,\,times  \,\,the \,\, word  \,\,appears  \,\,in  \,\,the  \,\,document)}{(Total \,\,number\,\, of \,\,words \,\,in \,\,the \,\,document)}$$


$$idf(w) = log(\frac{Number \,\,of\,\,documents}{ number \,\,of \,\,documents \,\,that \,\,contain \,\,word \,\, w})$$







TF (Term Frequency) = measures how frequently a term occurs in a document.

IDF (Inverse Document Frequency) = measures how important a term is. Less important (more common) words have lower IDF values.

`TfidfVectorizer` computes the word counts, then the Inverse Document Frequency, and finally the TF-IDF score.

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Word-level TF-IDF over unigrams and bigrams, keeping at most 10,000 terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    max_features=10000,
)

In [42]:
# Fit the vocabulary on the cleaned descriptions and materialise the
# resulting sparse TF-IDF matrix as a dense array.
descriptions = df['clean_item_description'].values
vz = vectorizer.fit_transform(descriptions).toarray()
vz.shape

(50000, 10000)

Each row of the `vz` matrix corresponds to one item_description record,

and each column of the `vz` matrix corresponds to one feature.

In [43]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # Convert the returned array to a list to keep the original list type.
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [44]:
# Wrap the dense TF-IDF array in a DataFrame, one column per vocabulary term.
ndf = pd.DataFrame(data=vz, columns=feature_names)

In [45]:
# Preview the first rows of the TF-IDF feature frame.
ndf.head()

Unnamed: 0,00,007,01,02,03,03 months,04,05,06,07,...,zip front,zip hood,zip jacket,zip plover,zip pocket,zip pockets,zip size,zip sweatshirt,zip top,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# (rows, columns) of the TF-IDF feature frame.
ndf.shape

(50000, 10000)

In [15]:
# Map every vocabulary term to its fitted IDF weight (vectorizer.idf_).
dictionary = {
    term: idf_weight
    for term, idf_weight in zip(feature_names, vectorizer.idf_)
}

In [16]:
# One row per term (terms become the index); the single column holds the
# values stored in `dictionary`.
feature_dict = pd.DataFrame.from_dict(
    data=dictionary,
    orient='index',
    columns=['tfidf_score'],
)

In [17]:
# Order terms from the largest weight to the smallest.
feature_dict = feature_dict.sort_values(by='tfidf_score', ascending=False)

In [18]:
# Number of distinct terms in the term -> weight mapping.
len(dictionary)

10000

Visualize the top 10 features with the highest idf weights
- The higher the numerical weight value, the rarer the term
- The idf of a rare term is high, whereas the idf of a frequent term is likely to be low


In [None]:
# First ten entries of the weight column.
feature_dict['tfidf_score'].head(10)

In [None]:
# Bar chart of the terms with the largest IDF weights.
# The original sliced [:11] and therefore drew 11 bars under a "top 10" title;
# the count now comes from one constant that the slice and the title share.
TOP_N = 10
top_idf = feature_dict['tfidf_score'].head(TOP_N)

plt.bar(x=top_idf.index, height=top_idf.values)

# The plotted values come from vectorizer.idf_, so label them as IDF weights.
plt.title(f"top {TOP_N} features with the highest idf weight".title(), fontweight='bold')

plt.xticks(rotation=90)

plt.ylabel("idf weight".title(), fontsize=12)

plt.show()

## Just use the top Linear Discriminant Analysis + TSNE
- takes very long to vectorize them

In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [46]:
# Shape of the feature frame before the target columns are appended.
ndf.shape

(50000, 10000)

In [47]:
# Append the two prediction targets from the source frame as trailing columns
# (price first, then the category level stored in `c1`).
for target_col, source_col in (
    ('predict_variable_price', 'price'),
    ('predict_variable_category', 'c1'),
):
    ndf[target_col] = df[source_col]

In [48]:
# Shape of the feature frame after the two target columns were appended.
ndf.shape

(50000, 10002)

In [58]:
# Define X and y
y_price = ndf.iloc[:,-2]
y_category = ndf.iloc[:,-1]
X = ndf.iloc[:,:-2]

In [None]:
# Two-component LDA, supervised by the category labels.
clf = LDA(n_components=2)

# Fit on (X, y_category), then project X; the projection is the cell output.
clf.fit(X, y_category).transform(X)

In [63]:
# Report the shapes of the feature matrix and the price target.
for part in (X, y_price):
    print(part.shape)

(50000, 10000)
(50000,)


## pca + tsne

In [None]:
# Change the working directory to the project checkout (presumably so the
# project-local `final` package imported in the next cell resolves).
# The location can be overridden with the PROJECT_ROOT environment variable;
# the default is the original hard-coded local path, kept for compatibility.
project_root = os.environ.get(
    "PROJECT_ROOT",
    "/Users/zhiying/OneDrive - The City College of New York/DSE I2100 Applied ML and Data Mining/project/project-product-price-prediction",
)
os.chdir(project_root)

In [None]:
from final.dimension_reduction.feature_reduction import dimension_reduction

In [None]:
# Reduce the TF-IDF features with the project helper ("PCA" mode; the third
# argument is presumably the number of components -- confirm against the helper).
# Earlier cells append the price and category targets to ndf. Drop them here so
# the reduction only sees text features: the category column holds strings and
# the price column would leak the target into the embedding.
# errors='ignore' keeps this cell valid even if the targets were never appended.
tfidf_features = ndf.drop(
    columns=['predict_variable_price', 'predict_variable_category'],
    errors='ignore',
)
pca_df = dimension_reduction(tfidf_features, "PCA", 10)

In [None]:
# Shape of the reduced representation returned by dimension_reduction.
pca_df.shape

In [None]:
from sklearn.manifold import TSNE

# Fixed seed so the embedding is repeatable between runs.
random_state = 32

# 2-D t-SNE with progress logging and a 250-iteration budget.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne = TSNE(
    n_components=2,
    n_iter=250,
    random_state=random_state,
    verbose=1,
)

# Embed the reduced features into two dimensions.
tsne_tfidf = tsne.fit_transform(pca_df)

In [None]:
# Shape of the t-SNE embedding.
tsne_tfidf.shape

In [None]:
# Put the two embedding coordinates into a frame with plot-friendly names.
embedding_axes = ['x', 'y']
vis_df = pd.DataFrame(data=tsne_tfidf, columns=embedding_axes)

In [None]:
# Attach the columns used to colour the scatter plots:
# `c1` becomes `category`, `price_bin` becomes `price`.
for plot_col, source_col in (('category', 'c1'), ('price', 'price_bin')):
    vis_df[plot_col] = df[source_col]

In [None]:
# Preview only the first rows instead of rendering the whole frame.
vis_df.head()

In [None]:

# Scatter the 2-D t-SNE embedding, coloured by the `category` column.
sns.scatterplot(data=vis_df, x='x', y='y', hue='category')

plt.title("Visualize Item description t-SNE \n stratify by category".title(), fontweight = "bold", fontsize = 20)

# x / y hold the t-SNE output coordinates, not principal components,
# so the axes are labelled as embedding dimensions.
plt.xlabel("t-SNE Dimension 1", fontweight = "bold", fontsize = 20)

plt.ylabel("t-SNE Dimension 2", fontweight = "bold", fontsize = 20)

plt.show()

In [None]:

# Scatter the 2-D t-SNE embedding, coloured by the `price` column (price bins).
sns.scatterplot(data=vis_df, x='x', y='y', hue='price')

plt.title("Visualize Item description t-SNE \n stratify by price bin".title(), fontweight = "bold", fontsize = 20)

# x / y hold the t-SNE output coordinates, not principal components,
# so the axes are labelled as embedding dimensions.
plt.xlabel("t-SNE Dimension 1", fontweight = "bold", fontsize = 20)

plt.ylabel("t-SNE Dimension 2", fontweight = "bold", fontsize = 20)

plt.show()