In [1]:
# Women's E-commerce Clothing Reviews

In [None]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the reviews; the first CSV column is used as the row index.
data = pd.read_csv('./kaggle.csv', index_col=[0])

# Check data values: percentage of missing entries per column, then dtypes.
# The percentages are printed explicitly because a bare expression in the
# middle of a cell is evaluated but never displayed.
missing_pct = data.isnull().sum() / len(data) * 100
print(missing_pct)
data.info()

# Drop unwanted columns. Plain assignment instead of inplace=True keeps the
# step explicit and chainable.
data = data.drop(columns=['Clothing ID', 'Title'])

# Rows whose review text is missing (rendered as the cell output).
data[data['Review Text'].isnull()]


In [None]:
# Check the data distribution of each feature.
# Every figure is rendered with .show(): only the last bare expression of a
# cell is displayed, so without it all but the final plot are silently dropped.
import plotly.express as px

px.histogram(data, x='Age').show()
px.histogram(data, x='Rating').show()
px.histogram(data, x='Class Name').show()

# Age vs. positive-feedback count, faceted by recommendation flag and rating.
# NOTE(review): trendline="ols" needs the statsmodels package at runtime.
px.scatter(
    data,
    x="Age",
    y="Positive Feedback Count",
    facet_row="Recommended IND",
    facet_col="Rating",
    trendline="ols",
    category_orders={"Rating": [1, 2, 3, 4, 5], 'Recommended IND': [0, 1]},
).show()

# Violin plot -> think of it as a box plot for continuous data.
px.violin(data, x="Age", y="Department Name", orientation="h", color="Recommended IND").show()
px.box(data, x="Age", y="Division Name", orientation="h", color='Recommended IND').show()


In [None]:
# Cleaning text data: remove leftover "&amp" entity fragments and
# non-breaking spaces ("\xa0") from the review text, reporting how many
# occurrences exist before and after.
import re


def count_artifact(token):
    """Return the total number of occurrences of `token` in all review texts.

    Missing reviews yield NaN counts, which `sum()` skips.
    """
    per_review = data['Review Text'].str.count(re.escape(token))
    return int(per_review.sum())


def report_artifacts():
    """Print the current occurrence count of each artifact."""
    print('with &amp', count_artifact('&amp'))
    print('with (\xa0)', count_artifact('\xa0'))


report_artifacts()  # before cleaning

# regex=False makes the match literal on every pandas version. The previous
# pattern '(&amp)' depended on the regex default; where that default is False
# the parentheses are matched literally and nothing is replaced.
data['Review Text'] = data['Review Text'].str.replace('&amp', '', regex=False)
# A non-breaking space is still a word separator: swap it for a normal space
# so neighbouring words are not glued together.
data['Review Text'] = data['Review Text'].str.replace('\xa0', ' ', regex=False)

report_artifacts()  # after cleaning: both counts should be 0

In [None]:
#!pip install TextBlob
# Import the one name that is used; without it this cell raises NameError on
# a fresh kernel.
from textblob import TextBlob

# TextBlob: a text-processing library widely used alongside NLTK.
# `sentiment.polarity` expresses how negative or positive a text is as a
# value between -1 and 1.
# -> allows quick sentiment analysis of sentences.
# na_action='ignore' leaves missing reviews as NaN instead of passing a float
# NaN to TextBlob, which only accepts strings.
data['polarity'] = data['Review Text'].map(
    lambda text: TextBlob(text).sentiment.polarity,
    na_action='ignore',
)

# .show() is required for the first figure: a bare expression in the middle
# of a cell is not displayed.
px.histogram(data, x='polarity').show()

px.box(data, y="polarity", x="Department Name", orientation="v", color='Recommended IND').show()


In [None]:
# EDA based on review length.
# Missing reviews are treated as empty text so they count as 0 characters and
# 0 tokens; str(NaN) would otherwise be measured as the 3-character word 'nan'.
review_text = data['Review Text'].fillna('').astype(str)

# Number of characters per review.
data['review_len'] = review_text.str.len()
px.histogram(data, x='review_len').show()

# Number of whitespace-separated tokens per review.
data['token_count'] = review_text.str.split().str.len()
px.histogram(data, x='token_count').show()


In [None]:
# examples

SAMPLE_SIZE = 3
SAMPLE_SEED = 0  # fixed so the printed examples are reproducible

polarity = data['polarity']

# Polarity buckets. Together they cover every scored review:
#   negative: p < 0,  neutral: 0 <= p <= 0.5,  positive: p > 0.5
# (previously rows with p == 0 belonged to no bucket, so the three
# percentages did not add up to 100).
is_negative = polarity < 0
is_neutral = (polarity >= 0) & (polarity <= 0.5)
is_positive = polarity > 0.5


def print_examples(title, mask):
    """Print up to SAMPLE_SIZE randomly chosen reviews from the rows in `mask`.

    Sampling fewer rows when not enough match avoids the ValueError that
    `.sample(3)` raises on a selection with fewer than 3 rows.
    """
    matches = data.loc[mask, 'Review Text']
    print(title)
    if matches.empty:
        print('(no matching reviews)')
        return
    n_rows = min(SAMPLE_SIZE, len(matches))
    for review in matches.sample(n_rows, random_state=SAMPLE_SEED):
        print(review)


# The positive examples are the most extreme ones (polarity exactly 1).
print_examples('positive polarity', polarity == 1)
# The neutral examples use the same range as the neutral percentage below.
print_examples('neutral polarity', is_neutral)
print_examples('negative polarity', is_negative)

# Share of each bucket, in percent of the reviews that have a polarity score.
scored_count = polarity.notna().sum()
negative = is_negative.sum() / scored_count * 100
positive = is_positive.sum() / scored_count * 100
neutral = is_neutral.sum() / scored_count * 100
print(f'negative: {negative:.2f}%  neutral: {neutral:.2f}%  positive: {positive:.2f}%')

In [None]:
# correlation heatmap
import seaborn as sns

# NOTE(review): `X` is not defined in any cell visible in this notebook; it
# presumably is a feature frame derived from `data` -- confirm and add the
# defining cell so the notebook survives Restart & Run All.

# Column-wise correlation. numeric_only=True is required because X also holds
# text columns (e.g. 'Review Text'), which recent pandas versions refuse to
# correlate instead of silently skipping.
cor = X.corr(numeric_only=True)
sns.heatmap(cor, annot=True)

# Multi-collinearity: collect every column that is strongly correlated with
# at least one other column. The absolute value is used so that strong
# negative correlations are caught as well.
CORR_THRESHOLD = 0.8
strongly_correlated = cor.abs() > CORR_THRESHOLD
set1 = {
    col
    for col in cor.columns
    for other in cor.columns
    if col != other and strongly_correlated.loc[other, col]
}
print(set1)

# errors='ignore' keeps the cell re-runnable after the column is gone.
X = X.drop(columns=['token_count'], errors='ignore')
print(X.corr(numeric_only=True))

# Binary sentiment label: 1 when polarity >= 0, otherwise 0.
# Vectorised, so the result always has one label per row; a NaN polarity
# compares False and is labelled 0 (the previous loop skipped it and then
# failed on the length mismatch).
X['sentiment'] = (X['polarity'].astype(float) >= 0.0).astype(int)

In [None]:
# model: bag-of-words features + Bernoulli Naive Bayes

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

nltk.download('stopwords')

# NOTE(review): `X` and `y` are not defined in any cell visible in this
# notebook -- confirm where they come from (y is presumably the label column).

X = X.reset_index(drop=True)

# Built once: the stop-word list and the stemmer do not change between
# reviews, and rebuilding the set for every token made the loop very slow.
stop_words = frozenset(stopwords.words('english'))
stemmer = PorterStemmer()
# 'A-Z', not 'A-z': the latter also spans the ASCII characters [ \ ] ^ _ `
# that sit between 'Z' and 'a', so they were never stripped.
non_letters = re.compile('[^a-zA-Z]')


def clean_review(text):
    """Return `text` lower-cased, letters only, stop words removed, stemmed."""
    words = non_letters.sub(' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


# Preprocessing: regex cleanup + tokenizing + stemming -> corpus.
# Missing reviews become empty documents instead of crashing re.sub.
corpus = [clean_review(text) for text in X['Review Text'].fillna('').astype(str)]

# Bag-of-words: count-based language model.
cv = CV(max_features=3000, ngram_range=(1, 1))
X_cv = cv.fit_transform(corpus).toarray()
# np.asarray accepts a Series or an ndarray, so re-running the cell is safe
# (`y.values` fails once y has already been converted).
y = np.asarray(y)

X_train, X_test, y_train, y_test = train_test_split(X_cv, y, test_size=0.20, random_state=0)
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc  # displayed as the cell output

In [None]:
# TF-IDF features + multinomial Naive Bayes

from sklearn.feature_extraction.text import TfidfVectorizer as TV
from sklearn.naive_bayes import MultinomialNB

# Unigram TF-IDF matrix, capped at 3000 features, as a dense array.
tv = TV(max_features=3000, ngram_range=(1, 1))
X_tv = tv.fit_transform(corpus).toarray()

# 80/20 hold-out split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X_tv,
    y,
    random_state=0,
    test_size=0.20,
)

# fit() returns the estimator itself, so creation and training can be chained.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)

In [None]:
# deep learning model

from itertools import islice

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One constant sizes both the tokenizer vocabulary and the embedding table,
# so the two cannot drift apart.
VOCAB_SIZE = 3000

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
padded = pad_sequences(sequences, padding='post')
word_index = tokenizer.word_index

# Peek at the first 11 (word, index) entries of the vocabulary.
for word, index in islice(word_index.items(), 11):
    print(word, index)

# Embedding + pooling + MLP
embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()
num_epochs = 10

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Hold out 20% with the same split settings as the Naive Bayes cells, so the
# reported validation accuracy is measured on unseen reviews; fitting on all
# rows only ever reports training accuracy.
labels = np.asarray(y)
padded_train, padded_val, labels_train, labels_val = train_test_split(
    padded, labels, test_size=0.20, random_state=0
)
model.fit(
    padded_train,
    labels_train,
    epochs=num_epochs,
    validation_data=(padded_val, labels_val),
)