In [2]:
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud

In [3]:
df=pd.read_csv("amazon_reviews.csv")

In [4]:
df["reviewText_raw"] = df["reviewText"]  # Ham metni saklayalım, ileride kıyaslama için bakalım


In [5]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote,reviewText_raw
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0,No issues.
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0,"Purchased this for my device, it worked as adv..."
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0,it works as expected. I should have sprung for...
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0,This think has worked out great.Had a diff. br...
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0,"Bought it with Retail Packaging, arrived legit..."


In [6]:
df["word_count_before"] = df["reviewText"].apply(lambda x: len(str(x).split()))
df["word_count_before"].head()

0     2
1    31
2    31
3    66
4    52
Name: word_count_before, dtype: int64

In [7]:
#Normalizing Case Folding

df["reviewText"]=df["reviewText"].str.lower()

#Punctuations
df["reviewText"]=df["reviewText"].str.replace("[^\w\s]"," ",regex=True)

#Numbers
df["reviewText"]=df["reviewText"].str.replace("\d"," ")

df[["reviewText","reviewText_raw"]].head()

Unnamed: 0,reviewText,reviewText_raw
0,no issues,No issues.
1,purchased this for my device it worked as adv...,"Purchased this for my device, it worked as adv..."
2,it works as expected i should have sprung for...,it works as expected. I should have sprung for...
3,this think has worked out great had a diff br...,This think has worked out great.Had a diff. br...
4,bought it with retail packaging arrived legit...,"Bought it with Retail Packaging, arrived legit..."


In [8]:
#Stop Words
import nltk

sw=stopwords.words("english")
df["reviewText"]=df["reviewText"].apply(lambda x: " ".join(x for x in str(x).split() if x not in sw))
df[["reviewText","reviewText_raw"]].head()

Unnamed: 0,reviewText,reviewText_raw
0,issues,No issues.
1,purchased device worked advertised never much ...,"Purchased this for my device, it worked as adv..."
2,works expected sprung higher capacity think ma...,it works as expected. I should have sprung for...
3,think worked great diff bran 64gb card went so...,This think has worked out great.Had a diff. br...
4,bought retail packaging arrived legit orange e...,"Bought it with Retail Packaging, arrived legit..."


In [9]:
#Tokenization
#nltk.download("punkt")
from textblob import TextBlob
df["reviewText"].apply(lambda x: TextBlob(x).words).head()
df[["reviewText","reviewText_raw"]].head()

Unnamed: 0,reviewText,reviewText_raw
0,issues,No issues.
1,purchased device worked advertised never much ...,"Purchased this for my device, it worked as adv..."
2,works expected sprung higher capacity think ma...,it works as expected. I should have sprung for...
3,think worked great diff bran 64gb card went so...,This think has worked out great.Had a diff. br...
4,bought retail packaging arrived legit orange e...,"Bought it with Retail Packaging, arrived legit..."


In [10]:
#Lemmatization
#nltk.download("wordnet")

df["reviewText"]= df["reviewText"].apply(lambda x : " ".join([Word(word).lemmatize() for word in x.split()]))
df[["reviewText","reviewText_raw"]].head()

Unnamed: 0,reviewText,reviewText_raw
0,issue,No issues.
1,purchased device worked advertised never much ...,"Purchased this for my device, it worked as adv..."
2,work expected sprung higher capacity think mad...,it works as expected. I should have sprung for...
3,think worked great diff bran 64gb card went so...,This think has worked out great.Had a diff. br...
4,bought retail packaging arrived legit orange e...,"Bought it with Retail Packaging, arrived legit..."


In [11]:
for i in range(3):  # İlk 3 örneği karşılaştır
    print(f"Orijinal: {df['reviewText_raw'][i]}")
    print(f"İşlenmiş: {df['reviewText'][i]}")
    print("-"*50)

Orijinal: No issues.
İşlenmiş: issue
--------------------------------------------------
Orijinal: Purchased this for my device, it worked as advertised. You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.
İşlenmiş: purchased device worked advertised never much phone memory since download lot stuff brainer
--------------------------------------------------
Orijinal: it works as expected. I should have sprung for the higher capacity.  I think its made a bit cheesier than the earlier versions; the paint looks not as clean as before
İşlenmiş: work expected sprung higher capacity think made bit cheesier earlier version paint look clean
--------------------------------------------------


In [12]:
#Stemming

from nltk.stem import PorterStemmer

ps = PorterStemmer()
df["reviewText"] = df["reviewText"].apply(lambda x: " ".join([ps.stem(word) for word in x.split()]))
df[["reviewText","reviewText_raw"]].head()

Unnamed: 0,reviewText,reviewText_raw
0,issu,No issues.
1,purchas devic work advertis never much phone m...,"Purchased this for my device, it worked as adv..."
2,work expect sprung higher capac think made bit...,it works as expected. I should have sprung for...
3,think work great diff bran 64gb card went sout...,This think has worked out great.Had a diff. br...
4,bought retail packag arriv legit orang envelop...,"Bought it with Retail Packaging, arrived legit..."


In [13]:
df["word_count_after"] = df["reviewText"].apply(lambda x: len(str(x).split()))
df["word_count_after"].head()

0     1
1    13
2    14
3    34
4    35
Name: word_count_after, dtype: int64

In [14]:
print("Ön işleme öncesi toplam kelime sayısı:", df["word_count_before"].sum())
print("Ön işleme sonrası toplam kelime sayısı:", df["word_count_after"].sum())

print("\nişlem öncesi ortalama kelime sayısı:",df["word_count_before"].mean())
print("\nişlem sonrası ortalama kelime sayısı:",df["word_count_after"].mean())

print("\nOrtalama kelime sayısı farkı:", df["word_count_before"].mean() - df["word_count_after"].mean())

Ön işleme öncesi toplam kelime sayısı: 247925
Ön işleme sonrası toplam kelime sayısı: 130093

işlem öncesi ortalama kelime sayısı: 50.442522889114954

işlem sonrası ortalama kelime sayısı: 26.468565615462868

Ortalama kelime sayısı farkı: 23.973957273652086


In [15]:
df[["reviewText", "word_count_before", "word_count_after"]].sample(5)  # Rastgele 5 örnek seç

Unnamed: 0,reviewText,word_count_before,word_count_after
3575,perfect everi sens use mainli access music sin...,37,20
1017,good qualiti product price bought go pro issu yet,20,9
1712,use samsung galaxi s5 store photo video data s...,28,15
3118,32gb microsdhc card replac 16gb card order eas...,24,16
336,work perfectli samsung galaxi tab 3 new cell p...,22,16


In [16]:
for i in range(3):  # İlk 3 örneği karşılaştır
    print(f"Orijinal: {df['reviewText_raw'][i]}")
    print(f"İşlenmiş: {df['reviewText'][i]}")
    print("-"*50)

Orijinal: No issues.
İşlenmiş: issu
--------------------------------------------------
Orijinal: Purchased this for my device, it worked as advertised. You can never have too much phone memory, since I download a lot of stuff this was a no brainer for me.
İşlenmiş: purchas devic work advertis never much phone memori sinc download lot stuff brainer
--------------------------------------------------
Orijinal: it works as expected. I should have sprung for the higher capacity.  I think its made a bit cheesier than the earlier versions; the paint looks not as clean as before
İşlenmiş: work expect sprung higher capac think made bit cheesier earlier version paint look clean
--------------------------------------------------
