In [None]:
!pip install wordcloud

In [None]:
import pandas as pd
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Location of the raw reviews CSV, relative to the notebook's working directory.
DATA_PATH = "GrammarandProductReviews.csv/GrammarandProductReviews.csv"

# Load the raw reviews and preview the first five rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Count missing values in every column of the raw frame.
df.isna().sum()

In [None]:
# Inferred dtype of each column.
df.dtypes


In [None]:
# Number of non-null entries per column.
df.count()

In [None]:
# Remove identifier, timestamp and reviewer-location columns from the frame (in place).
df.drop(
    columns=[
        "id",
        "ean",
        "manufacturerNumber",
        "dateAdded",
        "dateUpdated",
        "keys",
        "reviews.dateAdded",
        "reviews.dateSeen",
        "reviews.userCity",
        "reviews.userProvince",
    ],
    inplace=True,
)

In [None]:
# Remove the review id, helpful-vote count and UPC columns (in place).
df.drop(columns=["reviews.id", "reviews.numHelpful", "upc"], inplace=True)

In [None]:
# Preview the first five rows after the column drops.
df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Remove the reviewer username column (in place).
df.drop(columns=["reviews.username"], inplace=True)

In [None]:
# Preview the first five rows of the trimmed frame.
df.head()

In [None]:
# Count missing values per column in the trimmed frame.
df.isna().sum()

In [None]:
# Frequency of each distinct row across all remaining columns; show the top five.
df.value_counts().head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Count review texts for each (rating, didPurchase) pair, highest rating first.
dfl = (
    df.groupby(["reviews.rating", "reviews.didPurchase"])["reviews.text"]
    .count()
    .reset_index()
    .sort_values(by="reviews.rating", ascending=False)
)
dfl

In [None]:
import plotly.express as px

# Grouped bar chart: number of reviews per rating, split by the didPurchase flag.
# The title previously read "Common Words in Text" (copied from the word-cloud
# cells), which did not describe this chart.
fig = px.bar(
    dfl,
    x="reviews.rating",
    y="reviews.text",
    color="reviews.didPurchase",
    barmode="group",
    title="Number of Reviews by Rating and Purchase Status",
    orientation='v',
    labels={
        "reviews.rating": "Ratings",
        "reviews.text": "Number of Reviews",
        "reviews.didPurchase": "Product Purchased",
    },
)
fig.show()

In [None]:
# Keep only rows that have a review text; rows where 'reviews.text' is null are discarded.
df = df.dropna(subset=['reviews.text'])

In [None]:
# Series of review titles with the missing ones removed.
cleaned_df = df["reviews.title"].dropna()

cleaned_df.head()

In [None]:
# Concatenate every review title into one space-separated string.
text2 = " ".join(cleaned_df)
stopwords = set(STOPWORDS)

# Configure the word cloud, then generate it from the combined title text.
word_cloud2 = WordCloud(
    collocations=False,
    background_color='white',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    scale=2,
    random_state=1,
)
word_cloud2.generate(text2)

# Render the cloud on a 15x15 figure with the axes hidden.
fig = plt.figure(1, figsize=(15, 15))
plt.imshow(word_cloud2, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Concatenate every review body into one space-separated string.
text2 = " ".join(df["reviews.text"])
stopwords = set(STOPWORDS)

# Configure the word cloud, then generate it from the combined review text.
word_cloud2 = WordCloud(
    collocations=False,
    background_color='white',
    stopwords=stopwords,
    max_words=200,
    max_font_size=40,
    scale=3,
    random_state=1,
)
word_cloud2.generate(text2)

# Render the cloud on a 15x15 figure with the axes hidden.
fig = plt.figure(1, figsize=(15, 15))
plt.imshow(word_cloud2, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Count review texts for each (rating, doRecommend) pair, highest rating first.
dfw = (
    df.groupby(["reviews.rating", "reviews.doRecommend"])["reviews.text"]
    .count()
    .reset_index()
    .sort_values(by="reviews.rating", ascending=False)
)
dfw

In [None]:
# Grouped bar chart: number of reviews per rating, split by the doRecommend flag.
# The title previously read "Common Words in Text", which did not describe this chart.
fig = px.bar(
    dfw,
    x="reviews.rating",
    y="reviews.text",
    color="reviews.doRecommend",
    barmode="group",
    title="Number of Reviews by Rating and Recommendation",
    orientation='v',
    labels={
        "reviews.rating": "Ratings",
        "reviews.text": "Number of Reviews",
        "reviews.doRecommend": "Recommended",
    },
)
fig.show()

In [None]:
# Number of review texts per brand, most-reviewed brands first.
df_brands = (
    df.groupby(["brand"])["reviews.text"]
    .count()
    .reset_index()
    .sort_values(by="reviews.text", ascending=False)
)
df_brands

In [None]:
# Brands that have fewer than 20 reviews.
values = df_brands.loc[df_brands["reviews.text"] < 20, "brand"]


In [None]:
# Keep only the reviews whose brand is not in the low-volume list.
dft = df[~df["brand"].isin(values)]

In [None]:
# Per-brand review counts recomputed on the filtered frame, largest first.
df_brands_updated = (
    dft.groupby(["brand"])["reviews.text"]
    .count()
    .reset_index()
    .sort_values(by="reviews.text", ascending=False)
)

In [None]:
# Display the per-brand review counts for the filtered frame.
df_brands_updated

In [None]:
# Distinct category strings present in the filtered frame.
dft["categories"].unique()

In [None]:
# Number of distinct values per column in the filtered frame.
dft.nunique()

In [None]:
# Mean of the numeric columns per brand, best-rated brands first.
# numeric_only=True is required on pandas >= 2.0, where averaging the text
# columns (review text, title, categories, ...) raises a TypeError instead of
# silently dropping them.
df_b = (
    dft.groupby("brand")
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by="reviews.rating", ascending=False)
)
df_b

In [None]:
# Horizontal bar chart of the average rating for each brand.
# The title previously read "Common Words in Text", which did not describe this chart.
fig = px.bar(
    df_b,
    x="reviews.rating",
    y="brand",
    title="Average Rating by Brand",
    orientation='h',
    labels={
        "reviews.rating": "Average Ratings",
        "brand": "Brands",
    },
)
fig.show()

In [None]:
# Reviews whose category string starts with "Food".
# na=False treats a missing category as "not food"; without it a NaN in the
# mask makes the boolean indexing raise.
dft_food = dft[dft["categories"].str.startswith("Food", na=False)]
dft_food["brand"].nunique()

In [None]:
# Mean of the numeric columns per food brand, best-rated brands first.
# numeric_only=True is required on pandas >= 2.0, where averaging text columns
# raises a TypeError instead of silently dropping them.
dft_food_g = (
    dft_food.groupby(["brand"])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by="reviews.rating", ascending=False)
)
dft_food_g

In [None]:
# Horizontal bar chart of the average rating for each food brand.
# The title previously read "Common Words in Text", which did not describe this chart.
fig = px.bar(
    dft_food_g,
    x="reviews.rating",
    y="brand",
    title="Average Rating by Food Brand",
    orientation='h',
    labels={
        "reviews.rating": "Average Ratings",
        "brand": "Brands",
    },
)
fig.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [None]:
# Append the review title to the review text so both feed the vectorizers.
# A missing title contributes nothing; the previous astype(str) on the raw
# column turned NaN into the literal token "nan" at the end of the text.
# NOTE: this overwrites 'reviews.text', so re-running the cell appends the
# title a second time. Re-run from the data-loading cell instead.
_title = df["reviews.title"]
_title_suffix = (" " + _title.astype(str)).where(_title.notna(), "")
df["reviews.text"] = df["reviews.text"] + _title_suffix

In [None]:
# Spot-check the combined text of the row at position 7.
df["reviews.text"].iloc[7]


In [None]:
# Model inputs: the combined review text as features, the rating as the label.
text_data, target = df["reviews.text"], df["reviews.rating"]

In [None]:
# Word-level TF-IDF: unigrams only, English stop words removed, accents
# stripped, sublinear term-frequency scaling, vocabulary capped at 1000 terms.
vectorizer_word = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents="unicode",
    stop_words="english",
    ngram_range=(1, 1),
    max_features=1000,
)

# Fit on the review text and transform it in one pass. The original cell called
# an undefined name `vectorizer` here, which raised NameError on a fresh kernel.
feature_vectors_word = vectorizer_word.fit_transform(text_data)


In [None]:
# Character-level TF-IDF over 2- to 4-character n-grams, accents stripped,
# sublinear term-frequency scaling, vocabulary capped at 10000 n-grams.
vectorizer_char = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 4),
    max_features=10000,
)
feature_vectors_char = vectorizer_char.fit_transform(text_data)

# Place the word-level and character-level features side by side in one sparse matrix.
feature_vectors = hstack([feature_vectors_word, feature_vectors_char])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors, target, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator handed to the grid search below. It is not fitted here; the
# grid search fits clones of it. The seed is fixed so repeated runs build the
# same forests and report the same scores.
model = RandomForestClassifier(random_state=42)



In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter search space for the random forest.
# 3 * 2 * 6 * 10 * 8 * 3 = 8640 candidate settings; with 3 folds that is
# 25,920 model fits, so expect this cell to run for a long time.
grid_rf = {
    'n_estimators': [100, 120, 150],
    'criterion': ['entropy', 'gini'],
    'max_depth': [None, 1, 3, 5, 7, 9],
    'max_features': range(1, 11),
    'min_samples_split': range(2, 10),
    'min_samples_leaf': [1, 3, 5],
}

# Exhaustive 3-fold cross-validated search, parallelised over all cores.
gs_rf = GridSearchCV(estimator=model, param_grid=grid_rf, cv=3, n_jobs=-1)
gs_rf.fit(X_train, y_train)

# Report the best mean cross-validated score and the setting that achieved it.
print('Best accuracy: %.3f' % gs_rf.best_score_)
print('\nBest params:\n', gs_rf.best_params_)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Score the tuned model on the held-out split. `predictions` was never defined
# (the only code that produced it was commented out), so this cell raised
# NameError. GridSearchCV refits the best estimator on the full training data
# by default, so the fitted search object can predict directly.
predictions = gs_rf.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
accuracy_random_forest = accuracy_score(y_test, predictions)
print(accuracy_random_forest)