# Explore here: classifying Play Store review polarity with Naive Bayes

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Remote CSV of Play Store reviews; kept in a constant so the source is easy to spot and swap.
PLAYSTORE_REVIEWS_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

main_dataset = pd.read_csv(PLAYSTORE_REVIEWS_URL)

# Preview the first rows to confirm the columns loaded as expected.
main_dataset.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [11]:
# Tally how many rows carry each polarity value to see how balanced the classes are.
polarity_counts = main_dataset['polarity'].value_counts()
polarity_counts

polarity
0    584
1    307
Name: count, dtype: int64

### Some common string methods

##### Splitting based on whitespace using the `.split()` method

##### Converting to lowercase using the `.lower()` method

In [36]:
# Sample sentence with deliberately irregular capitalization. Calling `.split()`
# with no arguments breaks the string on runs of whitespace and leaves the case
# of every token untouched, which is what the output below shows.
my_text = "I am typing Up sOme tExt and hOpEfUlLy this example makes sense"

my_text.split()

['I',
 'am',
 'typing',
 'Up',
 'sOme',
 'tExt',
 'and',
 'hOpEfUlLy',
 'this',
 'example',
 'makes',
 'sense']

In [37]:
# Mixed-case sample: lower-casing returns a new string with every letter
# normalized, leaving the original variable unchanged.
my_2text = "I am typing Up sOme tExt and hOpEfUlLy this example makes sense"

str.lower(my_2text)

'i am typing up some text and hopefully this example makes sense'

### Performing these string operations on a column

It is *almost* straightforward to apply these same methods to an entire Pandas column; the only change is to go through the `.str` accessor: `.split()` $\Rightarrow$ `.str.split()` and `.lower()` $\Rightarrow$ `.str.lower()`

In [44]:
# Whitespace-tokenize every review at once through the vectorized `.str` accessor;
# the result is a Series holding one list of tokens per row.
review_tokens = main_dataset['review'].str.split()
review_tokens

0      [privacy, at, least, put, some, option, appear...
1      [messenger, issues, ever, since, the, last, up...
2      [profile, any, time, my, wife, or, anybody, ha...
3      [the, new, features, suck, for, those, of, us,...
4      [forced, reload, on, uploading, pic, on, reply...
                             ...                        
886    [loved, it, i, loooooooooooooovvved, it, becau...
887    [all, time, legendary, game, the, birthday, pa...
888    [ads, are, way, to, heavy, listen, to, the, ba...
889    [fun, works, perfectly, well., ads, aren't, as...
890    [they're, everywhere, i, see, angry, birds, ev...
Name: review, Length: 891, dtype: object

In [43]:
# Lower-case every review at once through the vectorized `.str` accessor.
review_lowercased = main_dataset['review'].str.lower()
review_lowercased

0       privacy at least put some option appear offli...
1       messenger issues ever since the last update, ...
2       profile any time my wife or anybody has more ...
3       the new features suck for those of us who don...
4       forced reload on uploading pic on replying co...
                             ...                        
886     loved it i loooooooooooooovvved it because it...
887     all time legendary game the birthday party le...
888     ads are way to heavy listen to the bad review...
889     fun works perfectly well. ads aren't as annoy...
890     they're everywhere i see angry birds everywhe...
Name: review, Length: 891, dtype: object

## Let's create a Feature Matrix for word frequencies

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
# Bag-of-words vectorizer: English stop words are dropped and a term must occur
# in at least 10 reviews (min_df=10) to enter the vocabulary.
vec_model = CountVectorizer(stop_words='english', min_df=10)

# fit_transform learns the vocabulary and returns the counts as a sparse matrix;
# it is densified here so it can be inspected as a regular array.
review_counts_sparse = vec_model.fit_transform(main_dataset['review'])
X = review_counts_sparse.toarray()

In [67]:
#vec_model.get_feature_names_out()

In [69]:
# Wrap the count matrix in a DataFrame: one row per review, one column per
# vocabulary term. The feature names are passed straight to the constructor
# instead of being copied through a comprehension and assigned afterwards.
X_df = pd.DataFrame(X, columns=vec_model.get_feature_names_out())

# Show a 20-column slice of the vocabulary rather than the full wide frame.
X_df.iloc[:, 200:220]

Unnamed: 0,notification,notifications,number,offline,ok,old,ones,online,open,opera,option,options,overall,page,pages,password,pc,people,perfect,person
0,0,0,0,1,0,0,0,2,0,0,1,0,0,0,0,0,0,1,0,0
1,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
887,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
888,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
889,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [66]:
# Display the complete, untruncated text of the review stored at index label 887.
main_dataset.loc[887, 'review']

" all time legendary game the birthday party levels and short fuse levels are fantastic.especially when the pigs crash onto different chemicals is just great.suggestion to all those players who cringe about too much ads is close ur wi-fi connection and then play the game.then the ads won't trouble you."

### Let's split into training and testing data, per usual

In [70]:
# Features are the raw review strings (they get vectorized after the split);
# the target is the polarity label. Note that X is rebound here: earlier in the
# notebook it held the dense word-count matrix.
X, y = main_dataset['review'], main_dataset['polarity']

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default split is used; the fixed random_state makes the
# shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1004,
)

#### `fit_transform()` on the X_train dataset, and `transform()` on the X_test dataset

In [86]:
# Same vectorizer settings as the exploratory one: English stop words are
# removed and a term must appear in at least 10 documents to be kept.
word_vector_model = CountVectorizer(stop_words='english', min_df=10)

# Learn the vocabulary from the training reviews only, so nothing about the
# test set leaks into the features.
X_vector_train = word_vector_model.fit_transform(X_train).toarray()

# Re-use the already-fitted vocabulary on the test reviews (transform, not
# fit_transform). Densify it as well so train and test matrices share the same
# type instead of mixing a NumPy array with a SciPy sparse matrix.
X_vector_test = word_vector_model.transform(X_test).toarray()

In [87]:
from sklearn.naive_bayes import MultinomialNB

In [88]:
# Fit a multinomial Naive Bayes classifier on the training word counts;
# `fit` returns the estimator itself, so construction and fitting are chained.
nb_model = MultinomialNB().fit(X_vector_train, y_train)

# Predict on both splits so training and testing performance can be compared.
y_train_preds, y_test_preds = (
    nb_model.predict(features) for features in (X_vector_train, X_vector_test)
)

In [89]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [90]:
# Compute both scores first, then report them; a large gap between the two
# would point to overfitting.
nb_train_accuracy = accuracy_score(y_train, y_train_preds)
nb_test_accuracy = accuracy_score(y_test, y_test_preds)

print(f"Training accuracy: {nb_train_accuracy}")
print(f"Testing accuracy: {nb_test_accuracy}")

Training accuracy: 0.8547904191616766
Testing accuracy: 0.7847533632286996


In [92]:
# Confusion matrices for both splits. scikit-learn lays them out with the true
# labels on the rows and the predicted labels on the columns.
nb_train_confusion = confusion_matrix(y_train, y_train_preds)
nb_test_confusion = confusion_matrix(y_test, y_test_preds)

print("Training data:")
print(nb_train_confusion)
print("\nTesting data:")
print(nb_test_confusion)

Training data:
[[385  58]
 [ 39 186]]

Testing data:
[[115  26]
 [ 22  60]]


## Let's try a Random Forest classifier

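Because why not? A tree ensemble trained on the same word-count features gives a useful point of comparison for the Naive Bayes model above.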

In [93]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
# Random forests are stochastic (bootstrap samples and random feature subsets),
# so the seed is fixed to make the scores below reproducible on a re-run. The
# value matches the seed already used for the train/test split. max_depth=8
# caps the depth of each tree.
rfc_model = RandomForestClassifier(max_depth=8, random_state=1004)

rfc_model.fit(X_vector_train, y_train)

# Predict on both splits so the forest can be compared with Naive Bayes on
# equal terms.
rfc_trainpreds = rfc_model.predict(X_vector_train)
rfc_testpreds = rfc_model.predict(X_vector_test)

In [99]:
# Score the random forest on both splits before printing the results.
rfc_train_accuracy = accuracy_score(y_train, rfc_trainpreds)
rfc_test_accuracy = accuracy_score(y_test, rfc_testpreds)

print(f"Training accuracy: {rfc_train_accuracy}")
print(f"Testing accuracy: {rfc_test_accuracy}")

Training accuracy: 0.7919161676646707
Testing accuracy: 0.7533632286995515
