In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score



# Load the Play Store reviews dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — consider resolving
# it relative to a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv('/workspaces/machine-learning-python-template-ds-2023/Ryan/raw/playstore.csv')

# Summarise columns, non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare: wrapping it in print() emitted a stray "None" line after the report.
df.info()

# Preview the first rows
print(df.head())

# Report how many rows are exact duplicates, show them, then drop them.
# The mask is computed once and reused for both the count and the listing.
duplicate_mask = df.duplicated()
print(f'''Duplicated: {duplicate_mask.sum()}''')
duplicates = df[duplicate_mask]
print(duplicates)
df.drop_duplicates(inplace=True)

# Normalise the review text: trim surrounding whitespace and lower-case it
df["review"] = df["review"].str.strip().str.lower()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['polarity'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words encoding with English stop words removed: the vocabulary is
# fitted on the training reviews only and then reused to encode the test reviews
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Replace the raw text splits with their dense count arrays
X_train, X_test = train_counts.toarray(), test_counts.toarray()

# Gaussian Naive Bayes on the vectorised reviews.
# Labels are passed via to_numpy(): Series.ravel() is deprecated since
# pandas 2.2 (a Series is already one-dimensional) and emits a FutureWarning.
model = GaussianNB()
model.fit(X_train, y_train.to_numpy())

# Predict labels for both splits
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Accuracy on the training split first, then on the test split
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

# Multinomial Naive Bayes on the vectorised reviews.
# Labels are passed via to_numpy(): Series.ravel() is deprecated since
# pandas 2.2 (a Series is already one-dimensional) and emits a FutureWarning.
model2 = MultinomialNB()
model2.fit(X_train, y_train.to_numpy())

# Predict labels for both splits
y_train_pred2 = model2.predict(X_train)
y_test_pred2 = model2.predict(X_test)

# Accuracy on the training split first, then on the test split
print(accuracy_score(y_train, y_train_pred2))
print(accuracy_score(y_test, y_test_pred2))

# Bernoulli Naive Bayes on the vectorised reviews.
# Labels are passed via to_numpy(): Series.ravel() is deprecated since
# pandas 2.2 (a Series is already one-dimensional) and emits a FutureWarning.
model3 = BernoulliNB()
model3.fit(X_train, y_train.to_numpy())

# Predict labels for both splits
y_train_pred3 = model3.predict(X_train)
y_test_pred3 = model3.predict(X_test)

# Accuracy on the training split first, then on the test split
print(accuracy_score(y_train, y_train_pred3))
print(accuracy_score(y_test, y_test_pred3))



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB
None
          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  
Duplicated: 0
Empty DataFrame
Columns: [package_name, review, polarity]
Index: []
0