In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
data = pd.read_csv('Datasets/google_play_store_apps_reviews_training.csv')
print(data.head())

          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  


In [2]:
#Pre-process Data
def preprocess_data(data):
    # Remove package name as it's not relevant
    data = data.drop('package_name', axis=1)
    # Convert text to lowercase
    data['review'] = data['review'].str.strip().str.lower()
    return data
data = preprocess_data(data)


x = data['review']
y = data['polarity']

In [3]:
#Vectorize text reviews to numbers
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(x).toarray()
# Split into training and testing data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=42)

In [4]:
#Model Generation
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
print(model.fit(x, y))
y_pred =model.predict(x_test)
model.score(x_test, y_test)

MultinomialNB()


0.9327354260089686

In [5]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
model.predict(vec.transform(['Love this app simply awesome!']))

[[152   6]
 [  9  56]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       158
           1       0.90      0.86      0.88        65

    accuracy                           0.93       223
   macro avg       0.92      0.91      0.92       223
weighted avg       0.93      0.93      0.93       223



array([1], dtype=int64)

The confusion matrix and classification report generated by the code indicate that the model has a high overall accuracy of 93%, with an accuracy of 94% for negative reviews (polarity 0) and 90% for positive reviews (polarity 1).

The confusion matrix shows that out of the 158 negative reviews in the test set, the model correctly predicted 152 as negative and incorrectly predicted 6 as positive. Out of the 65 positive reviews in the test set, the model correctly predicted 56 as positive and incorrectly predicted 9 as negative.

The precision for negative reviews is 94% and for positive reviews is 90%, which means that when the model predicts a review as negative or positive, it is correct 94% and 90% of the time, respectively. 

The recall for negative reviews is 96% and for positive reviews is 86%, which means that the model correctly identifies 96% and 86% of the negative and positive reviews, respectively. 

The F1-score for negative reviews is 95% and for positive reviews is 88%, which is a weighted harmonic mean of precision and recall

In [6]:
#Predict for Custom Input:
def expression_check(prediction_input):
    if prediction_input == 0:
        print("Input statement has Negative Sentiment.")
    elif prediction_input == 1:
        print("Input statement has Positive Sentiment.")
    else:
        print("Invalid Statement.")


In [7]:
# function to take the input statement and perform the same transformations we did earlier
def sentiment_predictor(input):
    transformed_input = vec.transform(input)
    prediction = model.predict(transformed_input)
    expression_check(prediction)

input1 = ["Worst support. I have given 2 stars because changes to be done."]
input2 = ["Its working not nicely."]


sentiment_predictor(input1)
sentiment_predictor(input2)


Input statement has Negative Sentiment.
Input statement has Positive Sentiment.
