In [1]:
#Import libraries
# import libraries
import pandas as pd
import numpy as np
import nltk
import google_play_scraper
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from google_play_scraper import app, reviews_all

In [2]:
#Collect Safaricom reviews
#Name of app package
app_name = 'com.safaricom.mysafaricom'
#Obtain app details
details_app = app(app_name,
                  lang='en',#Language
                  country='ke')#The country of interest
print(details_app)

{'title': 'MySafaricom', 'description': 'MySafaricom provides an easy and efficient way for customers to access Safaricom products and services under one roof. The app is available on both iOS and Android\r\n \r\nWith the MySafaricom App you can: \r\n \r\nM-PESA SEND MONEY- Select your contacts directly from your phone book, Hakikisha the name of the person you are transferring money to & check the transaction charges before hitting the send button.\r\n \r\nLIPA NA M-PESA – Shop and pay your bills with ease at your favorite stores \r\n \r\nTOP-UP - Recharge your number and your loved ones by selecting their number directly from your phone book.\r\n \r\nVIEW BALANCES - View your account balances for Data, Bonga, Airtime or Credit limit\r\n \r\nHOME FIBRE - Safaricom Home Fibre customers will be able to pay or change their Home Fibre plans', 'descriptionHTML': 'MySafaricom provides an easy and efficient way for customers to access Safaricom products and services under one roof. The app i

In [3]:
#Obtain all the reviews of the app
from google_play_scraper import Sort, reviews_all #Loading the necessary library
app_reviews = reviews_all(app_name,
                          sleep_milliseconds=0,
                          lang='en',#The language defaults to English
                          country='ke',#The country is Kenya
                          sort=Sort.NEWEST,#Obtain the newest reviews
                          filter_score_with=5)#Brings all scores


In [4]:
#Print the first 5
for review in app_reviews[:5]:
    print(review)

{'reviewId': '96e89d11-b1cc-4d2d-a263-4762a0f4e3d6', 'userName': 'A Google user', 'userImage': 'https://play-lh.googleusercontent.com/EGemoI2NTXmTsBVtJqk8jxF9rh8ApRWfsIMQSt2uE4OcpQqbFu7f7NbTK05lx80nuSijCz7sc3a277R67g', 'content': 'wow..surely this up sia very credible very important to use the efficiency of bundles and credit offers ,bundles are not expiring', 'score': 5, 'thumbsUpCount': 0, 'reviewCreatedVersion': '1.24.0.3', 'at': datetime.datetime(2024, 8, 18, 0, 44, 18), 'replyContent': None, 'repliedAt': None, 'appVersion': '1.24.0.3'}
{'reviewId': 'ada89e46-c791-4c34-87b0-beb37c448027', 'userName': 'A Google user', 'userImage': 'https://play-lh.googleusercontent.com/EGemoI2NTXmTsBVtJqk8jxF9rh8ApRWfsIMQSt2uE4OcpQqbFu7f7NbTK05lx80nuSijCz7sc3a277R67g', 'content': 'best ever', 'score': 5, 'thumbsUpCount': 0, 'reviewCreatedVersion': '1.24.0.3', 'at': datetime.datetime(2024, 8, 17, 22, 15, 14), 'replyContent': None, 'repliedAt': None, 'appVersion': '1.24.0.3'}
{'reviewId': 'e310a0b3-de

In [5]:
# Creating a function that extracts the review contents and then putting it into a dataframce
if isinstance(app_reviews, list) and all(isinstance(review, dict) for review in app_reviews):
    # Extract review texts
    review_texts = [review.get('content', '') for review in app_reviews]
    
    # Create a DataFrame with review texts
    saf_reviews = pd.DataFrame({
        'review': review_texts
    })
else:
    print("Unexpected data structure received from the reviews function.")

# Display the DataFrame
print(saf_reviews.head())

                                              review
0  wow..surely this up sia very credible very imp...
1                                          best ever
2                                         convenient
3                                               cool
4                                            mwangah


In [6]:
#Checking the shape of the data
saf_reviews.shape

(31042, 1)

In [7]:
#Proceed to Sentiment analysis
#Using the below preprocessing steps
#Data Preprocessing
# create preprocess_text function
def preprocess_of_text(text):

    # Tokenize the text

    tokens = word_tokenize(text.lower())




    # Remove stop words

    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]




    # Lemmatize the tokens

    lemmatizer = WordNetLemmatizer()

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]




    # Join the tokens back into a string

    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# apply the function to the IMDB Dataset

saf_reviews['review'] = saf_reviews['review'].apply(preprocess_of_text)
saf_reviews.head()

Unnamed: 0,review
0,wow .. surely sia credible important use effic...
1,best ever
2,convenient
3,cool
4,mwangah


In [10]:
#We now use an NLTK Sentiment analyzer as follows
# initialize NLTK sentiment analyzer

analyzer = SentimentIntensityAnalyzer()


# create get_sentiment function

def get_the_sentiment(text):

    scores = analyzer.polarity_scores(text)

    sentiment = 'positive' if scores['pos'] >0 else 'negative'

    return sentiment

# apply get_sentiment function

saf_reviews['sentiment'] = saf_reviews['review'].apply(get_the_sentiment)

saf_reviews.head(20)

Unnamed: 0,review,sentiment
0,wow .. surely sia credible important use effic...,positive
1,best ever,positive
2,convenient,negative
3,cool,positive
4,mwangah,negative
5,"app good use , ,",positive
6,excellent,positive
7,efficient,positive
8,awesome,positive
9,app really great.love it❤❤❤,negative


In [11]:
#Load libraries for Bag of words model and classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [12]:
#View the data
new_data = saf_reviews['review']
new_data.head()

0    wow .. surely sia credible important use effic...
1                                            best ever
2                                           convenient
3                                                 cool
4                                              mwangah
Name: review, dtype: object

In [13]:
#Building a Bag of Words Model using already loaded vectorizer
import joblib
import pickle
# Load the pre-trained vectorizer from the file
with open('tfidf1_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vec1 = pickle.load(vectorizer_file)



In [14]:
# Transform the preprocessed new data using the loaded vectorizer
# Transform the new data using the loaded vectorizer
new_data_vectorized = tfidf_vec1.transform(new_data)
new_data_vectorized
print(new_data_vectorized.shape) 

(31042, 3614)


In [15]:
# View the transformed data (sparse matrix)
print(new_data_vectorized)

  (0, 3570)	0.29267789781119585
  (0, 3410)	0.2042877388411633
  (0, 3138)	0.3505616071884366
  (0, 2174)	0.2611075835091962
  (0, 1600)	0.2650592805685046
  (0, 746)	0.2533146738671092
  (0, 427)	0.7396693930936573
  (1, 1090)	0.748418795987716
  (1, 315)	0.6632264363038446
  (2, 693)	1.0
  (3, 701)	1.0
  (5, 3410)	0.713592390485504
  (5, 1392)	0.49167617482314696
  (5, 166)	0.49903951682453157
  (6, 1108)	1.0
  (7, 1008)	1.0
  (8, 248)	1.0
  (9, 2559)	0.38454869936673786
  (9, 1894)	0.34643355136927667
  (9, 1709)	0.7074106353699188
  (9, 1411)	0.3421677290657771
  (9, 166)	0.3385225700784481
  (10, 701)	1.0
  (11, 315)	0.7824420117514227
  (11, 166)	0.6227234524621555
  :	:
  (31035, 1392)	0.576768647348741
  (31035, 166)	0.5854063341506943
  (31036, 3445)	0.309158190516051
  (31036, 3032)	0.28768654995309534
  (31036, 2185)	0.34330743158091853
  (31036, 2072)	0.27024784324846146
  (31036, 1909)	0.265775128310479
  (31036, 1375)	0.4061185470129078
  (31036, 836)	0.5121646394813112
 

In [16]:
# Load the saved model 
model_path = 'modelslg1/model_lg1.pkl'
loaded_model = joblib.load(model_path)
print(loaded_model)

LogisticRegression(max_iter=1000)


In [17]:
# Make predictions using the loaded model
predictions = loaded_model.predict(new_data_vectorized)
predictions

array(['Positive', 'Positive', 'Neutral', ..., 'Positive', 'Positive',
       'Positive'], dtype=object)

In [19]:
# Map predictions to labels
def map_sentiment(label):
    return "Negative" if label == 0 else "Positive"

saf_reviews['sentiment'] = [map_sentiment(pred) for pred in predictions]

# Print the DataFrame to view the reviews and their sentiments
print(saf_reviews[['review', 'sentiment']])

                                                  review sentiment
0      wow .. surely sia credible important use effic...  Positive
1                                              best ever  Positive
2                                             convenient  Positive
3                                                   cool  Positive
4                                                mwangah  Positive
...                                                  ...       ...
31037          's better version compared previous one .  Positive
31038                                 's good app like .  Positive
31039                                      really nice .  Positive
31040                                               good  Positive
31041                                               good  Positive

[31042 rows x 2 columns]


In [20]:
#Saving it to a CSV File
saf_reviews.to_csv('Safaricomreviewsfinal.csv', index=False)