In [1]:
import pandas as pd

In [2]:
# Load 'android-reviews.csv' and expose its 'text' column under the name
# 'review'. rename() returns a new DataFrame by default, so no inplace
# argument is needed; the renamed frame is what gets bound to `review`.
review = (
    pd.read_csv('android-reviews.csv')
    .rename(columns={'text': 'review'})
)

# Preview the first rows to confirm the column now reads 'review'.
review.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [6]:
from sklearn.model_selection import train_test_split 

# Model input is the review text; the label to predict is the polarity column.
X = review['review']
y = review['polarity']

# Hold out part of the data for evaluation.
#   train_size=0.8  -> 80% of the rows go to the training set, the rest to the test set
#   random_state=1  -> fixes the shuffle so the same split is produced on every run
# The four results are, in order: training features, test features,
# training labels, test labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [15]:
# Importing necessary modules from scikit-learn

from sklearn.pipeline import Pipeline  # Module for creating a pipeline for machine learning models
from sklearn.feature_extraction.text import TfidfVectorizer  # Module for converting text data into numerical vectors
from sklearn.linear_model import LogisticRegression  # Module for logistic regression classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report  # Modules for evaluation metrics


# Two-step model: a TF-IDF vectorizer converts each review into numeric
# features, and a logistic regression classifier is trained on those features.
model_pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression()),
])

# Fit both steps on the training split.
model_pipeline_lr.fit(X_train, y_train)

# Predicted polarity for the held-out reviews.
y_pred_lr = model_pipeline_lr.predict(X_test)

# Score the predictions against the true test labels. Precision, recall and
# F1 use average='weighted'; the loop variable below stays local to the
# generator expression, so no extra name is left in the notebook namespace.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_score_lr = (
    metric(y_test, y_pred_lr, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
classification_report_lr = classification_report(y_test, y_pred_lr)

# Only the text report is printed; the scalar metrics above are kept as
# variables and are not displayed by this cell.
print(
    "Evaluation Metrics for Logistic Regression Model",
    "------------------------------------------------",
    classification_report_lr,
    sep="\n",
)


Evaluation Metrics for Logistic Regression Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.71      0.96      0.82       114
           1       0.84      0.32      0.47        65

    accuracy                           0.73       179
   macro avg       0.78      0.64      0.64       179
weighted avg       0.76      0.73      0.69       179



In [16]:
# Persist the fitted pipeline (TF-IDF vectorizer + logistic regression) to
# 'sentiment-model.pkl' using joblib. As the last expression in the cell, the
# value returned by dump() is shown as the cell output.
from joblib import dump

dump(value=model_pipeline_lr, filename='sentiment-model.pkl')

['sentiment-model.pkl']