In [1]:
! pip install textblob



In [2]:
!pip install pyspellchecker



In [3]:
!pip install vaderSentiment



In [4]:
!pip install wordninja



In [5]:
!pip install emoji



In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

import string
import re
import emoji
import wordninja
import os
import sys
import ast

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from spellchecker import SpellChecker
from collections import Counter
from IPython.display import display, HTML
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.graph_objects as go

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Load the raw review export; low_memory=False makes pandas read the file in
# one pass instead of in chunks.
reviews_csv_path = './tiktok_google_play_reviews.csv'
tt = pd.read_csv(reviews_csv_path, low_memory=False)

In [8]:
# Map the export's column names to the shorter names used in the rest of the notebook.
column_renames = {
    'content': 'reviews',
    'score': 'rating',
    'thumbsUpCount': 'likes',
    'reviewCreatedVersion': 'appversion',
    'at': 'timestamp',
}
tt = tt.rename(columns=column_renames)

In [9]:
# Remove the reviewer-identity and developer-reply columns, then show what is left.
unused_columns = ['reviewId' ,'userName','userImage','replyContent','repliedAt']
tt = tt.drop(columns=unused_columns)
print(tt.columns.values)

['reviews' 'rating' 'likes' 'appversion' 'timestamp']


In [10]:
# Convert the timestamp column to datetimes. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
parsed_timestamps = pd.to_datetime(tt['timestamp'], errors='coerce', dayfirst=True)
tt['timestamp'] = parsed_timestamps
tt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460287 entries, 0 to 460286
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   reviews     460256 non-null  object        
 1   rating      460287 non-null  int64         
 2   likes       460287 non-null  int64         
 3   appversion  333953 non-null  object        
 4   timestamp   460287 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 17.6+ MB


In [11]:
# Per-column null counts before any rows are removed.
missing_before = tt.isna().sum()
print("Missing values before removal:", missing_before, sep="\n")

# A row without review text is unusable, so drop it.
tt = tt.dropna(subset=['reviews'])

# Per-column null counts once those rows are gone.
missing_values = tt.isna().sum()
print("\nMissing values after removal:", missing_values, sep="\n")

Missing values before removal:
reviews           31
rating             0
likes              0
appversion    126334
timestamp          0
dtype: int64

Missing values after removal:
reviews            0
rating             0
likes              0
appversion    126325
timestamp          0
dtype: int64


In [None]:
def replace_emojis_with_text(text):
    """Replace emojis in ``text`` with words and count the emojis found.

    The text is first passed through ``emoji.demojize`` and the whole result
    (not only the emoji names) is then re-segmented with ``wordninja.split``
    and joined with single spaces.

    Args:
        text: The raw review string.

    Returns:
        tuple: ``(split_text, emoji_count)`` where ``split_text`` is the
        converted string and ``emoji_count`` is the number of emojis in the
        original ``text``.
    """
    demojized_text = emoji.demojize(text)

    split_text = ' '.join(wordninja.split(demojized_text))

    # Count whole emojis rather than individual code points: a per-character
    # membership test miscounts emojis made of several code points (ZWJ
    # sequences, skin-tone variants, flags, keycaps).
    emoji_count = emoji.emoji_count(text)

    return split_text, emoji_count

# Run the emoji conversion once per review; each result is a (text, count) pair.
converted_pairs = tt['reviews'].apply(replace_emojis_with_text)

# Keep the untouched text next to the converted text for the preview below.
tt['reviews_original'] = tt['reviews']
tt['reviews'] = [pair[0] for pair in converted_pairs]
tt['emoji_count'] = [pair[1] for pair in converted_pairs]

total_emojis_replaced = tt['emoji_count'].sum()
print(f"Total emojis replaced: {total_emojis_replaced}")

print(tt[['reviews_original', 'reviews']].tail())

# The helper columns were only needed for the report above.
tt.drop(columns=['emoji_count', 'reviews_original'], inplace=True)

In [None]:
# Lower-case every review, then delete every character that is neither an
# ASCII letter nor whitespace.
tt['reviews'] = (
    tt['reviews']
    .str.lower()
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)

In [None]:
# The import cell downloads 'punkt' and 'wordnet' but not the 'stopwords'
# corpus, so fetch it on demand instead of failing with LookupError on a
# machine that does not have it yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenized with ``word_tokenize``; tokens whose lower-cased
    form is in ``stop_words`` are dropped and the rest are joined with single
    spaces.

    Args:
        text: The review string to filter.

    Returns:
        The filtered string. If tokenizing or filtering raises, the error is
        printed and ``text`` is returned unchanged (best-effort cleaning).
    """
    try:
        words = word_tokenize(text)
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return ' '.join(filtered_words)
    except Exception as e:
        # Deliberately best-effort: report the offending text and keep it as is.
        print(f"Error cleaning text: {text} - {e}")
        return text
        
# Stopword-free version of every review, kept in its own column.
tt['reviews_nostopwords'] = tt['reviews'].map(remove_stopwords)

# Before/after preview of the first rows.
preview_columns = ['reviews', 'reviews_nostopwords']
print(tt.loc[:, preview_columns].head())

In [None]:
# One shared lemmatizer instance, reused for every review.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every token of ``text`` and join the results with spaces.

    Args:
        text: The review string to lemmatize.

    Returns:
        The lemmatized string. If tokenizing or lemmatizing raises, the error
        is printed and ``text`` is returned unchanged.
    """
    try:
        lemmas = []
        for token in word_tokenize(text):
            lemmas.append(lemmatizer.lemmatize(token))
    except Exception as e:
        print(f"Error lemmatizing text: {text} - {e}")
        return text
    return ' '.join(lemmas)
        
# Lemmatize the stopword-free text into a new column.
tt['reviews_lemmatized'] = tt['reviews_nostopwords'].map(lemmatize_text)

# Before/after preview, then discard the intermediate column.
preview_columns = ['reviews_nostopwords', 'reviews_lemmatized']
print(tt.loc[:, preview_columns].head())
tt = tt.drop(columns=['reviews_nostopwords'])

In [None]:
from spellchecker import SpellChecker

# One shared spell checker instance, reused for every review.
spell = SpellChecker()


def remove_misspelled_words(review):
    """Drop the words of ``review`` that the spell checker does not know.

    Args:
        review: A whitespace-separated review string.

    Returns:
        tuple: ``(cleaned_review, misspelled)`` where ``cleaned_review`` is the
        review without the unknown words and ``misspelled`` is the collection
        returned by ``spell.unknown`` for this review.
    """
    tokens = review.split()
    unknown_words = spell.unknown(tokens)
    kept_tokens = filter(lambda token: token not in unknown_words, tokens)
    return ' '.join(kept_tokens), unknown_words

# Spell-filter every lemmatized review; each result is a (cleaned, unknown) pair.
spell_results = tt['reviews_lemmatized'].apply(remove_misspelled_words)
tt['reviews_no_misspelled_words'] = [cleaned for cleaned, _ in spell_results]
tt['misspelled_words'] = [unknown for _, unknown in spell_results]

# Rows in which at least one word was flagged.
unknown_counts = tt['misspelled_words'].map(len)
misspelled_rows = tt[unknown_counts > 0]

total_removed = unknown_counts.sum()

print(f"Total words removed: {total_removed}")

# Flatten the per-review collections and keep the first 100 entries as a sample.
all_misspelled = []
for review_unknowns in tt['misspelled_words']:
    all_misspelled.extend(review_unknowns)
first_100_misspelled = all_misspelled[:100]

print("\nSummary of the first 100 removed misspelled words:")
print(set(first_100_misspelled))  # Remove duplicates

# The intermediate columns are not needed any more.
tt.drop(columns=['reviews_lemmatized', 'misspelled_words'], inplace=True)

In [None]:
# Words and phrases that are stripped from every review before sentiment scoring.
redundant_phrases = [
    'tiktok',
    'video',
    'app',
    'content',
    'feature',
    'update',
    'platform',
    'post',
    'social media',
    'service',
    'face',
    'account',
    'download',
    'follower',
]

def remove_redundant_words(text, redundant_phrases):
    """Remove every whole-word occurrence of the given phrases from ``text``.

    Matching is case-insensitive and anchored on word boundaries, so 'app' is
    removed but 'apple' is kept. After removal, runs of whitespace are
    collapsed to one space and the ends are trimmed, so a review made up only
    of redundant phrases becomes the empty string instead of leftover spaces.

    Args:
        text: The review string to clean.
        redundant_phrases: Iterable of literal words/phrases to remove.

    Returns:
        The cleaned string, with normalized whitespace.
    """
    for phrase in redundant_phrases:
        text = re.sub(r'\b' + re.escape(phrase) + r'\b', '', text, flags=re.IGNORECASE)
    # Removing a phrase leaves its surrounding spaces behind; normalize them so
    # later empty-string checks can recognise reviews with nothing left.
    return re.sub(r'\s+', ' ', text).strip()
    
# Strip the redundant phrases from every spell-filtered review.
tt['reviews_cleaned'] = tt['reviews_no_misspelled_words'].apply(
    remove_redundant_words, args=(redundant_phrases,)
)
tt = tt.drop(columns=['reviews_no_misspelled_words'])

In [None]:
# A review counts as empty when nothing but whitespace is left. Comparing with
# '' alone misses reviews such as ' ' that remain after words were removed.
is_blank = tt['reviews_cleaned'].str.strip() == ''
empty_string_rows = tt[is_blank]

print(f"Number of rows with empty strings: {empty_string_rows.shape[0]}")

# Mark blank reviews as missing so they can be removed with dropna().
tt['reviews_cleaned'] = tt['reviews_cleaned'].mask(is_blank, np.nan)

In [None]:
# Discard rows whose cleaned review is missing, then report the remaining
# shape and the per-column null counts.
tt = tt.dropna(subset=['reviews_cleaned'])
missing_values = tt.isna().sum()

print(tt.shape, missing_values, sep="\n")

In [None]:
# Token list for every cleaned review.
tt['reviews_tokenize'] = tt['reviews_cleaned'].map(word_tokenize)

In [None]:
# Sentiment scorer used to label the cleaned reviews.
analyzer = SentimentIntensityAnalyzer()

# Plot styling; the palette is read only after the style has been applied.
plt.style.use('fivethirtyeight')
cp = sns.color_palette()

In [None]:
# One polarity-score dict per cleaned review, in row order.
emptyline = [analyzer.polarity_scores(review) for review in tt['reviews_cleaned']]

# One row per review, one column per score key.
tt_sentiments = pd.DataFrame(emptyline)
tt_sentiments.head()

In [None]:
# tt_sentiments has a fresh 0..n-1 index, so give the reviews the same index
# before placing the two frames side by side.
reviews_reindexed = tt.reset_index(drop=True)
tt_merged = pd.concat([reviews_reindexed, tt_sentiments], axis="columns")
tt_merged.head()

In [None]:
# Only the compound score is used from here on; drop the three component scores.
tt_merged = tt_merged.drop(columns=['neg','neu','pos'])

In [None]:
# Label each review from its compound score: >= 0.35 is Positive, <= -0.35 is
# Negative, everything in between is Neutral.
compound_score = tt_merged['compound']
tt_merged['Sentiment'] = np.select(
    [compound_score >= 0.35, compound_score <= -0.35],
    ['Positive', 'Negative'],
    default='Neutral',
)

In [None]:
# Line chart: number of reviews per sentiment for each hour of the day

# Extract the hour from the timestamp
tt_merged['hour'] = tt_merged['timestamp'].dt.hour

# Count reviews per (hour, sentiment). fill_value=0 makes an hour with no
# reviews of some sentiment show up as 0 instead of NaN, which would otherwise
# leave a gap in that line.
sentiment_over_time_of_day = (
    tt_merged.groupby(['hour', 'Sentiment']).size().unstack(fill_value=0)
)

sentiment_over_time_of_day.plot(kind='line', figsize=(10, 6), title="Sentiment Distribution by Hour of Day")
plt.xlabel('Hour of the Day')
plt.ylabel('Review Count')
plt.show()

In [None]:
# Drop columns deemed as unnecessary.
# errors='ignore' keeps this cell runnable on a fresh kernel: 'year_month' is
# not created by any cell shown in this notebook, so a strict drop would raise
# KeyError (and so would re-running this cell after it has already succeeded).
tt_merged = tt_merged.drop(
    columns=['appversion','likes','year_month','compound','timestamp'],
    errors='ignore',
)
tt_merged.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, f1_score

In [None]:
# Work on an independent (deep) copy so later changes to df do not touch tt_merged.
df = tt_merged.copy(deep=True)
df.head()

In [None]:
# Integer codes for the sentiment labels. The keys must match the labels
# written to the 'Sentiment' column exactly ('Positive' / 'Neutral' /
# 'Negative'); lowercase keys map every row to NaN.
sentiment_mapping = {'Positive': 1, 'Neutral': 0, 'Negative': 2}
df['Sentiment_encoded'] = df['Sentiment'].map(sentiment_mapping)

# Fail loudly if a label has no code instead of passing NaN targets downstream.
unmapped_labels = df.loc[df['Sentiment_encoded'].isna(), 'Sentiment'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Sentiment labels without an encoding: {list(unmapped_labels)}")

In [None]:
# 1. Numeric features: fill gaps with 0 and min-max scale each column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/val/test
# split below — confirm that is acceptable for these two columns.
numeric_features = df.loc[:, ['rating', 'hour']].fillna(0)
numeric_scaler = MinMaxScaler()
scaled_numeric = numeric_scaler.fit_transform(numeric_features)
# Keep df's index so rows can be looked up by split index later on.
scaled_numeric_df = pd.DataFrame(data=scaled_numeric, index=numeric_features.index)

# 2. Stratified 70 / 15 / 15 split into train, validation and test sets
text_columns = df[['reviews_cleaned', 'reviews_tokenize']]
encoded_labels = df['Sentiment_encoded']

# First cut: 70% train, 30% held back.
X_train, X_remaining, y_train, y_remaining = train_test_split(
    text_columns, encoded_labels,
    test_size=0.3, random_state=42, stratify=encoded_labels,
)

# Second cut: halve the held-back part into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_remaining, y_remaining,
    test_size=0.5, random_state=42, stratify=y_remaining,
)

# 3. Undersample the positive class in the training set
positive_class = X_train[y_train == 1]
neutral_class = X_train[y_train == 0]
negative_class = X_train[y_train == 2]

# Keep a random half of the positive rows; the other classes are kept in full.
positive_class_undersampled = positive_class.sample(frac=0.5, random_state=42)
X_train_balanced = pd.concat([positive_class_undersampled, neutral_class, negative_class])

# Shuffle the balanced training set. The labels are looked up by the shuffled
# index BEFORE either index is reset; resetting y without applying the same
# permutation would pair every row with the wrong label.
X_train_balanced = X_train_balanced.sample(frac=1, random_state=42)
y_train_balanced = y_train.loc[X_train_balanced.index]

X_train_balanced = X_train_balanced.reset_index(drop=True)
y_train_balanced = y_train_balanced.reset_index(drop=True)

# NOTE(review): the feature-building steps that follow in this notebook use
# X_train / y_train, not the balanced versions — confirm whether the
# undersampled set is meant to be used for training.

# 4. Display class distributions in datasets
def display_class_distribution(y, dataset_name):
    """Print the count and percentage of every class in ``y``.

    Args:
        y: A pandas Series of class labels.
        dataset_name: Name inserted into the printed heading.

    Returns:
        None. The table is printed, not returned.
    """
    counts = y.value_counts()
    percentages = counts / counts.sum() * 100
    distribution = pd.DataFrame({'Count': counts, 'Percentage': percentages})
    print(f"\nClass distribution in the {dataset_name} set:")
    print(distribution)

# Label balance of every split.
for split_labels, split_name in (
    (y_train_balanced, "training set after undersampling"),
    (y_val, "validation set"),
    (y_test, "testing set"),
):
    display_class_distribution(split_labels, split_name)

# Split ratio calculation: share of all rows that ended up in each split.
total_samples = len(df)
train_ratio, val_ratio, test_ratio = (
    len(split) / total_samples * 100 for split in (X_train, X_val, X_test)
)

print(f"\nTraining set ratio: {train_ratio:.2f}%")
print(f"Validation set ratio: {val_ratio:.2f}%")
print(f"Testing set ratio: {test_ratio:.2f}%")

# 5. Align numeric features with the split indices and convert to sparse matrices
def get_scaled_sparse_features(X, index):
    """Return the scaled numeric rows selected by ``index`` as a CSR matrix.

    Args:
        X: Not used by the function body; kept for the existing call sites.
        index: Index labels to look up in ``scaled_numeric_df``.

    Returns:
        A ``csr_matrix`` holding the selected rows.
    """
    selected_rows = scaled_numeric_df.loc[index]
    return csr_matrix(selected_rows.to_numpy())

scaled_train_sparse, scaled_val_sparse, scaled_test_sparse = (
    get_scaled_sparse_features(split, split.index)
    for split in (X_train, X_val, X_test)
)

# 6. TF-IDF vectorization: vocabulary capped at 5000 terms and learned from
# the training text only; validation and test text are transformed with it.
vectorizer = TfidfVectorizer(max_features=5000)
train_text, val_text, test_text = (
    split['reviews_cleaned'] for split in (X_train, X_val, X_test)
)
tfidf_matrix_train = vectorizer.fit_transform(train_text)
tfidf_matrix_val = vectorizer.transform(val_text)
tfidf_matrix_test = vectorizer.transform(test_text)

# 7. Combine TF-IDF and numeric features
# Request CSR explicitly: with the default format, hstack can return a COO
# matrix (depending on the SciPy version), and COO does not support the row
# indexing used by the cross-validation loop (X[train_index]).
X_train_tfidf = hstack([tfidf_matrix_train, scaled_train_sparse], format='csr')
X_val_tfidf = hstack([tfidf_matrix_val, scaled_val_sparse], format='csr')
X_test_tfidf = hstack([tfidf_matrix_test, scaled_test_sparse], format='csr')

# 8. Validation: Check the shapes and density
print(f"\nTrain shape: {X_train_tfidf.shape}")
print(f"Test shape: {X_test_tfidf.shape}")
print(f"Validation shape: {X_val_tfidf.shape}")

# Density = stored entries divided by the total number of cells.
train_rows, train_cols = X_train_tfidf.shape
train_density = X_train_tfidf.nnz / (train_rows * train_cols)
print(f"Train matrix density: {train_density:.10f}")

In [None]:
def collect_metrics(y_true, y_pred, y_prob, model_name):
    """Compute the evaluation metrics for one set of predictions.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Predicted class probabilities, passed to ``log_loss``.
        model_name: Name stored under the "Model" key.

    Returns:
        dict with the keys "Model", "Accuracy", "F1-Score (Weighted)",
        "Log-Loss", "Precision" and "Recall"; precision and recall are the
        'weighted avg' entries of the classification report.
    """
    results = {"Model": model_name}
    results["Accuracy"] = accuracy_score(y_true, y_pred)
    results["F1-Score (Weighted)"] = f1_score(y_true, y_pred, average='weighted')
    results["Log-Loss"] = log_loss(y_true, y_prob)

    weighted_avg = classification_report(y_true, y_pred, output_dict=True)['weighted avg']
    results["Precision"] = weighted_avg['precision']
    results["Recall"] = weighted_avg['recall']
    return results

def calculate_averages(metrics_dict):
    """Return the mean of each metric's list of per-fold values.

    Args:
        metrics_dict: Mapping of metric name to a list of numbers.

    Returns:
        dict mapping each metric name to ``np.mean`` of its values.
    """
    averages = {}
    for metric_name, fold_values in metrics_dict.items():
        averages[metric_name] = np.mean(fold_values)
    return averages
    
def train_and_evaluate_model(model, model_name, kf, X, y):
    """Cross-validate ``model`` and return the per-metric averages.

    For every fold produced by ``kf.split(X, y)`` the model is fitted on the
    training part, evaluated on the validation part, and the fold's metrics are
    printed and collected.

    Args:
        model: Estimator with ``fit`` and ``predict`` (and optionally
            ``predict_proba``). It is refitted in place on every fold.
        model_name: Label used in the printed output and in the metrics.
        kf: Splitter object providing ``split(X, y)`` and ``get_n_splits()``.
        X: Feature matrix indexable by integer position arrays. Sparse
            matrices in any format are accepted; they are converted to CSR.
        y: Labels as a pandas Series (indexed with ``.iloc``).

    Returns:
        dict mapping "Accuracy", "F1-Score (Weighted)", "Log-Loss",
        "Precision" and "Recall" to their mean over all folds.
    """
    # Some sparse formats (e.g. COO, which scipy.sparse.hstack can return) do
    # not support X[row_indices]; CSR does. For CSR input this is a no-op, and
    # dense arrays have no tocsr() and are left untouched.
    if hasattr(X, "tocsr"):
        X = X.tocsr()

    metrics_dict = {
        "Accuracy": [],
        "F1-Score (Weighted)": [],
        "Log-Loss": [],
        "Precision": [],
        "Recall": []
    }

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f"Fold {fold + 1}/{kf.get_n_splits()} - {model_name}")

        # Split data into train and validation sets
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict on the validation set
        y_pred = model.predict(X_val_fold)

        # Get probabilities if the model supports predict_proba
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_val_fold)
        else:
            # Use calibrated probabilities for models like SVM
            # NOTE(review): the calibrator is fitted on the same fold the
            # model was trained on — confirm this is intended.
            calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
            calibrated_model.fit(X_train_fold, y_train_fold)
            y_prob = calibrated_model.predict_proba(X_val_fold)

        # Collect metrics
        metrics = collect_metrics(y_val_fold, y_pred, y_prob, model_name)

        # Append metrics for this fold (the "Model" entry is skipped because it
        # is not a key of metrics_dict)
        for metric, value in metrics.items():
            if metric in metrics_dict:
                metrics_dict[metric].append(value)

        # Print metrics for this fold
        print(f"{model_name} Accuracy for fold {fold + 1}: {metrics['Accuracy']}")
        print(f"{model_name} F1-Score for fold {fold + 1}: {metrics['F1-Score (Weighted)']}")
        print(f"{model_name} Log-Loss for fold {fold + 1}: {metrics['Log-Loss']}")
        print(f"{model_name} Precision for fold {fold + 1}: {metrics['Precision']}")
        print(f"{model_name} Recall for fold {fold + 1}: {metrics['Recall']}")
        print("-" * 50)

    averages = calculate_averages(metrics_dict)
    return averages

In [None]:
# These two names are used below but are not imported by any earlier cell, so
# the cell raised NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified CV with a fixed seed so the folds are reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Models to evaluate, keyed by display name, with their chosen hyper-parameters.
models_with_best_params = {
    'Logistic Regression': LogisticRegression(
        C=16.671739493308188, 
        max_iter=290, 
        penalty='l2', 
        solver='lbfgs', 
        random_state=42
    )
}

# Averaged cross-validation metrics per model name.
model_results = {}

for model_name, model in models_with_best_params.items():
    
    model_results[model_name] = train_and_evaluate_model(model, model_name, kf, X_train_tfidf, y_train)
    print("-" * 50)

In [None]:
# precision_score / recall_score are used below but are not imported by any
# earlier cell; clone() gives an unfitted copy of the configured estimator.
from sklearn.base import clone
from sklearn.metrics import precision_score, recall_score

# After cross-validation the estimator stored in models_with_best_params holds
# whatever the LAST fold fitted (a subset of the training data). Fit a fresh
# copy on the full training set so the test metrics describe the final model.
log_reg_model = clone(models_with_best_params['Logistic Regression'])
log_reg_model.fit(X_train_tfidf, y_train)

y_pred = log_reg_model.predict(X_test_tfidf)
y_prob = log_reg_model.predict_proba(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
# Pass the model's class order explicitly so the probability columns are
# matched to the right labels.
log_loss_value = log_loss(y_test, y_prob, labels=log_reg_model.classes_)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Logistic Regression Test Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score (Weighted): {f1:.4f}")
print(f"Log-Loss: {log_loss_value:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("-" * 50)