In [3]:
import pandas as pd

# Load the review dataset from CSV.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
df = pd.read_csv('dataset.csv', low_memory=False)

# Display basic shape and column info
print("Dataset shape:", df.shape)
print("\nColumn info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Show first 5 rows
print("\nSample data:")
print(df.head())

Dataset shape: (6417106, 5)

Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6417106 entries, 0 to 6417105
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   app_id        int64 
 1   app_name      object
 2   review_text   object
 3   review_score  int64 
 4   review_votes  int64 
dtypes: int64(3), object(2)
memory usage: 244.8+ MB
None

Sample data:
   app_id        app_name                                        review_text  \
0      10  Counter-Strike                                    Ruined my life.   
1      10  Counter-Strike  This will be more of a ''my experience with th...   
2      10  Counter-Strike                      This game saved my virginity.   
3      10  Counter-Strike  • Do you like original games? • Do you like ga...   
4      10  Counter-Strike           Easy to learn, hard to master.             

   review_score  review_votes  
0             1             0  
1             1             1  
2             1 

In [4]:
# --- Dataset cleaning ---

# Step 1: discard rows that are exact copies of another row (all columns equal)
df = df.drop_duplicates()
print("After dropping full duplicates:", df.shape)

# Step 2: a row is only usable when both the review body and the game name exist
df = df.dropna(subset=['review_text', 'app_name'])
print("After dropping rows with null review_text or app_name:", df.shape)

# Step 3: force the review column to str, then keep only the reviews whose
# whitespace-stripped length is at least 10 characters
df = df.assign(review_text=df['review_text'].astype(str))
df = df.loc[df['review_text'].str.strip().str.len().ge(10)]
print("After filtering short reviews:", df.shape)

# How many reviews carry each score value
print("\nReview score distribution:\n", df['review_score'].value_counts())

# Peek at the first rows of the columns that matter downstream
print("\nSample after cleaning:")
print(df.loc[:, ['app_name', 'review_text', 'review_score']].head())

# Independent copy consumed by the following cells
df_clean = df.copy()

After dropping full duplicates: (4621004, 5)
After dropping rows with null review_text or app_name: (4483850, 5)
After filtering short reviews: (4437086, 5)

Review score distribution:
 review_score
 1    3642297
-1     794789
Name: count, dtype: int64

Sample after cleaning:
         app_name                                        review_text  \
0  Counter-Strike                                    Ruined my life.   
1  Counter-Strike  This will be more of a ''my experience with th...   
2  Counter-Strike                      This game saved my virginity.   
3  Counter-Strike  • Do you like original games? • Do you like ga...   
4  Counter-Strike           Easy to learn, hard to master.             

   review_score  
0             1  
1             1  
2             1  
3             1  
4             1  


In [5]:
# Restrict the data to games that have at least 50 cleaned reviews.

# Number of cleaned reviews per game title
review_counts = df_clean['app_name'].value_counts()

# Titles reaching the 50-review threshold, and the rows that belong to them
popular_games = review_counts.loc[lambda counts: counts >= 50].index
df_filtered = df_clean.loc[df_clean['app_name'].isin(popular_games)]

print("After filtering to popular games (≥ 50 reviews):", df_filtered.shape)

# The five titles with the most remaining reviews
print("\nTop 5 most reviewed games:")
print(df_filtered['app_name'].value_counts().head(5))

# Independent copy handed to the next step
df_clean = df_filtered.copy()

After filtering to popular games (≥ 50 reviews): (4355551, 5)

Top 5 most reviewed games:
app_name
Terraria     77598
PAYDAY 2     62932
Dota 2       48949
Undertale    48193
Warframe     44459
Name: count, dtype: int64


In [6]:
import re

# Restrict to the two sentiment labels used below: 1 (positive) and -1 (negative)
df_clean = df_clean.loc[lambda frame: frame['review_score'].isin([1, -1])]

# Text cleaning function
def clean_review(text):
    """Normalize a raw review string before vectorization.

    Lowercases ``text``, deletes every character that is neither an ASCII
    lowercase letter nor whitespace (digits, punctuation, symbols, accented
    letters), then collapses each whitespace run to one space and trims both ends.

    Args:
        text: The review as a ``str``.

    Returns:
        The cleaned string; empty when only removed characters were present.
    """
    letters_and_spaces = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()

# Build the normalized text column from the raw review text
df_clean['clean_text'] = df_clean['review_text'].astype(str).map(clean_review)

# Sanity checks: label counts, then a seeded before/after sample of five rows
print("Review score distribution after filtering:")
print(df_clean['review_score'].value_counts())

print("\nSample cleaned reviews:")
print(df_clean.loc[:, ['review_text', 'clean_text']].sample(n=5, random_state=42))

Review score distribution after filtering:
review_score
 1    3586594
-1     768957
Name: count, dtype: int64

Sample cleaned reviews:
                                               review_text  \
1564255  dont buy this game or buy its dlc's. why? you ...   
3136233  A beautiful game of a majestic and partially ♥...   
4578296  Simply great game. One of the best, but Arkham...   
6028133          i recommend league of legends to everyone   
3537423         This game is fantastic. Worth every penny.   

                                                clean_text  
1564255  dont buy this game or buy its dlcs why you can...  
3136233  a beautiful game of a majestic and partially g...  
4578296  simply great game one of the best but arkham c...  
6028133          i recommend league of legends to everyone  
3537423           this game is fantastic worth every penny  


In [7]:
# Balance the dataset (same number of positive and negative reviews)

is_negative = df_clean['review_score'] == -1
is_positive = df_clean['review_score'] == 1

# Derive the per-class size from the data instead of hard-coding a count copied
# from an earlier run's output: any change upstream (different file, different
# cleaning thresholds) would otherwise silently unbalance the classes or make
# .sample() raise.
n_per_class = int(is_negative.sum())
assert int(is_positive.sum()) >= n_per_class, (
    "Expected positive reviews to be the majority class; "
    "cannot downsample them to the negative count."
)

# Downsample positives to match the negatives; negatives are kept in full
df_pos = df_clean[is_positive].sample(n=n_per_class, random_state=42)
df_neg = df_clean[is_negative]

# Concatenate and shuffle
df_balanced = pd.concat([df_pos, df_neg], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

# Confirm shape
print("Balanced review score distribution:")
print(df_balanced['review_score'].value_counts())

# Show some samples
print("\nSample rows:")
print(df_balanced[['review_score', 'clean_text']].head())

Balanced review score distribution:
review_score
 1    768957
-1    768957
Name: count, dtype: int64

Sample rows:
   review_score                                         clean_text
0             1  oh god i cant say anything wrong against this ...
1             1  took me about lives to do the first level anno...
2            -1         brothers a tale of no fun this movie sucks
3            -1  i actually was kind of expecting a similar sty...
4            -1  its ok if flying in the air and futuristic are...


In [8]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review strings; targets are the review scores
X, y = df_balanced['clean_text'], df_balanced['review_score']

# Hold out 20% of the rows for testing; stratify=y splits each label proportionally
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report split sizes and per-label counts
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("\nLabel distribution in training set:")
print(y_train.value_counts())
print("\nLabel distribution in test set:")
print(y_test.value_counts())

Training set size: 1230331
Test set size: 307583

Label distribution in training set:
review_score
 1    615166
-1    615165
Name: count, dtype: int64

Label distribution in test set:
review_score
-1    153792
 1    153791
Name: count, dtype: int64


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF featurizer:
#   - vocabulary capped at 5000 terms
#   - terms are single words and two-word sequences
#   - scikit-learn's built-in English stop-word list is removed
# NOTE(review): that built-in list is believed to contain negations such as
# "not" -- worth confirming whether removing them hurts sentiment accuracy.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)

# Learn the vocabulary and IDF weights from the training split only, then apply
# the same fitted transform to the test split
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Resulting matrix dimensions (rows, terms)
print("TF-IDF training data shape:", X_train_tfidf.shape)
print("TF-IDF test data shape:", X_test_tfidf.shape)

TF-IDF training data shape: (1230331, 5000)
TF-IDF test data shape: (307583, 5000)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a logistic-regression classifier on the TF-IDF training matrix
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_tfidf, y_train)

# Label the held-out reviews
y_pred_lr = lr.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8445817876800733

Classification Report:
               precision    recall  f1-score   support

          -1       0.85      0.84      0.84    153792
           1       0.84      0.85      0.85    153791

    accuracy                           0.84    307583
   macro avg       0.84      0.84      0.84    307583
weighted avg       0.84      0.84      0.84    307583


Confusion Matrix:
 [[129098  24694]
 [ 23110 130681]]


In [11]:
from sklearn.svm import LinearSVC

# Fit a linear support-vector classifier on the TF-IDF training matrix
svc = LinearSVC(random_state=42).fit(X_train_tfidf, y_train)

# Label the held-out reviews
y_pred_svc = svc.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
print("LinearSVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svc))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))

LinearSVC Accuracy: 0.8450727120809668

Classification Report:
               precision    recall  f1-score   support

          -1       0.85      0.84      0.84    153792
           1       0.84      0.85      0.85    153791

    accuracy                           0.85    307583
   macro avg       0.85      0.85      0.85    307583
weighted avg       0.85      0.85      0.85    307583


Confusion Matrix:
 [[128687  25105]
 [ 22548 131243]]


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the TF-IDF training matrix
mnb = MultinomialNB().fit(X_train_tfidf, y_train)

# Label the held-out reviews
y_pred_mnb = mnb.predict(X_test_tfidf)

# Report accuracy, per-class metrics and the confusion matrix
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_mnb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_mnb))

MultinomialNB Accuracy: 0.8218562144201728

Classification Report:
               precision    recall  f1-score   support

          -1       0.82      0.83      0.82    153792
           1       0.83      0.82      0.82    153791

    accuracy                           0.82    307583
   macro avg       0.82      0.82      0.82    307583
weighted avg       0.82      0.82      0.82    307583


Confusion Matrix:
 [[127438  26354]
 [ 28440 125351]]


In [13]:
import joblib
from sklearn.svm import LinearSVC

# Persist the model that was actually evaluated above.
# Re-fitting a fresh LinearSVC() here (without the random_state=42 used for
# `svc`) cost another full training run and could save a model that differs
# from the one whose accuracy and confusion matrix were reported.
best_model = svc

# Save the fitted TF-IDF vectorizer; inference must reuse this exact vocabulary
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

# Save the model
joblib.dump(best_model, "linear_svc_model.pkl")

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.


In [14]:
import joblib
import re

# Restore the persisted artifacts: the fitted TF-IDF vectorizer and the classifier.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
vectorizer = joblib.load("tfidf_vectorizer.pkl")
model = joblib.load("linear_svc_model.pkl")

# Basic text cleaning function (same as used before)
def clean_text(text):
    """Normalize a review string for the vectorizer.

    The input is lowercased, every character other than ASCII ``a``-``z`` and
    whitespace is removed, whitespace runs become single spaces, and leading
    and trailing whitespace is trimmed.

    Args:
        text: Review text as a ``str``.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = text.lower()
    only_letters = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', only_letters).strip()

# Prediction function
def predict_sentiment(review):
    """Classify one review as ``"Positive"`` or ``"Negative"``.

    The review is normalized with ``clean_text``, transformed by the loaded
    ``vectorizer`` and scored by the loaded ``model``.

    Args:
        review: Raw review text as a ``str``.

    Returns:
        ``"Positive"`` when the model predicts label 1, otherwise ``"Negative"``.
    """
    features = vectorizer.transform([clean_text(review)])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

In [15]:
# Smoke test: one clearly positive and one clearly negative review, one result per line
print(
    predict_sentiment("This game is amazing! Loved every moment of it."),
    predict_sentiment("This is a terrible game. Waste of money."),
    sep="\n",
)

Positive
Negative


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Keep only the reviews labelled positive (review_score == 1)
df_positive = df_balanced.loc[df_balanced['review_score'] == 1]

# One row per game: all of its positive cleaned reviews joined by single spaces
game_reviews = (
    df_positive
    .groupby('app_name')['clean_text']
    .apply(" ".join)
    .reset_index()
)

# TF-IDF representation of each game's combined review document
tfidf_game = TfidfVectorizer(stop_words='english', max_features=5000)
game_vectors = tfidf_game.fit_transform(game_reviews['clean_text'])

# Pairwise cosine similarity between the per-game vectors
cosine_sim_matrix = cosine_similarity(game_vectors)

In [17]:
def recommend_similar_games(game_name, top_n=3):
    """Print and return the games most similar to ``game_name``.

    Similarities are read from the module-level ``cosine_sim_matrix``; row and
    column ``i`` are taken to correspond to row ``i`` of ``game_reviews``.

    Args:
        game_name: Exact ``app_name`` value to look up in ``game_reviews``.
        top_n: Maximum number of recommendations (default 3). Values below 1
            yield an empty list.

    Returns:
        Up to ``top_n`` game names ordered from most to least similar, with the
        query game's own row excluded. An empty list (after printing a notice)
        when ``game_name`` is not present in ``game_reviews``.
    """
    # Row position(s) of the requested game. Positions, not index labels, are
    # what line up with the rows of cosine_sim_matrix.
    matches = (game_reviews['app_name'] == game_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Game '{game_name}' not found in the dataset.")
        return []
    idx = int(matches[0])

    # Exclude the query game by position rather than assuming it sorts first:
    # with tied scores (or an all-zero vector) another game can rank ahead of
    # it, and dropping element 0 would then return the query game itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[idx])
        if position != idx
    ]
    # list.sort is stable, so tied games keep their original relative order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:max(top_n, 0)]

    # Map matrix positions back to game names
    names = game_reviews['app_name']
    similar_games = [names.iloc[position] for position, _ in sim_scores]

    print(f"Because you liked **{game_name}**, you might also enjoy:")
    for game in similar_games:
        print("→", game)

    return similar_games

In [18]:
# Demo: recommend games similar to Terraria using the default top_n of 3
recommend_similar_games("Terraria")

Because you liked **Terraria**, you might also enjoy:
→ Magicite
→ Crea
→ Starbound


['Magicite', 'Crea', 'Starbound']