In [1]:
# !pip install langdetect

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import requests
from time import sleep
import random

from langdetect import detect, DetectorFactory

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [3]:
test_df = pd.read_csv('~/Downloads/test.csv')

test_df = test_df[['title', 'user_review']]

In [4]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove escape characters
    text = re.sub(r'\\[a-z]', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# Seed for consistent language detection
DetectorFactory.seed = 0

# Function to detect language
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

In [5]:
#Text cleaning

#only English reviews
test_df['is_english'] = test_df['user_review'].apply(is_english)
english_reviews_df = test_df[test_df['is_english'] == True]

# Drop the 'is_english' helper column if needed
english_reviews_df = english_reviews_df.drop(columns=['is_english'])

# Apply the cleaning function to the dataset
english_reviews_df['cleaned_review'] = english_reviews_df['user_review'].apply(clean_text)

#Dropping original review column
english_reviews_df = english_reviews_df.drop(columns=['user_review'])

# Change h1z1 to Z1 Battle Royale
english_reviews_df.loc[english_reviews_df['title'] == 'H1Z1', 'title'] = 'Z1 Battle Royale'

In [6]:
english_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7957 entries, 0 to 8044
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           7957 non-null   object
 1   cleaned_review  7957 non-null   object
dtypes: object(2)
memory usage: 186.5+ KB


In [7]:
# english_reviews_df.to_csv('ds4300_final.csv', index=False)

In [8]:
english_reviews_df.title.unique()

array(['Counter-Strike: Global Offensive', 'World of Warships',
       'Star Trek Online', 'Paladins®', 'Shadowverse CCG',
       'Tree of Savior (English Ver.)', 'VEGA Conflict', 'Minion Masters',
       'The Lord of the Rings Online™', 'Fishing Planet', 'Crush Crush',
       'Dungeon Defenders II', 'Governor of Poker 3',
       'Digimon Masters Online', 'Shakes and Fidget', 'Champions Online',
       'Magic Duels', 'Aura Kingdom', 'Z1 Battle Royale', 'GUNS UP!'],
      dtype=object)

In [14]:
# Iterate through unique titles
for title in english_reviews_df['title'].unique():
    # Clean the title to create a valid filename
    clean_title = title.replace(" ", "_").replace(":", "").replace("-", "_").replace(".", "")
    # Create a new dataframe for the title
    new_df = english_reviews_df.loc[english_reviews_df['title'] == title, ['title', 'cleaned_review']]
    # Save the dataframe as a CSV file
    new_df.to_csv(f'{clean_title}_reviews.csv', index=False)

In [13]:
english_reviews_df.head()

Unnamed: 0,title,cleaned_review
0,Counter-Strike: Global Offensive,nice graphic new map weapon model developer li...
1,Counter-Strike: Global Offensive,would recommend getting current state csgo hit...
2,Counter-Strike: Global Offensive,edit 111218i tried playing csgo recently drama...
3,Counter-Strike: Global Offensive,game great community worstif youre match russi...
4,Counter-Strike: Global Offensive,thank trulyrazor buying long time ago insisted...


In [15]:
english_reviews_df.title.unique()

array(['Counter-Strike: Global Offensive', 'World of Warships',
       'Star Trek Online', 'Paladins®', 'Shadowverse CCG',
       'Tree of Savior (English Ver.)', 'VEGA Conflict', 'Minion Masters',
       'The Lord of the Rings Online™', 'Fishing Planet', 'Crush Crush',
       'Dungeon Defenders II', 'Governor of Poker 3',
       'Digimon Masters Online', 'Shakes and Fidget', 'Champions Online',
       'Magic Duels', 'Aura Kingdom', 'Z1 Battle Royale', 'GUNS UP!'],
      dtype=object)