In [147]:
import pandas
import string
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter

df = pandas.read_csv('../MS1 Dataset [ORIG]/games-regression-dataset.csv')
print(df)

                                                    URL          ID  \
0     https://apps.apple.com/us/app/heir-of-light/id...  1264483706   
1     https://apps.apple.com/us/app/endgame-eurasia/...   607705356   
2     https://apps.apple.com/us/app/free-solitaire/i...   627491527   
3     https://apps.apple.com/us/app/draft-trainer/id...   430252596   
4     https://apps.apple.com/us/app/rogue-knight-inf...  1115082819   
...                                                 ...         ...   
5209  https://apps.apple.com/us/app/plague-inc/id525...   525818839   
5210  https://apps.apple.com/us/app/jeans-club/id945...   945975522   
5211  https://apps.apple.com/us/app/train-game-assis...   550919302   
5212  https://apps.apple.com/us/app/independence-day...  1086647459   
5213  https://apps.apple.com/us/app/order-up-food-tr...   542784161   

                                            Name  \
0                                  HEIR OF LIGHT   
1                                Endgame:Eu

In [148]:
df['Description']

0       A Dark Fantasy, Collectible RPG\n\nDarkness ha...
1       "This interactive experience is an exploration...
2       Same Solitaire game with classic Solitaire run...
3       ** Discounted for a limited time **\n\nEver wo...
4       Fight or sneak your way through hordes of mons...
                              ...                        
5209    "Can you infect the world? Plague Inc. is a un...
5210    "Jean start new business, managing club after ...
5211    "Train Game Assistant supplements the board ga...
5212    "Earth stands united!\n\nJoin the combat ranks...
5213    With over 8 Million Downloads and a 4.5 Star R...
Name: Description, Length: 5214, dtype: object

In [149]:
# convert text to lowercase
df['clean_text'] = df['Description'].str.lower()

In [150]:
# remove punctuations
def remove_punctuations(text):
    return text.translate(str.maketrans("", "", string.punctuation))

df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df['clean_text']

0       a dark fantasy collectible rpgnndarkness has o...
1       this interactive experience is an exploration ...
2       same solitaire game with classic solitaire run...
3        discounted for a limited time nnever wonder h...
4       fight or sneak your way through hordes of mons...
                              ...                        
5209    can you infect the world plague inc is a uniqu...
5210    jean start new business managing club after th...
5211    train game assistant supplements the board gam...
5212    earth stands unitednnjoin the combat ranks of ...
5213    with over 8 million downloads and a 45 star ra...
Name: clean_text, Length: 5214, dtype: object

In [151]:
# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join(word for word in text.split() if word not in stop_words)

df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df['clean_text']

0       dark fantasy collectible rpgnndarkness overtak...
1       interactive experience exploration modern war ...
2       solitaire game classic solitaire running windo...
3       discounted limited time nnever wonder protour ...
4       fight sneak way hordes monsters attempt hunt d...
                              ...                        
5209    infect world plague inc unique mix high strate...
5210    jean start new business managing club big hits...
5211    train game assistant supplements board game ra...
5212    earth stands unitednnjoin combat ranks earth s...
5213    8 million downloads 45 star rating order go sm...
Name: clean_text, Length: 5214, dtype: object

In [152]:
# remove frequently words
word_count = Counter()
for txt in df["clean_text"]:
    for wrd in txt.split():
        word_count[wrd] += 1

# frequently words occur more than 2000
frequent_words = set(word for (word, frequency) in word_count.most_common(14))

def remove_frequently_words(text):
    return " ".join([word for word in text.split() if word not in frequent_words])


df['clean_text'] = df['clean_text'].apply(lambda x: remove_frequently_words(x))
df['clean_text']

0       dark fantasy collectible rpgnndarkness overtak...
1       interactive experience exploration modern war ...
2       solitaire classic solitaire running windows be...
3       discounted limited nnever wonder protour magic...
4       fight sneak way hordes monsters attempt hunt d...
                              ...                        
5209    infect plague inc unique mix high terrifyingly...
5210    jean start business managing club big hits jea...
5211    train assistant supplements board rail baronxa...
5212    earth stands unitednnjoin combat ranks earth s...
5213    8 million downloads 45 star rating order go sm...
Name: clean_text, Length: 5214, dtype: object

In [153]:
# remove special characters
def remove_special_characters(text):
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    text = re.sub("\s+", " ", text)
    return text
df['clean_text'] = df['clean_text'].apply(lambda x: remove_special_characters(x))
df['clean_text']

0       dark fantasy collectible rpgnndarkness overtak...
1       interactive experience exploration modern war ...
2       solitaire classic solitaire running windows be...
3       discounted limited nnever wonder protour magic...
4       fight sneak way hordes monsters attempt hunt d...
                              ...                        
5209    infect plague inc unique mix high terrifyingly...
5210    jean start business managing club big hits jea...
5211    train assistant supplements board rail baronxa...
5212    earth stands unitednnjoin combat ranks earth s...
5213    8 million downloads 45 star rating order go sm...
Name: clean_text, Length: 5214, dtype: object

In [154]:
# stemming
porter_stemmer = PorterStemmer()
def apply_stemming(text):
    return " ".join([porter_stemmer.stem(word) for word in text.split()])
df['clean_text'] = df['clean_text'].apply(lambda x: apply_stemming(x))
df['clean_text']

0       dark fantasi collect rpgnndark overtaken realm...
1       interact experi explor modern war fiction news...
2       solitair classic solitair run window best kill...
3       discount limit nnever wonder protour magic gat...
4       fight sneak way hord monster attempt hunt dark...
                              ...                        
5209    infect plagu inc uniqu mix high terrifyingli r...
5210    jean start busi manag club big hit jean boutiq...
5211    train assist supplement board rail baronxa mak...
5212    earth stand unitednnjoin combat rank earth spa...
5213    8 million download 45 star rate order go smash...
Name: clean_text, Length: 5214, dtype: object

In [None]:
# part of speech and lemmatization
