In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

  import pandas.util.testing as tm


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Train Data

<b> Penjelasan Singkat tentang Feature </b>

- Title : Judul Game
- Year : Tahun ketika review ditulis
- User_Review : User review dari platform stream
- User_Suggestion : Vairabel binary yang menjelaskan apakah seorang pemain menyarankan untuk membeli game tersebut atau tidak

In [3]:
# Train Head
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [4]:
# Info data train
pd.DataFrame({
    'Features' : train.columns,
    'Data Type' : [train[x].dtypes for x in train.columns],
    'unique' : [len(train[x].unique()) for x in train.columns],
    'null' : [train[x].isnull().sum() for x in train.columns],
    'null Pct' : [train[x].isnull().sum()/len(train[x]) for x in train.columns]
})

Unnamed: 0,Features,Data Type,unique,null,null Pct
0,review_id,int64,17494,0,0.0
1,title,object,44,0,0.0
2,year,float64,9,178,0.010175
3,user_review,object,17490,0,0.0
4,user_suggestion,int64,2,0,0.0


> Terdapat 178 (0.01%) null value pada features year. 

## Test Data

In [5]:
# Train Head
test.head()

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...


In [6]:
# Info data train
pd.DataFrame({
    'Features' : test.columns,
    'Data Type' : [test[x].dtypes for x in test.columns],
    'unique' : [len(test[x].unique()) for x in test.columns],
    'null' : [test[x].isnull().sum() for x in test.columns],
    'null Pct' : [test[x].isnull().sum()/len(test[x]) for x in test.columns]
})

Unnamed: 0,Features,Data Type,unique,null,null Pct
0,review_id,int64,8045,0,0.0
1,title,object,20,0,0.0
2,year,float64,9,67,0.008328
3,user_review,object,8045,0,0.0


> Terdapat 178 (0.01%) null value pada features year.

## Analisis pada column judul

In [7]:
datas = {'overview': overview, 'train': train, 'test':test}

for nama,data in datas.items():
    print('Banyak judul data {} = {}'.format(nama, len(data['title'].unique())))

NameError: name 'overview' is not defined

In [None]:
# Unique judul pada masing-masing dataset
judul_train = set(train['title'].unique())
judul_test = set(test['title'].unique())
judul_overview = set(overview['title'].unique())

# Judul yang sama dari masing-masing dataset
train_test = set.intersection(judul_train, judul_test)
train_overview = set.intersection(judul_train, judul_overview)
test_overview = set.intersection(judul_test, judul_overview)

print('Banyak judul yang sama antara data Train dan Test : {}'.format(len(train_test)))
print('Banyak judul yang sama antara data Train dan Overview : {}'.format(len(train_overview)))
print('Banyak judul yang sama antara data Test dan Overview : {}'.format(len(test_overview)))

> <b>Kesimulan</b>
- Tidak ada judul yang sama antara data Train dan Test 
- Judul pada data Train dan Test termuat pada data Overview


## Analysis pada colom tahun

### 1.1 Data Test

In [None]:
test['year'].value_counts(normalize = True)

In [None]:
sns.countplot(test['year'])
plt.show()

sns.countplot(train['year'], hue=train['user_suggestion'])
plt.show()

## Analysis pada data user reviews

In [None]:
# Sample
train['user_review'][0]

In [None]:
train ['panjang_review'] = train['user_review'].apply(lambda x:len(x))
train.head()

In [None]:
sns.set_style('whitegrid')
sns.distplot(train['panjang_review'], bins =20, kde=False)
plt.title('Distribusi User Review')

In [None]:
df =train['panjang_review'].describe()
df

In [None]:
# Perbandingan user suggestion data sudah cukup balance
train['user_suggestion'].value_counts(normalize = True)

In [None]:
sns.distplot(train[train['user_suggestion']==1]['panjang_review'], bins =20, kde=False)
sns.distplot(train[train['user_suggestion']==0]['panjang_review'], bins =20, kde=False)
plt.title('Distribusi User Review')

In [None]:
train[train['user_suggestion'] == 0]['user_review'].iloc[1]

In [None]:
train[train['user_suggestion'] == 1]['user_review'].iloc[1]

# Data Preprocessing

## 1. Check Bahasa 

In [None]:
from langdetect import detect
from tqdm import tqdm_notebook

In [None]:
detect(train['user_review'].iloc[1])

In [None]:
def bahasa(X):
    try:
        return detect(X)
    except:
        return "not-en"

In [None]:
train['language'] = train['user_review'].apply(lambda x: bahasa(x))
train

In [None]:
train[train['language'] != 'en'].head()

In [None]:
train['language'].value_counts(normalize=True)

In [None]:
# Drop yang bahasanya bukan inggris
index_notEng = train[train['language']!='en'].index
train.drop(index_notEng, inplace = True)
train[train['language']!='en']

In [None]:
train

In [None]:
test['language'] = test['user_review'].apply(lambda x: bahasa(x))
test

In [None]:
train['user_review'][0]

In [None]:
train['user_review'][1]

In [None]:
train['user_review'][2]

In [None]:
train['user_review'][4]

In [None]:
train['user_review'][500]

In [None]:
train['user_review'][501]

## Cleaning Test

In [None]:
1. Membuat kata 'Early Access Review'
2. Melakukan split antara angka denga huruf
3. Split untuk CamelCase

In [None]:
import re
from re import finditer

def remove_EAR(X):
    """
    Removing 'Early Access Review'
    """
    X = X.replace("Early Access Review", "")
    
    return X

def split_number_and_text(X):
    x = re.split('(\d+)', X)
    x = " ".join(x)
    x = x.strip()
    
    return x

def handle_camelcase(X):
    matches = finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', X)
    return " ".join([m.group(0) for m in matches])

def handling_whitespaces(X):
    X = " ".join(X.split())
    X = X.strip()
    
    return X

waste_symbols = "人̳⣟⣦̪⠓▒͎¸⠟⣅>⡾ ⠻⣀⣛„ͭ⣮⡻⠦⡀͐‘̨⣆̤⣿<／丶⣞͇⣵͞⠹ͩ⢒̯⢸⣤̗̫ͯ͆̔͠⠛⢻⠏-́☐̺͛̋⠸⣥⠄̷＼͟·⌒͗⠁́｀⢹\\⢄͈̌ͨ⢤彡~¯/⠶⠲ˆ⡥̮̻͔☉⣻̣ゝ⡞̧͙̿̒̊̑ノ⠭ͤ_⠐⣇҉̚–⡄´̓█▄☑⣧̴͖̍｜⣷̭͘͝｡⠴̜̄ʖ¨̵̏͢⢂͋;͒:⢉つ̾＿̈⣴⣌ͫ⢛⡹⣈へ⢯,̅⣭̩̬̕⡈ム͡⣼ͦ)̛͜ヽ̝̥⣠⢟̶⠤̡͉⠘̹̈́⡴̠⢀）⠇⣾͊⢰̞ͮ̇`⠑⡿\u3000⠃⣸⠾͍̆ͅ￣⢚̓⠂⡵─⢬ー⠿(⠆⠉̦*͕ﾉ⣹⡟⣬⠙▓⡐7͏̟̲⢿⢦（̰♥̸̢⣙͓̂▀くﾌ⠀.⠰⡒°̖̎､⣒⣰̼⢅⣁⠒͑⢾⡂͌̀ͧ…̃▐ﾚ、丿⢌|̱⢴⡠⣩▌⣉͚ͪ'⢆⢠⡇⡛⣏⡶⣜⣄⡸⠈̘ͣ⣽̉̽̐ͥ⡏ͬ⣗⣶░⠋⠔̙͂^"

def remove_waste_symbols(X):
    for item in waste_symbols:
        X = X.replace(item, " ")
        
    return X

In [None]:
def clean_review(X):
    X = remove_EAR(X)
    X = remove_waste_symbols(X)
    X = handle_camelcase(X)
    X = split_number_and_text(X)
    X = handling_whitespaces(X)
    
    return X

In [None]:
%%time

train['user_review_clean'] = train['user_review'].apply(lambda x: clean_review(x))
test['user_review_clean'] = test['user_review'].apply(lambda x: clean_review(x))

In [None]:
train

In [None]:
train['user_review'][501]

In [None]:
train['user_review_clean'][501]

In [None]:
train['user_review'][1]

In [None]:
train['user_review_clean'][1]

In [None]:
train.reset_index(drop=True, inplace=True)

## Vectorizing the reviews

- CountVectorizer
- TFIDFVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

sample_texts = ['Hello this is review number 1, Bye Bye', 'I am not a review']

vect = CountVectorizer()
vect.fit(sample_texts)

vect.vocabulary_

In [None]:
for item in sample_texts:
    
    print("Text : {}\nEncoded Format : {}".format(item, vect.transform([item]).toarray()))

In [None]:
X = pd.DataFrame(sample_texts, columns=['text'])
enc_texts = vect.transform(X['text'].values)
enc_texts = pd.DataFrame(enc_texts.toarray(), columns=vect.get_feature_names())

X = pd.concat([X, enc_texts], axis=1)
X.head(2)

In [None]:
%%time

total_reviews = pd.concat([train['user_review_clean'], test['user_review_clean']], axis=0)
total_reviews.reset_index(drop=True, inplace=True)

vect = CountVectorizer()

vect.fit(total_reviews.values)

train_count_vect = vect.transform(train['user_review_clean'].values)
test_count_vect = vect.transform(test['user_review_clean'].values)

print("Number of features / words in vocab : {}".format(len(vect.get_feature_names())))

In [None]:
total_reviews

In [None]:
train['user_review_clean']

## Evaluating

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train_count_vect, train['user_suggestion'], test_size=0.15, random_state=13)

In [None]:
model = LogisticRegression()

model.fit(X_train, y_train)

train_score = accuracy_score(y_train, model.predict(X_train))
valid_score = accuracy_score(y_valid, model.predict(X_valid))

print("Train Score : {}\nValid Score : {}".format(train_score, valid_score))

## Cross validation score

In [None]:
results = cross_val_score(model, train_count_vect, train['user_suggestion'].values, cv=3, scoring='accuracy')

print("Accuracy Mean : \t{}\n3-Fold Scores : \t{}".format(results.mean(), results))

# TFIDVECTORIZER

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
%%time

vect = TfidfVectorizer()

vect.fit(total_reviews.values)

train_tfidf_vect = vect.transform(train['user_review_clean'].values)
test_tfidf_vect = vect.transform(test['user_review_clean'].values)

print("Number of features / words in vocab : {}".format(len(vect.get_feature_names())))

In [None]:
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')

print(english_stopwords[: 5])

vect = TfidfVectorizer(stop_words=english_stopwords)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train_tfidf_vect, train['user_suggestion'], test_size=0.15, random_state=13)

In [None]:
model = LogisticRegression()

model.fit(X_train, y_train)

train_score = accuracy_score(y_train, model.predict(X_train))
valid_score = accuracy_score(y_valid, model.predict(X_valid))

print("Train Score : {}\nValid Score : {}".format(train_score, valid_score))

In [None]:
results = cross_val_score(model, train_tfidf_vect, train['user_suggestion'].values, cv=3, scoring='accuracy')

print("Accuracy Mean : \t{}\n3-Fold Scores : \t{}".format(results.mean(), results))

In [None]:
word = 'wah so fun, i like this game... its realy myFavorite'
clean_word = [clean_review(word)]
clean_word
vect.transform(clean_word)

In [None]:
model.predict(vect.transform(clean_word))