In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## Reading and Inspection

In [2]:
# Read the csv file using 'read_csv'
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
game_overview = pd.read_csv('game_overview.csv')

In [3]:
# Top 5 records of train dataframe
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [4]:
# Check the number of rows and columns in the train dataframe
train.shape

(17494, 5)

In [5]:
# Check the column-wise info of the train dataframe
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17494 entries, 0 to 17493
Data columns (total 5 columns):
review_id          17494 non-null int64
title              17494 non-null object
year               17316 non-null float64
user_review        17494 non-null object
user_suggestion    17494 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 683.4+ KB


In [6]:
game_overview.head()

Unnamed: 0,title,developer,publisher,tags,overview
0,Spooky's Jump Scare Mansion,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,Sakura Clicker,Winged Cloud,Winged Cloud,"['Nudity', 'Anime', 'Free to Play', 'Mature', ...",The latest entry in the Sakura series is more ...
2,WARMODE,WARTEAM,WARTEAM,"['Early Access', 'Free to Play', 'FPS', 'Multi...",Free to play shooter about the confrontation o...
3,Fractured Space,Edge Case Games Ltd.,Edge Case Games Ltd.,"['Space', 'Multiplayer', 'Free to Play', 'PvP'...",Take the helm of a gigantic capital ship and g...
4,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,"['FPS', 'Multiplayer', 'Shooter', 'Action', 'T...",Counter-Strike: Global Offensive (CS: GO) expa...


In [7]:
# Check the number of rows and columns in the game_overview dataframe
game_overview.shape

(64, 5)

In [8]:
# Check the column-wise info of the game_overview dataframe
game_overview.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 5 columns):
title        64 non-null object
developer    64 non-null object
publisher    64 non-null object
tags         64 non-null object
overview     64 non-null object
dtypes: object(5)
memory usage: 2.6+ KB


In [9]:
# Check the number of rows and columns in the test dataframe
test.shape

(8045, 4)

In [10]:
# Check the column-wise info of the test dataframe
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8045 entries, 0 to 8044
Data columns (total 4 columns):
review_id      8045 non-null int64
title          8045 non-null object
year           7978 non-null float64
user_review    8045 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 251.5+ KB


## Data Cleaning and Data Preparation

In [11]:
# Get the column-wise Null count

print("Column-wise null count in train dataframe:\n", train.isnull().sum())
print("\n")
print("Column-wise null count in test dataframe:\n", test.isnull().sum())
print("\n")
print("Column-wise null count in game_overview dataframe:\n", game_overview.isnull().sum())

Column-wise null count in train dataframe:
 review_id            0
title                0
year               178
user_review          0
user_suggestion      0
dtype: int64


Column-wise null count in test dataframe:
 review_id       0
title           0
year           67
user_review     0
dtype: int64


Column-wise null count in game_overview dataframe:
 title        0
developer    0
publisher    0
tags         0
overview     0
dtype: int64


In [12]:
# Dropping the null values from the dataframe
train.dropna(inplace=True)

In [13]:
# Changing datatype of column 'year' from float to int
train['year'] = train['year'].apply(np.int64)

In [14]:
train['year'].value_counts()

2018    4822
2016    4226
2017    3890
2015    2460
2014    1499
2013     340
2012      65
2011      14
Name: year, dtype: int64

#### Preparing final dataset for training and testing

In [15]:
# Merging train and test dataframe with game_overview dataframe as train and test df
train = pd.merge(train,game_overview,on='title', how='inner')
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
0,1,Spooky's Jump Scare Mansion,2016,I'm scared and hearing creepy voices. So I'll...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,2,Spooky's Jump Scare Mansion,2016,"Best game, more better than Sam Pepper's YouTu...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
2,3,Spooky's Jump Scare Mansion,2016,"A littly iffy on the controls, but once you kn...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
3,4,Spooky's Jump Scare Mansion,2015,"Great game, fun and colorful and all that.A si...",1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
4,5,Spooky's Jump Scare Mansion,2015,Not many games have the cute tag right next to...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...


In [16]:
# Check the number of rows and columns in the train and test dataframe
print("Size of train dataframe:", train.shape)
print("Size of test dataframe:", test.shape)

Size of train dataframe: (17316, 9)
Size of test dataframe: (8045, 4)


In [17]:
# Get a summary of the train dataframe using 'describe()'
train.describe()

Unnamed: 0,review_id,year,user_suggestion
count,17316.0,17316.0,17316.0
mean,12431.211192,2016.388427,0.570975
std,7653.171476,1.390356,0.494951
min,1.0,2011.0,0.0
25%,5786.75,2015.0,0.0
50%,11764.5,2017.0,1.0
75%,18932.25,2018.0,1.0
max,25537.0,2018.0,1.0


In [18]:
# Checking for duplicate records is train dataframe
train[train.duplicated()]

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview


In [19]:
# Checking for duplicate records is test dataframe
test[test.duplicated()]

Unnamed: 0,review_id,title,year,user_review


#### Text Preprocessing

In [20]:
# function to remove stopwords from the text
from nltk.corpus import stopwords

# We would like to remove all stop words like a, is, an, the, ... 
# so we collecting all of them from nltk library
stop_words = set(stopwords.words('english'))

def remove_stopwords(col):
    col = col.str.replace("[^\w\s]", "").str.lower()
    # col = col.apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))
    return col.apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))  #col.head()

In [21]:
# function for stemming of text
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def get_stemming_text(col):
    #col = col.str.lower().map(stemmer.stem)
    return col.str.lower().map(stemmer.stem)  #col.head()

In [22]:
# function for lemmatization of text
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def get_lemmatized_text(col):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in col]

In [23]:
train['user_review'] = remove_stopwords(train['user_review'])
train['user_review'] = get_stemming_text(train['user_review'])
train['user_review'] = get_lemmatized_text(train['user_review'])

In [24]:
test['user_review'] = remove_stopwords(test['user_review'])
test['user_review'] = get_stemming_text(test['user_review'])
test['user_review'] = get_lemmatized_text(test['user_review'])

In [25]:
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
0,1,Spooky's Jump Scare Mansion,2016,im scared hearing creepy voice ill pause momen...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
1,2,Spooky's Jump Scare Mansion,2016,best game better sam pepper youtube account 10...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
2,3,Spooky's Jump Scare Mansion,2016,littly iffy control know play easy master ive ...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
3,4,Spooky's Jump Scare Mansion,2015,great game fun colorful thata side note though...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...
4,5,Spooky's Jump Scare Mansion,2015,many game cute tag right next horror tag steam...,1,Lag Studios,Lag Studios,"['Horror', 'Free to Play', 'Cute', 'First-Pers...",Can you survive 1000 rooms of cute terror? Or ...


In [26]:
test.head()

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,nice graphic new map weapon model developer li...
1,1604,Counter-Strike: Global Offensive,2018.0,would recommend getting current state csgo hit...
2,1605,Counter-Strike: Global Offensive,2018.0,edit 111218i tried playing csgo recently drama...
3,1606,Counter-Strike: Global Offensive,2015.0,game great community worstif youre match russi...
4,1607,Counter-Strike: Global Offensive,2015.0,thank trulyrazor buying long time ago insisted...


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(train['user_review'])
y = train['user_suggestion']
test_v = tfidf.transform(test['user_review'])

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [29]:
# XGB Classifier
from xgboost import XGBClassifier

xgb = XGBClassifier( learning_rate =0.1,
 n_estimators=112,
 max_depth=9,
 min_child_weight=5,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.6,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=13,
 reg_lambda=5,
# max_delta_step=1,
 alpha=0,
 base_score=0.5,
 seed=1029)

xgb.fit(X_train, y_train)

# Predicting the Test set results
y_pred = xgb.predict(X_test)  

# Accuracy of XGB model
accuracy_xgb = round(xgb.score(X_train, y_train) * 100, 2)
print("Accuracy score of XGB algorithm is:", accuracy_xgb)

Accuracy score of XGB algorithm is: 72.9


In [30]:
# print f1 score
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.7868904369854338

In [31]:
# Predicting the Test set results
test_predicted = xgb.predict(test_v)

In [32]:
# load loan_id of test dataset
test_review_id = test['review_id']
print(test_review_id.shape)

(8045,)


In [33]:
# save results to csv
submission_file = pd.DataFrame({'review_id': test_review_id, 'user_suggestion': test_predicted})
submission_file = submission_file[['review_id','user_suggestion']]    
submission_file.to_csv('Final_Solution.csv', index=False)