In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import re, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Loading
Data url: https://www.kaggle.com/datasets/andrewmvd/steam-reviews

Citation: Antoni Sobkowicz. (2017). Steam Review Dataset (2017) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.1000885

#### Creating A Random Subset 
The original source file has ~6.5 million records (~700 MB), so I used a Bash command to sample 1,000 random records from it and write them to a smaller file that is used to train the machine learning model.

```bash
head -n 1 dataset.csv > random_subset.csv && tail -n +2 dataset.csv | shuf -n 1000 >> random_subset.csv
```

#### Importing/Cleaning the Data
Drop any records with missing values, then drop the irrelevant `app_id` and `app_name` columns.

In [24]:
# Sampled subset of the full review dataset
path = 'random_subset.csv'
raw = pd.read_csv(path)

# Discard incomplete rows first, then the identifier columns the model does not use
df = (
    raw
    .dropna()
    .drop(columns=['app_id', 'app_name'])
)
df.head()

Unnamed: 0,review_text,review_score,review_votes
0,"Excellent game, especially after resolution/fr...",1,0
1,Not a bad story after all. Music is really bad...,1,0
2,Early Access Review,1,0
3,it is a rather good game which requires alittl...,1,0
4,Fun game that is frustratingly difficult at ti...,1,0


#### Pre-Processing text for the model
Remove punctuation, digits, and other special characters, then normalize the text to lower case.

In [31]:
def clean_data(review: str) -> str:
    """Normalize a raw review string before it is vectorized.

    Punctuation, underscores and digits are deleted outright (they are not
    replaced by a space, so words joined only by punctuation are merged),
    and the remaining text is lower-cased.

    Args:
        review: The raw review text.

    Returns:
        The cleaned, lower-cased text.
    """
    # `\w` also matches "_", so it is listed explicitly; otherwise underscores
    # would survive even though they are neither letters, digits nor spaces
    no_punc = re.sub(r'[^\w\s]|_', '', review)
    # remove any digits
    no_digits = ''.join(ch for ch in no_punc if not ch.isdigit())
    # return as lowercase
    return no_digits.lower()

In [29]:
# Positional lookup: after dropna() the index can have gaps, so label 0 is
# not guaranteed to exist and df['review_text'][0] could raise KeyError
df['review_text'].iloc[0]

'Excellent game especially after resolutionframrate fix Got me scared not because its terrifying it has its moments though but because its friggin hard Seriously Its a game that takes time to learn and rewards patience and perseverance  Controller fuggin required though Dont even bother unless you have that gamepad'

In [33]:
# Replace each raw review with its normalized form
df['review_text'] = df['review_text'].apply(clean_data)
# Positional lookup: after dropna() the index can have gaps, so label 0 is
# not guaranteed to exist and df['review_text'][0] could raise KeyError
df['review_text'].iloc[0]

'excellent game especially after resolutionframrate fix got me scared not because its terrifying it has its moments though but because its friggin hard seriously its a game that takes time to learn and rewards patience and perseverance  controller fuggin required though dont even bother unless you have that gamepad'

#### Text Transformation
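Using scikit-learn's built-in `TfidfVectorizer`, transform the text into TF-IDF vectors: each word is weighted by how often it appears in a review, discounted by how common it is across all reviews, as an estimate of its importance.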

In [35]:
# The reviews were already lower-cased during cleaning, so the vectorizer's
# own lower-casing is switched off; the other two options are left unset (None).
tdidf = TfidfVectorizer(lowercase=False, strip_accents=None, preprocessor=None)

# NOTE(review): the vocabulary/IDF weights are fit on every row, before the
# train/test split, so the test rows influence the features -- consider
# fitting on the training rows only.
X = tdidf.fit_transform(df['review_text'])

### Split the Dataset into Training/Testing

In [37]:
# Fixed seed so the split (and therefore the reported accuracy) is reproducible
RANDOM_STATE = 42

y = df['review_score']  # target variable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE
)

### Model

In [39]:
# Fit a liblinear-backed logistic regression on the training split
# (fit() returns the estimator itself, so the calls can be chained)
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Predicted labels for the held-out reviews
preds = lr.predict(X_test)

### Evaluate
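A random guess between two classes has a 50% chance of being correct (either the game is recommended or not), so we would ideally want a much higher accuracy for the model to be worthwhile. Note that if the classes are imbalanced, always predicting the majority class also beats 50%, so the accuracy should be compared against the majority-class share as well.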

In [47]:
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but this order stays correct if the metric is ever swapped
score = accuracy_score(y_test, preds)
print("Accuracy: {:.2%}".format(score))

Accuracy: 82.04%
