In [1]:
# import main libraries/packages
import warnings
warnings.filterwarnings('ignore') # to ignore annoying IPython warnings
import numpy as np
import pandas as pd
import gzip
import re

from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack # to get memory-efficient representation of matrices (sparse format)
from textblob import TextBlob, Word

# preprocessing / feature extraction / feature transformation
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, SparsePCA

# models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

# metrics/validation
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

# model serialization/deserialization
import dill

In [None]:
%%time
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split

path = 'D:/4_Учебное/DataAnalysis/Kaggle/data/reviews_Movies_and_TV_5.json.gz'

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF(path)  
train, test = train_test_split(df.asin.unique(),test_size=0.1, random_state=42)
df_train = df[df.asin.isin(train) & (df.overall != 3)].copy()
df_test = df[df.asin.isin(test) & (df.overall != 3)].copy()
df_train['overall'] = df_train['overall'].apply(lambda x: 1 if x > 3 else 0)
df_test['overall'] = df_test['overall'].apply(lambda x: 1 if x > 3 else 0)
df_train[['overall','reviewText']].to_csv('train.csv', index=False)
df_test[['overall','reviewText']].to_csv('test.csv', index=False)

In [2]:
# load datasets
train_df_names = ['train.csv']

df = pd.concat((pd.read_csv(name, engine='c', sep=',', 
                 usecols=['overall', 'reviewText']) for name in train_df_names), ignore_index=True)

#data_MR = pd.read_csv("book_reviews.csv", sep="|")

#rename var
df = df.rename(columns={'overall' : 'label', 'reviewText' : 'text'})
print('review count: {}'.format(len(df)))

review count: 1343971


In [3]:
df.head()

Unnamed: 0,label,text
0,1,This is a charming version of the classic Dick...
1,1,Henry Winkler is very good in this twist on th...
2,1,This is one of the best Scrooge movies out. H...
3,1,This has been a favorite movie of mine for a l...
4,1,This is the American adaptation of the Charles...


In [4]:
%%time
df.text = df.text.apply(str)
# check for class balance
print('class balance:', '\n', df.label.value_counts())

class balance: 
 1    1158561
0     185410
Name: label, dtype: int64
Wall time: 739 ms


In [5]:
%%time
def get_rate(s):
    candidates = re.findall(r'(\d{1,3}[\\|/]{1}\d{1,2})', s)
    rates = []
    for c in candidates:
        try:
            rates.append(eval(c))
        except SyntaxError:
            pass
        except ZeroDivisionError:
            return 0
    return np.median(rates)

# regular expression to split review on sentences
sentence_splitter = re.compile('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')

# lists of positive/negative smiles
positive_smiles = set([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = set([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

# pattern to catch SUCH WORDS and ignore SuCH :)
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]{1,}[0-9]*\b)')

# contrast conjugations
contrast_conj = set([
'alternatively','anyway','but','by contrast','differ from','elsewhere','even so','however','in contrast','in fact',
'in other respects','in spite of','in that respect','instead','nevertheless','on the contrary','on the other hand',
'rather','though','whereas','yet'])

# to get review "purity" ~ same sentiment over review (~1) or not (~0)
def purity(sentences):
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    return polarities.sum() / np.abs(polarities).sum()

# feature engineering ^-^
def get_custom_features(text):
    # assume text = pd.Series with review text
    print('extracting custom features...')
    tdf = pd.DataFrame()
    tdf['text'] = text 
    tdf['sentences'] = tdf.text.apply(lambda s: re.split(sentence_splitter, s)) # split to sentences
    
    # feature 4 - totally uppercase words (like HOLY JESUS!)
    tdf['upper_word_cnt'] = tdf.text.apply(lambda s: len(re.findall(uppercase_pattern, s)))
    
    # try to extract rating :) like "great film. 9/10" will yield 0.9
    tdf['rating'] = tdf['text'].apply(get_rate).fillna(-1) # feature 5 - rating (if found in review)

    # try to extract smiles and count positive/negative smiles per review (features 6,7)
    tdf['positive_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in positive_smiles]))
    tdf['negative_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in negative_smiles]))
     
    return csr_matrix(tdf[tdf.columns[2:]].values) # to get sparse format

Wall time: 0 ns


In [6]:
extraction_list = []

# 1. custom features
extraction_list.append(['custom_features', 
                             FunctionTransformer(func=get_custom_features,
                                                 validate=False,
                                                 accept_sparse=True
                                                )
                            ])
# 2. simple bag-of-words (tf-idf)
extraction_list.append(['tfidf', 
                             TfidfVectorizer(decode_error='ignore',
                                             max_df=0.75, 
                                             min_df=3,
                                             ngram_range=(1, 3),
                                             max_features=None,
                                             stop_words='english'
                                            )
                            ])

extractor = FeatureUnion(extraction_list)

In [7]:
clf = ExtraTreesClassifier(n_estimators=50, 
                             max_leaf_nodes=None, 
                             verbose=1,
                             min_samples_leaf=3, 
                             random_state=1,
                             n_jobs=-1,
                             class_weight='balanced',
                             criterion='entropy'
                            )

In [8]:
# create pipeline, combining steps together                                                                                                                       

model = Pipeline(
    [
        ('feature_extraction', extractor),
        ('clf', clf)
    ])

In [9]:
%%time
df_small = df.sample(100000)
#df_small = pd.concat([df[df.label == 1].sample(100000, random_state=1),  df[df.label == 0].sample(100000, random_state=1)])
print('class balance:', '\n', df_small.label.value_counts())

class balance: 
 1    86205
0    13795
Name: label, dtype: int64
Wall time: 142 ms


In [10]:
df_small.head()

Unnamed: 0,label,text
254428,1,Gentlemen Prefer BlondesI love old movies and ...
1157438,1,As a kid I loved watching Ma & Pa Kettle and t...
409265,1,One of the all time greats. This is a impressi...
1200956,1,interesting video...well i was expecting more ...
1190002,1,The Hostel movies are all in my opinion bloody...


In [11]:
%%time
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
                                                    df_small.text, 
                                                    df_small.label, 
                                                    test_size=0.1, 
                                                    random_state=42, 
                                                    stratify=df_small.label
                                                   )



# fit model
model.fit(X_train, y_train)
print('finally fitted :)')

#check results on validation
print('Accuracy on validation: {}'.format(accuracy_score(model.predict(X_test), y_test)))

from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(X_test)))

extracting custom features...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   59.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.1min finished


finally fitted :)


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.9s finished


extracting custom features...
Accuracy on validation: 0.9018
Wall time: 3min 50s


In [14]:
%%time
df_test_data = pd.read_csv('test.csv', sep=",", engine='c', usecols=['overall', 'reviewText'])
df_test_data = df_test_data.rename(columns={'overall' : 'label', 'reviewText' : 'text'})
df_test_data.text = df_test_data.text.apply(str)

X, y = df_test_data.text, df_test_data.label  # use binary labels = {0-negative,1-positive}
print('accuracy: {}'.format(accuracy_score(y, model.predict(X))))

print(classification_report(y, model.predict(X))) # и смотреть на значение f1-score в строке для класса 0
#print('f1-score: {}'.format(f1_score(y, model.predict(X))))

extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   14.0s finished


accuracy: 0.9035794036516485
extracting custom features...


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.4s


f1-score: 0.9429582745664852
Wall time: 4min 22s


[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   14.2s finished


In [None]:
import os

filename = 'ExtraTrees_MovTv_5-model.pkl'
try:
    with open(filename, 'wb') as f:
        print('saving model...')
        dill.dump(model, f)
        print('model saved in file {}'.format(os.getcwd() + os.sep + filename))
except:
    print('Errors in model dump...')