In [22]:
import pandas as pd
import numpy as np
import re


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import accuracy_score

# NLTK for text mining
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#### Load the data

In [23]:
df = pd.read_csv('./Reviews.csv')

#filling missing text
df.Text.fillna('', inplace=True)
df.Summary.fillna('', inplace=True)

features = list(df.keys())
features.remove('Score')
target = 'Score'

# Selecting 56 K reviews for faster testing
# comment to apply to the whole data
df, _, _, _ = train_test_split(df, df[target], test_size=0.90, random_state=42)


# Stratified sampling to train and test data
x_train, x_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)
#x_train, x_val, y_train, y_val = train_test_split(x_train,y_train, test_size=0.2, random_state=42)

In [24]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
50249,50250,B000FBQ50I,A1SB8CXAUIKT8X,Rex N. Mills,3,3,5,1184544000,Very Good!,Great texture and balance of sweet to butter i...
94817,94818,B0029NM8KQ,A1X2KI19WPUN1B,"Patricia C. Smith ""pspuddlejumper""",1,1,5,1316304000,My dog loves Cesar Grilled Chicken,I had looked for years for something that my d...
305269,305270,B005PIJQC0,A1KSPYESOGAZ0I,Jon,0,0,5,1298505600,Good Hydrations,I've used Cytomax for the past couple of years...
126046,126047,B001DWEFMS,A1XGFW5016CGQI,Cathio,0,0,5,1301529600,FROSTING IS LIKE OLD FASHIONED HOME MADE,We just love these mixes. They are all delicio...
445379,445380,B001JP7G4I,A102MAW3UT9B9P,"Martin Alesia ""Marty 4242""",0,0,1,1344038400,Don't Buy!,dont waste your money. this is a piece of junk...


In [25]:
df.shape

(56845, 10)

#### Helper class for text preprocessing
- Remove some noise figures.
- Apply lemmatization then stemming
- Apply TFIDF

In [26]:
class text_processor():

    def __init__(self, max_features=None):
        self.vectorizer = TfidfVectorizer()
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        
        self.special = {'<br />':' ', "n't":' not '}
        self.digits = {' 0 ': ' zero ', ' 1 ': ' one ', ' 2 ':' two ', ' 3 ': ' three ', ' 4 ':' four ',
                  ' 5 ': ' five ', ' 6 ': ' six ', ' 7 ':' seven ', ' 8 ':' eight ', ' 9 ':' nine ', ' 10 ':' ten '}
        
        self.stop_words = set(['the','a','an','and', 'this','that','these','those','then','through','about','for','is','of','during'])

    def clean(self, text):
        text = str(text)

        for key, value in self.special.items():
            text = text.replace(key, value)
        for key, value in self.digits.items():
            text = text.replace(key, value)

        text = text.lower() 
        text = re.sub(r"[^a-z]", " ", text) #Get rid of emoticons 
        text = [self.stemmer.stem(self.lemmatizer.lemmatize(w)) for w in text.split() if not w in self.stop_words]
        text = ' '.join(text)

        return text


    def fit(self, series):
        text = series.apply(self.clean)
        self.vectorizer.fit(text)

    
    def transform(self, series):
        text = series.apply(self.clean)
        return self.vectorizer.transform(text)
    
    def fit_transform(self, series):
        text = series.apply(self.clean)
        self.vectorizer.fit(text)
        return self.vectorizer.transform(text)

In [27]:
p = text_processor()

#### Preprocess text

In [28]:
#for text in x_train['Text']:
 #   print("Type", type(text))
  #  print("\n")
   # print(text)
    #print("\n")
x_text = p.fit_transform(x_train['Text'])
xt_text = p.transform(x_test['Text'])

<class 'pandas.core.series.Series'>


100759    This product is no better than any other bottl...
312773    I gave this as a gift with Kuchen Meister Limo...
428583    My lab goes wild for these and I am almost tem...
221341    So a friend of mine from Japan who I have been...
280790    These peanut bars are the very best around. Th...
138851    Spilled my bottle on the floor and my dog lick...
560704    Oh, this is some good cat food!<br />Have you ...
165815    I ordered these beans back in December to make...
168484    Senseo recently released several flavored vari...
488507    I Bake these cookie and like them but I can do...
157255    I ENJOYED THE COMPLEATS CAUSE WAS QUICK AND EA...
427111    Up until this week, I have LOVED Rockstar Juic...
307750    Just bought this when we ran out of our usual ...
404514    This is a great cereal for gluten free diets o...
175786    For many months very few stores have been stoc...
400961    I obtained this product because we have cats a...
42

#### Preprocess summary

In [29]:
x_summary = p.fit_transform(x_train['Summary'])
xt_summary = p.transform(x_test['Summary'])

#### Sparse concatenation to add other features to the rather than text

In [30]:
from scipy.sparse import hstack

In [31]:
x = hstack([x_text, x_summary, x_train.Time.values.reshape(-1,1), x_train.HelpfulnessNumerator.values.reshape(-1,1), x_train.HelpfulnessDenominator.values.reshape(-1,1)])
xt = hstack([xt_text, xt_summary, x_test.Time.values.reshape(-1,1), x_test.HelpfulnessNumerator.values.reshape(-1,1), x_test.HelpfulnessDenominator.values.reshape(-1,1)])

#### Select the best features

In [32]:
# Seelecting 1000 features only as TFIDF produce high number of features
selector = SelectKBest(chi2, k=1000) ###Select the most highly correlated features based on chi2 (statistical test for a significance of a feature)
x_train = selector.fit_transform(x, y_train)
x_test = selector.transform(xt)

#### logistic regression without Kmeans or Gentic Algorithm

In [33]:
RF = LogisticRegression(solver="liblinear", multi_class="ovr", n_jobs=-1)
RF.fit(x_train, y_train)

pred = RF.predict(x_train)
print('Training accuracy', accuracy_score(y_train, pred))

pred = RF.predict(x_test)
print('Test accuracy', accuracy_score(y_test, pred))

Training accuracy 0.639568123846
Test accuracy 0.636555545782


#### logistic regression with Kmeans only

In [34]:
from sklearn.cluster import KMeans
# Taking 5 clusters through k means
n_clusters = len(np.unique(y_train))
clf = KMeans(n_clusters = n_clusters, random_state=42)
clf.fit(x_train)
y_labels_train = clf.labels_
y_labels_test = clf.predict(x_test)

# add the new labels as new features to the input
x_train = np.hstack([x_train.toarray(),y_labels_train.reshape((-1,1))]) ###We add the new k-means labels to the original features
x_test =  np.hstack([x_test.toarray(),y_labels_test.reshape((-1,1))])

RF2 = LogisticRegression(solver="liblinear", multi_class="ovr", n_jobs=-1)
RF2.fit(x_train, y_train)

pred = RF2.predict(x_train)
print('Training accuracy', accuracy_score(y_train, pred))

pred = RF2.predict(x_test)
print('Test accuracy', accuracy_score(y_test, pred))

Training accuracy 0.639568123846
Test accuracy 0.636555545782


#### logistic regression + GA feature selection

In [35]:
# Seelecting 1000 features only as TFIDF produce high number of features
selector = SelectKBest(chi2, k=1000)
x_train = selector.fit_transform(x, y_train) ####Test data must be hidden, during feature selection calculation 
#xv = selector.transform(xv)
x_test = selector.transform(xt) 

In [36]:
# taken from git+https://github.com/manuel-calzolari/sklearn-genetic.git

In [37]:
from genetic_selection import GeneticSelectionCV
RF3 = LogisticRegression(solver="liblinear", multi_class="ovr")
GA       = GeneticSelectionCV(RF3,
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              max_features=20,
                              n_population=50,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=40,
                              crossover_independent_proba=0.5,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              caching=True,
                              n_jobs=-1)
selector = GA.fit(x_train, y_train)

pred = GA.predict(x_train)
print('Training accuracy', accuracy_score(y_train, pred))

pred = GA.predict(x_test)
print('Test accuracy', accuracy_score(y_test, pred))

Selecting features with genetic algorithm.
gen	nevals	avg                  	std                        	min              	max              
0  	50    	[-10000.      505.46]	[  0.          14.13960395]	[-10000.    473.]	[-10000.    551.]
1  	34    	[-10000.      496.58]	[  0.          14.13236003]	[-10000.    458.]	[-10000.    542.]
2  	34    	[-10000.      487.04]	[  0.          12.27022412]	[-10000.    458.]	[-10000.    516.]
3  	23    	[-10000.      476.56]	[  0.          12.04518161]	[-10000.    454.]	[-10000.    505.]
4  	30    	[-10000.      465.32]	[  0.          10.74139656]	[-10000.    446.]	[-10000.    489.]
5  	32    	[-10000.      456.12]	[ 0.          7.64104705]  	[-10000.    436.]	[-10000.    476.]
6  	29    	[-10000.      451.64]	[ 0.          8.40180933]  	[-10000.    432.]	[-10000.    476.]
7  	34    	[-10000.      447.26]	[ 0.          7.11564474]  	[-10000.    428.]	[-10000.    466.]
8  	29    	[-10000.      442.92]	[ 0.          9.33775134]  	[-10000.    419.]	[-100

#### logistic regression + GA feature selection + Kmeans

In [38]:
# Selecting 1000 features only as TFIDF produce high number of features
selector = SelectKBest(chi2, k=1000)
x_train = selector.fit_transform(x, y_train)
x_test = selector.transform(xt)

# add the new labels of kmeans as new features to the input
x_train = np.hstack([x_train.toarray(),y_labels_train.reshape((-1,1))])
x_test =  np.hstack([x_test.toarray(),y_labels_test.reshape((-1,1))])

In [39]:
from scipy import sparse

In [40]:
x_train = sparse.csr_matrix(x_train)
x_test = sparse.csr_matrix(x_test)

In [41]:
selector = GA.fit(x_train, y_train)

pred = GA.predict(x_train)
print('Training accuracy', accuracy_score(y_train, pred))

pred = GA.predict(x_test)
print('Test accuracy', accuracy_score(y_test, pred))

Selecting features with genetic algorithm.
gen	nevals	avg                  	std                        	min              	max              
0  	50    	[-10000.      500.72]	[  0.          13.10731094]	[-10000.    469.]	[-10000.    529.]
1  	35    	[-10000.     488.9]  	[  0.          14.51654229]	[-10000.    451.]	[-10000.    516.]
2  	24    	[-10000.      477.28]	[  0.          13.14844477]	[-10000.    451.]	[-10000.    518.]
3  	34    	[-10000.      466.48]	[  0.          11.66917306]	[-10000.    440.]	[-10000.    492.]
4  	26    	[-10000.      457.94]	[ 0.          8.59164711]  	[-10000.    437.]	[-10000.    475.]
5  	24    	[-10000.      451.86]	[ 0.          9.31452629]  	[-10000.    425.]	[-10000.    474.]
6  	31    	[-10000.      446.52]	[ 0.          9.66072461]  	[-10000.    425.]	[-10000.    472.]
7  	27    	[-10000.      439.88]	[ 0.          7.46361843]  	[-10000.    425.]	[-10000.    457.]
8  	33    	[-10000.      435.02]	[ 0.          7.31160721]  	[-10000.    421.]	[-100

In [42]:
#def comment_to_rating(text, GA):
 #   text = [text]
  #  text = pd.Series(text)
   # print(type(text))
    #print(text)
    #pre_process = text_processor()
    #x_text = pre_process.transform(text)
    #return GA.predict(x_text)
#text = "I’ve been buying ACV for several months both from amazon and other stores but the order I received today is either not the real apple cider vinegar or is a faulty batch. The colour is different and much darker. The taste is different and tastes more like malt vinegar. And the mother should be strands of strings not clump of sand. Not impressed and will cancel my subscription for this item."
#print("Rating", comment_to_rating(text, GA))