In [69]:
import re
import pandas as pd
import nltk
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math

RE_SPACES = re.compile("\s+")
RE_HASHTAG = re.compile("[@#][_a-z0-9]+")
RE_EMOTICONS = re.compile("(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))")
RE_HTTP = re.compile("http(s)?://[/\.a-z0-9]+")

PRINT_EVERY = 10000

In [101]:
train = pd.read_csv('train.csv', sep=',')
train = train.drop(['Id','Product Name', 'Brand Name', 'Price'], axis = 1)
train = train.replace(np.nan, '', regex=True)

X = train.drop('Rating', axis = 1)
y = train.Rating
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.1, random_state = 0)

In [4]:
test = pd.read_csv("test.csv", sep=",")
test = test.drop(['Product Name', 'Brand Name', 'Price'], axis = 1)
test = test.replace(np.nan, '', regex=True)
test.head(10)

Unnamed: 0,Id,Reviews
0,202646,Excelent
1,202647,Fit perfectly. Excelent.
2,202648,"It has a very good relation, price/performance."
3,202649,Like it a lot
4,202650,"Dear , i had a problem wich the wifi senial !!..."
5,202651,Received ok. Very good option
6,202652,It was expected. excellent presentation!
7,202653,excelente 100% recomendado
8,202654,"We use it in Argentine. It works fantastic, fa..."
9,202655,Excellent Product. Excellent Vendor. I truly r...


In [5]:
class Tokenizer():
    @staticmethod
    def tokenize(text):
        pass
    
class BeforeTokenizationNormalizer():
    @staticmethod
    def normalize(text):
        text = text.strip().lower()
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&amp;', '&')
        text = text.replace('&pound;', u'£')
        text = text.replace('&euro;', u'€')
        text = text.replace('&copy;', u'©')
        text = text.replace('&reg;', u'®')
        return text
    
class SimpleTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
        return re.split(RE_SPACES, text)

class NltkTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
        return nltk.tokenize.word_tokenize(text)
    
class CellTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
        tokens = SimpleTokenizer.tokenize(text)
        i = 0
        while i < len(tokens):
            token = tokens[i]
            match = [RE_HASHTAG, RE_EMOTICONS, RE_HTTP]
            tokenmatch = False
            if match[0].search(token) is not None:
                tokenmatch = True
            if match[1].search(token) is not None:
                tokenmatch = True
            if match[2].search(token) is not None:
                tokenmatch = True
            if not tokenmatch:
                del tokens[i]
                tokens[i:i] = NltkTokenizer.tokenize(token)
            i += 1
            
        ps = nltk.stem.PorterStemmer()
        newTokens = []
        for t in tokens:
            newTokens.append(ps.stem(t))
        return newTokens

In [43]:
words = Counter()

print(len(X_train))
i = 0
for index, row in X_train.iterrows():
    if not i % PRINT_EVERY: print(str(i) + '/' + str(len(X_train)))
    i = i + 1
    t = BeforeTokenizationNormalizer.normalize(row['Reviews'])
    words.update(CellTokenizer.tokenize(t))

182380
0/182380
2000/182380
4000/182380
6000/182380
8000/182380
10000/182380
12000/182380
14000/182380
16000/182380
18000/182380
20000/182380
22000/182380
24000/182380
26000/182380
28000/182380
30000/182380
32000/182380
34000/182380
36000/182380
38000/182380
40000/182380
42000/182380
44000/182380
46000/182380
48000/182380
50000/182380
52000/182380
54000/182380
56000/182380
58000/182380
60000/182380
62000/182380
64000/182380
66000/182380
68000/182380
70000/182380
72000/182380
74000/182380
76000/182380
78000/182380
80000/182380
82000/182380
84000/182380
86000/182380
88000/182380
90000/182380
92000/182380
94000/182380
96000/182380
98000/182380
100000/182380
102000/182380
104000/182380
106000/182380
108000/182380
110000/182380
112000/182380
114000/182380
116000/182380
118000/182380
120000/182380
122000/182380
124000/182380
126000/182380
128000/182380
130000/182380
132000/182380
134000/182380
136000/182380
138000/182380
140000/182380
142000/182380
144000/182380
146000/182380
148000/182380
1

In [44]:
len(words)

52875

In [45]:
ignore = [',', '-', ';', "''", '...','.','!', '?', '(', ')', '``', ':']
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him",
            "himself", "his", "how", "i", "in", "into", "is", "it", "its", "itself", "let", "me", "more", "most", "my",
            "myself", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "sha",
            "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves",
            "then", "there", "there's", "these", "they", "this", "those", "through", "to", "until", "up", "very",
            "was", "we", "were", "what", "when", "where", "which", "while", "who","whom", "with", "would", "you",
            "your", "yours", "yourself", "yourselves",
            "n't", "'s", "'ll", "'re", "'d", "'m", "'ve",
            "above", "again", "against", "below", "but", "down", "few", "if", "nor", "off",
            "out", "over", "same", "too", "under", "why"]
#"cannot", "no", "not"
for word in list(words):
    if word in ignore:
        del words[word]
    elif word in stopwords:
        del words[word]
        
#words.most_common(50)

In [112]:
def create_bow(X, features):
    row = []
    col = []
    data = []

    i = 0
    for index, rows in X.iterrows():
        if not i % PRINT_EVERY: print(str(i) + '/' + str(len(X)))
        
        review = BeforeTokenizationNormalizer.normalize(rows['Reviews'])
        review_tokens = CellTokenizer.tokenize(review)
        for token in set(review_tokens):
            if token not in features:
                continue
            row.append(i)
            col.append(features[token])
            data.append(1)
        i = i + 1
           
    return csr_matrix((data, (row, col)), shape=(len(X), len(features)))

In [84]:
min_word_count = 10

common_words = list([k for k, v in words.most_common() if v > min_word_count])
feature_dict = {}
for word in common_words:
    feature_dict[word] = len(feature_dict)

print("Training classifier...")
X_train_bow = create_bow(X_train, feature_dict)

Training classifier...
0/182380
10000/182380
20000/182380
30000/182380
40000/182380
50000/182380
60000/182380
70000/182380
80000/182380
90000/182380
100000/182380
110000/182380
120000/182380
130000/182380
140000/182380
150000/182380
160000/182380
170000/182380
180000/182380


In [85]:
classifier = LinearRegression(n_jobs=-1)
classifier.fit(X_train_bow, y_train_bow)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [None]:
print("Validating...")
X_valid_bow = create_bow(X_valid, feature_dict)

In [97]:
predicted_valid = classifier.predict(X_valid_bow)
predicted_valid = np.around(predicted)
predicted_valid = predicted.astype(int)
predicted_valid[predicted_valid > 5] = 5
predicted_valid[predicted_valid < 1] = 1

print("=================== Results ===================")
print("RMSE ", math.sqrt(mean_squared_error(y_valid, predicted_valid)))

RMSE  0.9897077054442097


In [113]:
print("Testing...")
X_test = create_bow(test, feature_dict)
predicted_test = classifier.predict(X_test)
predicted_test = np.around(predicted)
predicted_test = predicted_test.astype(int)

predicted_test[predicted_test > 5] = 5
predicted_test[predicted_test < 1] = 1
test['Rating'] = predicted_test
test.to_csv("submission.csv", sep=",", columns = ['Id', 'Rating'], index = False)

Testing...
0/211195
10000/211195
20000/211195
30000/211195
40000/211195
50000/211195
60000/211195
70000/211195
80000/211195
90000/211195
100000/211195
110000/211195
120000/211195
130000/211195
140000/211195
150000/211195
160000/211195
170000/211195
180000/211195
190000/211195
200000/211195
210000/211195
