In [1]:
import pandas as pd 
import numpy as np
import preprocess_kgptalkie as ps 
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('Train_Data.csv', usecols = ['customer_reviews','average_review_rating'])
df.head()

Unnamed: 0,customer_reviews,average_review_rating
0,Item was broken // 3.0 // 2 April 2014 // By\n...,4.5
1,A wonderful collectors piece // 5.0 // 31 Aug....,4.6
2,Great sticker collection // 5.0 // 19 Jun. 201...,4.7
3,Rock and roll cards // 5.0 // 28 July 2013 // ...,5.0
4,Little fingers big imagination // 5.0 // 4 Mar...,4.2


In [3]:
df['average_review_rating'].value_counts()

5.0    3870
4.0     999
4.5     538
4.8     425
4.7     389
4.3     299
4.6     282
4.4     234
4.9     173
4.2     159
4.1     106
3.9       3
3.3       2
3.6       2
3.5       2
3.7       2
3.0       1
Name: average_review_rating, dtype: int64

In [4]:
def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = ps.cont_exp(x)
    x = ps.remove_emails(x)
    x = ps.remove_urls(x)
    x = ps.remove_rt(x)
    x = ps.remove_accented_chars(x)
    x = ps.remove_special_chars(x)
    x = re.sub("(.)\\1{2,}", "\\1", x)
    return x

In [5]:
df['customer_reviews']=df['customer_reviews'].apply(lambda x: get_clean(x))

In [6]:
df.head()

Unnamed: 0,customer_reviews,average_review_rating
0,item was broken 30 2 april 2014 by susie q on ...,4.5
1,a wonderful collectors piece 50 31 aug 2013 by...,4.6
2,great sticker collection 50 19 jun 2011 by f s...,4.7
3,rock and roll cards 50 28 july 2013 by tricia ...,5.0
4,little fingers big imagination 50 4 mar 2014 b...,4.2


In [7]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,3), analyzer='char')

In [8]:
x = tfidf.fit_transform(df['customer_reviews'])
y = df['average_review_rating']

In [9]:
x.shape, y.shape

((7486, 10000), (7486,))

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2, random_state=42)

In [11]:
xtrain.shape

(5988, 10000)

In [12]:
reg = SVR()
reg.fit(xtrain,ytrain)

SVR()

In [13]:
y_pred = reg.predict(xtest)
test_rmse_score = mean_squared_error(ytest, y_pred, squared=False)
train_pred=reg.predict(xtrain)
train_rmse_score = mean_squared_error(ytrain, train_pred, squared=False)

In [14]:
print(f'The RMSE SCORE for training data is {train_rmse_score} and for test is {test_rmse_score}')

The RMSE SCORE for training data is 0.1426648611436532 and for test is 0.20198889787978735


In [15]:
testdf = pd.read_csv("test.csv", usecols = ['customer_reviews'] )
testdf.head()

Unnamed: 0,customer_reviews
0,Dolls house miniatures // 5.0 // 9 Jan. 2013 /...
1,Tangled // 4.0 // 16 May 2011 // By\n \n ...
2,Okay but that's all // 1.0 // 11 July 2011 // ...
3,Good quality but take down carefully as the fo...
4,hilarious // 5.0 // 11 Dec. 2013 // By\n \n...


In [16]:
testdf['customer_reviews']=testdf['customer_reviews'].apply(lambda x: get_clean(x))

In [17]:
testdf.head()

Unnamed: 0,customer_reviews
0,dolls house miniatures 50 9 jan 2013 by eileen...
1,tangled 40 16 may 2011 by jenna on 16 may 2011...
2,okay but that is all 10 11 july 2011 by h vinc...
3,good quality but take down carefully as the fo...
4,hilarious 50 11 dec 2013 by lauramcv on 11 dec...


In [18]:
testdf.shape

(2496, 1)

In [20]:
tfidf1 = TfidfVectorizer(max_features=10000, ngram_range=(1,4), analyzer='char')
x = tfidf1.fit_transform(testdf['customer_reviews'])
submission=reg.predict(x)

In [21]:
submission

array([4.80533003, 4.819065  , 4.76732863, ..., 4.81560264, 4.80409939,
       4.83109728])

In [22]:
subdf = pd.DataFrame(submission)
subdf.to_csv('sub_1.csv')
