# 1. Importing Libraries

In [33]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import json
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
import numpy as np
from matplotlib import pyplot

In [2]:
# load snaowball Stemmer
sbEsp = SnowballStemmer('spanish')

In [3]:
def preprocessing(scentence):
    #We only want to work with lowercase for the comparisons
    scentence = scentence.lower()

    #remove punctuation and split into seperate words
    words = re.findall(r'\w+', scentence,flags = re.UNICODE)
    return ' '.join([sbEsp.stem(item) for item in words])

# 2. Read Dataset

In [28]:
with open('reviews.json', encoding = 'utf-8') as json_file:
    data = json.load(json_file)

# 3. Preparing the Data

#### 3.1 read the needed columns

In [29]:
dataset = []
for paper in data['paper']:
    for review in paper['review']:
        if review['lan'] == 'es' and review['text']:
            # preprocessing the text
            text = preprocessing(review['text'])
            dataset.append([review['evaluation'], text])

In [30]:
dataframe = pd.DataFrame(dataset, columns=["evaluation", "text"])

In [31]:
dataframe.head(10)

Unnamed: 0,evaluation,text
0,1,el articul abord un problem contingent y muy r...
1,1,el articul present recomend practic par el des...
2,1,el tem es muy interes y pued ser de much ayud ...
3,2,se explic en form orden y didact una experient...
4,2,los autor describ una metodolog par desarroll ...
5,2,este trabaj propon un nuev enfoqu bas en 25 pa...
6,2,se realiz un trabaj de model de encript cuanti...
7,-2,este pap involucr el desarroll de una aplic qu...
8,2,noved propuest bien estructur correct escrit s...
9,2,el tem es interes y esta bien estructur en lo ...


In [34]:
stopwords = set(stopwords.words('spanish'))

#### 3.2 Convert the text to numerical data using Unigram bag-of-words

In [35]:
vectorizer = CountVectorizer(stop_words=stopwords)
vectorized = vectorizer.fit_transform(dataframe['text'])

In [36]:
feature_names = vectorizer.get_feature_names()

In [37]:
X = vectorized.toarray()

In [38]:
y = dataframe.evaluation
y

0       1
1       1
2       1
3       2
4       2
       ..
377    -1
378    -1
379    -1
380     1
381     1
Name: evaluation, Length: 382, dtype: object

#### 3.3 split the dataset into Train and Test set

In [39]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 4. Training the LinearRegression

In [40]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

# 5. Making Predictions

In [50]:
print("Length of Test data is : {}".format(len(X_test)))

Length of Test data is : 77


In [41]:
y_pred = regressor.predict(X_test)

In [53]:
# Linear Regression predicts the evaluation value from the test set
# actual column is the same as evaluation column in Dataset
df = pd.DataFrame({'Actual Review': y_test, 'Predicted Review': y_pred})
df

Unnamed: 0,Actual Review,Predicted Review
281,-2,-4.366314
122,1,0.332616
353,1,-0.657565
324,-2,-2.982346
173,2,0.687870
...,...,...
238,1,0.692250
240,-1,-0.946365
235,0,0.849922
56,0,0.760405


# 6. Evaluating the Algorithm

In [43]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 1.2950624128928987
Mean Squared Error: 3.3903031130826737
Root Mean Squared Error: 1.8412775763264684


# 7. Give the most important 10 features

#### 7.1 Display the most important 10 features for positive review

In [56]:
print("The most important 10 features for positive review:\n")
for i in (regressor.coef_).argsort()[::-1][:10]:
    print("Feature : {} \t, Importance : {}".format(feature_names[i], np.abs(regressor.coef_[i])))

The most important 10 features for positive review:

Feature : gust 	, Importance : 0.42371618721324383
Feature : clarid 	, Importance : 0.3702371131821889
Feature : buen 	, Importance : 0.3568649684556298
Feature : motiv 	, Importance : 0.33958112423873155
Feature : previ 	, Importance : 0.334299071646675
Feature : mostr 	, Importance : 0.31266237188409446
Feature : actual 	, Importance : 0.3100917070215693
Feature : adjunt 	, Importance : 0.3094197399507518
Feature : tal 	, Importance : 0.30760950003073095
Feature : caracterist 	, Importance : 0.2899963351039534


#### 7.2 Display the most important 10 features for negative review

In [55]:
print("The most important 10 features for negative review:\n")
for i in (-regressor.coef_).argsort()[::-1][:10]:
    print("Feature : {} \t, Importance : {}".format(feature_names[i], np.abs(regressor.coef_[i])))

The most important 10 features for negative review

Feature : nul 	, Importance : 0.6786741705609403
Feature : exist 	, Importance : 0.5753291642011342
Feature : cientif 	, Importance : 0.5743425043181307
Feature : aport 	, Importance : 0.5113338865125555
Feature : sid 	, Importance : 0.44225134472246863
Feature : solucion 	, Importance : 0.4160958260530073
Feature : explicit 	, Importance : 0.4107089714098794
Feature : format 	, Importance : 0.38188972616656713
Feature : investig 	, Importance : 0.3779468566636736
Feature : mal 	, Importance : 0.3622090953003124
