# importando bibliotecas necessárias

In [18]:
# Data Structures
import numpy  as np
import pandas as pd

# Corpus Processing
import re
import nltk.corpus
from unidecode import unidecode
from nltk.tokenize import word_tokenize
import joblib
from nltk import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
import shutil
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import os
import logging

In [20]:
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s %(message)s',
                    handlers=[logging.FileHandler("model.log"),
                              logging.StreamHandler()])

In [23]:
json_path = r'../scrapping_with_scrapy/scrapping_with_scrapy/spiders/post.json'
json_copy_to = 'data/'
try:
    shutil.copy(json_path, json_copy_to)
except Exception as e:
    logging.warning("'\033[91m' não foi possível copiar o arquivo json'\033[0m'")
df = pd.read_json('data/post.json')
df.dropna(inplace=True)

df['plataform'] = df['plataform'].str.replace('\n', '')
df['sumary'] = df['sumary'].str.replace('\n', '')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19722 entries, 0 to 39245
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         19722 non-null  object 
 1   plataform     19722 non-null  object 
 2   release_date  19722 non-null  object 
 3   rate          19722 non-null  float64
 4   sumary        19722 non-null  object 
dtypes: float64(1), object(4)
memory usage: 924.5+ KB


In [46]:
df.describe()

Unnamed: 0,rate
count,19722.0
mean,70.696684
std,12.197481
min,11.0
25%,64.0
50%,72.0
75%,80.0
max,99.0


In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
corpus = df['sumary'].tolist()

In [32]:
# removes a list of words (ie. stopwords) from a tokenized list.
def removeWords(listOfTokens, listOfWords):
    return [token for token in listOfTokens if token not in listOfWords]

# applies stemming to a list of tokenized words
def applyStemming(listOfTokens, stemmer):
    return [stemmer.stem(token) for token in listOfTokens]

# removes any words composed of less than 2 or more than 21 letters
def twoLetters(listOfTokens):
    twoLetterWord = []
    for token in listOfTokens:
        if len(token) <= 2 or len(token) >= 21:
            twoLetterWord.append(token)
    return twoLetterWord

In [33]:
def processCorpus(corpus):
    language = 'english'
    stopwords = nltk.corpus.stopwords.words(language)
    param_stemmer = SnowballStemmer(language)
    other_words = [line.rstrip('\n') for line in open('lists/stopwords_scrapmaker.txt', encoding="utf-8")] # Load .txt file line by line
    

    corpus = corpus.replace(u'\ufffd', '8')   # Replaces the ASCII '�' symbol with '8'
    corpus = corpus.replace(',', '')          # Removes commas
    corpus = corpus.rstrip('\n')              # Removes line breaks
    corpus = corpus.casefold()                # Makes all letters lowercase
    
    corpus = re.sub('\W_',' ', corpus)        # removes specials characters and leaves only words
    corpus = re.sub("\S*\d\S*"," ", corpus)   # removes numbers and words concatenated with numbers IE h4ck3r. Removes road names such as BR-381.
    corpus = re.sub("\S*@\S*\s?"," ", corpus) # removes emails and mentions (words with @)
    corpus = re.sub(r'http\S+', '', corpus)   # removes URLs with http
    corpus = re.sub(r'www\S+', '', corpus)    # removes URLs with www

    listOfTokens = word_tokenize(corpus)
    twoLetterWord = twoLetters(listOfTokens)

    listOfTokens = removeWords(listOfTokens, stopwords)
    listOfTokens = removeWords(listOfTokens, twoLetterWord)
    listOfTokens = removeWords(listOfTokens, other_words)
    
    listOfTokens = applyStemming(listOfTokens, param_stemmer)
    listOfTokens = removeWords(listOfTokens, other_words)

    corpus   = " ".join(listOfTokens)
    corpus = unidecode(corpus)

    return corpus

In [34]:
model = Pipeline([
    ('bow', CountVectorizer(analyzer=processCorpus)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('regressor', LinearRegression())
])

In [37]:
X_train, X_test, y_train, y_test = train_test_split(df['sumary'].values.astype('U'), df['rate'].values.astype('U'), random_state=42, test_size=0.3)

In [38]:
model.fit(X_train, y_train)

In [39]:
predicted = model.predict(X_test)

In [42]:
evaluating = pd.DataFrame({
    "description":X_test,
    "rating":y_test,
    "predicted":predicted
})
evaluating

Unnamed: 0,description,rating,predicted
0,Reservoir Dogs is base...,50.0,69.711156
1,Set in a gothic-noir u...,59.0,70.673879
2,Large and lovable Moig...,64.0,70.627724
3,Sonic Mania Plus is th...,91.0,70.684408
4,Maquette is a first-p...,70.0,71.179336
...,...,...,...
5912,Enter the mind of The ...,69.0,69.058406
5913,The story is set sever...,89.0,70.667214
5914,The story begins in 19...,67.0,69.315023
5915,"Well, Agent, it looks ...",83.0,71.704660


In [41]:
joblib.dump(model, 'model.joblib')

['model.joblib']