In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re  # Librería de expresiones regulares
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
# Crea bolsa de palabras y vectoriza (califica con valores numérico palabras del lenguaje ordinario)
from sklearn.ensemble import RandomForestClassifier  
from sklearn import svm # Importando maquinas de soporte vectorial 
from sklearn.linear_model import LogisticRegression

In [2]:
RandomForestClassifier

sklearn.ensemble._forest.RandomForestClassifier

In [3]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [4]:
data = pd.read_csv('Reviews.csv')[['Score','Summary','Text']]

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Score    568454 non-null  int64 
 1   Summary  568427 non-null  object
 2   Text     568454 non-null  object
dtypes: int64(1), object(2)
memory usage: 13.0+ MB


### Sampling and data cleaning

In [6]:
df = data.sample(frac=0.2) # Toma el 20% de los datos de forma aleatoria

In [7]:
dicc = {'5':3,'4':3,'3':2,'2':1,'1':1}
df.Score =df.Score.astype(str).map(dicc)

In [8]:
df.dropna(inplace=True)

In [9]:
# Compensando las clases de los datos
n = df['Score'].value_counts().min()
df = df.groupby('Score').head(n)

In [10]:
df[df.Summary.isna()]

Unnamed: 0,Score,Summary,Text


In [11]:
df['Score'].value_counts()

3    8455
1    8455
2    8455
Name: Score, dtype: int64

In [12]:
# Separando las variables predictoras y la variable objetivo
X = df.Summary.reset_index(drop=True)
y = df.Score.reset_index(drop=True)

In [13]:
from nltk.stem import WordNetLemmatizer
documents = []

#from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters -
    document = re.sub(r'W', ' ', str(X[sen]))
    
    # remove all single characters -
    document = re.sub(r's+[a-zA-Z]s+', ' ', document)
    
    # Remove single characters from the start -
    document = re.sub(r'^[a-zA-Z]s+', ' ', document) 
    
    # Substituting multiple spaces with single space -
    document = re.sub(r's+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'    ?
    document = re.sub(r'^bs+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document=" ".join(document)
    
    documents.append(document)

### Bag of words and vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfivec = TfidfVectorizer(max_features=1500, min_df= 25, max_df= 0.33,
                        stop_words= stopwords.words('english'))
X = tfivec.fit_transform(documents).toarray()

### Test and training

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.3, random_state= 5)

### Classification

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(X_train, y_train)

In [16]:
clf_log = LogisticRegression(random_state= 20)
clf_log.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=20)

In [17]:
classifier = RandomForestClassifier(n_estimators=100, random_state=20)
classifier.fit(X_train, y_train) 

RandomForestClassifier(random_state=20)

In [18]:
y_pred = clf_log.predict(X_test)
y_predF = classifier.predict(X_test)

### Evaluation

In [19]:
from sklearn.metrics import f1_score
print(clf_log.score(X_test, y_test))
print(f1_score(y_test, y_pred, average=None))

0.5879106438896189
[0.59920058 0.51109215 0.64717582]


In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1649  535  344]
 [ 827 1198  506]
 [ 500  424 1627]]
              precision    recall  f1-score   support

           1       0.55      0.65      0.60      2528
           2       0.56      0.47      0.51      2531
           3       0.66      0.64      0.65      2551

    accuracy                           0.59      7610
   macro avg       0.59      0.59      0.59      7610
weighted avg       0.59      0.59      0.59      7610

0.5879106438896189


In [21]:
print(confusion_matrix(y_test,y_predF))
print(classification_report(y_test,y_predF))
print(accuracy_score(y_test, y_predF))

[[1663  511  354]
 [ 749 1316  466]
 [ 526  407 1618]]
              precision    recall  f1-score   support

           1       0.57      0.66      0.61      2528
           2       0.59      0.52      0.55      2531
           3       0.66      0.63      0.65      2551

    accuracy                           0.60      7610
   macro avg       0.61      0.60      0.60      7610
weighted avg       0.61      0.60      0.60      7610

0.6040735873850197


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9ac2f155-fc59-4dad-9a76-59ef87abc2f6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>