In [95]:
import pandas as pd
import numpy as np
import spacy
import regex as re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.ensemble import (RandomForestClassifier, 
                              GradientBoostingClassifier,
                              ExtraTreesClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import (MultinomialNB, 
                                 GaussianNB)
from sklearn.model_selection import train_test_split
from sklearn.base import (BaseEstimator, 
                          TransformerMixin)
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfVectorizer)
from sklearn.pipeline import Pipeline

#Evaluation
from sklearn import metrics
from yellowbrick.classifier import (roc_auc, 
                                    confusion_matrix, 
                                    classification_report)

import warnings
warnings.filterwarnings("ignore")

In [38]:
#loading english large model for Word vectorazing
nlp=spacy.load('en_core_web_lg')

# List of stopwords

In [60]:
nltk.download("stopwords")
nltk.download("punkt")
my_stop_words=stopwords.words("english")

words2keep=["don", "don't", "ain", "aren", "aren't", "couldn", "couldn't", "didn",
"didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn’t", "ma",
"mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "no", "nor", "not", "shouldn'",
"shouldn't", "wasn'", "wasn't", "weren'", "weren't", "won", "won't", "wouldn'", "wouldn't"]
my_stop_words=[word for word in my_stop_words if word not in words2keep]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenyenpin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chenyenpin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [61]:
# Main functions for the project

def eda(df:pd.DataFrame):
  '''
  Retornara um analise exploratoria simples

  -param df: pandas dataframe
  '''
  print("-"*30,"DataFrame Shape", "-"*30)
  print(df.shape)
  print("-"*30,"DataFrame info", "-"*30)
  print(df.info())
  print("-"*30,"DataFrame Nan", "-"*30)
  print(df.isna().sum())
  print("-"*30,"DataFrame duplicated", "-"*30)
  print(df.duplicated().sum())
  print("-"*30,"DataFrame nunique", "-"*30)
  print(df.nunique())
  print("-"*30,"DataFrame value counts", "-"*30)
  print(f"{df['rating_seg'].value_counts(normalize=True)}")



def rating_seg(df: pd.DataFrame)->pd.DataFrame:
  conditions=[
    (df["Rating"]<=2),
    (df["Rating"]>3),
    (df["Rating"]==3)
  ]
  values=[0,2,1]
  df["rating_seg"]=np.select(conditions, values, default=np.nan)
  return df

def remove_stop_words(column):
  tokenize=word_tokenize(column)
  column_withno_stopwords=[token for token in tokenize  if not token in my_stop_words ]
  new_sentence=" ".join(column_withno_stopwords)
  return new_sentence


def convert_lower_case(column):
  '''
  Funcao que retorna a coluna selecionada para letra minuscula
  -param column: coluna da pd.Dataframe
  '''
  return column.lower()


def spacy_tokenizer(doc):
  '''
  Funcao que remove as pontuacoes, novas linhas lematiza nosso texto e apenas letras do alfabeto
  '''
  unwanted_pipes = ["ner", "parser"]
  with nlp.disable_pipes(*unwanted_pipes):
    return " ".join([t.lemma_ for t in nlp(doc) if \
            not t.is_punct and \
            not t.is_space and \
            t.is_alpha])

In [80]:
#Import spotify review
df=pd.read_csv("/Users/chenyenpin/Documents/reviews_feedback/dataset/reviews.csv",
               usecols=["Review","Rating"])
df.head()

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",5
1,Please ignore previous negative rating. This a...,5
2,"This pop-up ""Get the best Spotify experience o...",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't ...,1


# Shuffle dataset

In [81]:
df_shuffle=df.sample(frac=1, random_state=101).copy()
df_shuffle

Unnamed: 0,Review,Rating
28812,The problems with Lisa's songs have happened t...,1
8989,It was good but there was a lot of ads.this ha...,4
57107,Thank you Spotify for supporting free speech a...,5
46461,I use Spotify all the time I listen to music p...,1
14434,Love the way you can add different playlists. ...,5
...,...,...
5695,Its work great. I highly recommend spotify add...,2
8006,The search bar keeps telling me it can't find ...,3
17745,Very good quality of product 👌,4
17931,Best playlist ever🤝,5


In [82]:
#rating segmentation 0->negative, 1->neutral, 2->good
df_shuffle=rating_seg(df_shuffle)

In [84]:
eda(df_shuffle)

------------------------------ DataFrame Shape ------------------------------
(61594, 3)
------------------------------ DataFrame info ------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 61594 entries, 28812 to 45919
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Review      61594 non-null  object 
 1   Rating      61594 non-null  int64  
 2   rating_seg  61594 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.9+ MB
None
------------------------------ DataFrame Nan ------------------------------
Review        0
Rating        0
rating_seg    0
dtype: int64
------------------------------ DataFrame duplicated ------------------------------
201
------------------------------ DataFrame nunique ------------------------------


Review        61356
Rating            5
rating_seg        3
dtype: int64
------------------------------ DataFrame value counts ------------------------------
rating_seg
2.0    0.486038
0.0    0.402166
1.0    0.111797
Name: proportion, dtype: float64


In [85]:
df_shuffle["Review"][10]

'Love Spotify, and usually this app is the best, but as others have stated; the control buttons disappear and podcasts stop for no reason, which is only fixed by deleting and reinstalling the app. First world problems for sure, but highly annoying nonetheless, especially when paying for prime.'

# Data Cleaning / Preparation

## Lower case

In [86]:
df_shuffle["Review"]=df_shuffle["Review"].apply(lambda x :convert_lower_case(x))

## Removing stopwords



In [87]:
df_shuffle["review_clean"]=df_shuffle["Review"].apply(lambda x : remove_stop_words(x))
print(len(df_shuffle["Review"][10]))
print(len(df_shuffle["review_clean"][10]))

293
219


## Lemmatization and clean the punctuation

In [88]:
df_shuffle["review_clean"]=df_shuffle["Review"].apply(lambda x: spacy_tokenizer(x))


In [89]:
df_shuffle.head()

Unnamed: 0,Review,Rating,rating_seg,review_clean
28812,the problems with lisa's songs have happened t...,1,0.0,the problem with lisa song have happen twice a...
8989,it was good but there was a lot of ads.this ha...,4,2.0,it be good but there be a lot of have a lot of...
57107,thank you spotify for supporting free speech a...,5,2.0,thank you spotify for support free speech and ...
46461,i use spotify all the time i listen to music p...,1,0.0,I use spotify all the time I listen to music p...
14434,love the way you can add different playlists. ...,5,2.0,love the way you can add different playlist de...


'very good quality of product'

In [93]:
class SpacyTokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.tokenize(doc) for doc in X]

    def tokenize(self, doc):
        '''
        Function that removes punctuation, new lines, lemmatizes our text, and keeps only alphabetical letters
        '''
        unwanted_pipes = ["ner", "parser"]
        with self.nlp.disable_pipes(*unwanted_pipes):
            return " ".join([t.lemma_ for t in self.nlp(doc) if \
                             not t.is_punct and \
                             not t.is_space and \
                             t.is_alpha])
