## Yelp Reviews Sentiment Predictor
### Developer - Ashima Munjal

In [1]:
# Loading dependencies.
import pandas as pd
import time
import re
import pandas as pd
import pickle
import numpy as np

# Natural Language Processing Libraries.
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.stem import PorterStemmer

# Machine Learning Libraries.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

#### Loading Reviews Data.
#### The dataset for this project was taken from [Kaggle](https://www.kaggle.com/yelp-dataset/yelp-dataset).

In [2]:
strt_time = time.time()
data = pd.read_csv("yelp_data\\yelp_review.csv")
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


#### Selecting a balanced dataset.

In [3]:
data_5 = data[data.stars.isin([5])]
data_5 = data_5.head(20000)

In [4]:
data_4 = data[data.stars.isin([4])]
data_4 = data_4.head(20000)

In [5]:
data_3 = data[data.stars.isin([3])]
data_3 = data_3.head(20000)

In [6]:
data_2 = data[data.stars.isin([2])]
data_2 = data_2.head(20000)

In [7]:
data_1 = data[data.stars.isin([1])]
data_1 = data_1.head(20000)

In [8]:
frames = [data_5, data_4, data_3, data_2, data_1]
final_df = pd.concat(frames)

#### Selecting the required columns.

In [9]:
data_df = final_df[['stars', 'text']]
print("Data Shape:", data_df.shape)
data_df.head()

Data Shape: (100000, 2)


Unnamed: 0,stars,text
0,5,Super simple place but amazing nonetheless. It...
1,5,Small unassuming place that changes their menu...
2,5,Lester's is located in a beautiful neighborhoo...
5,5,Cycle Pub Las Vegas was a blast! Got a groupon...
9,5,Love this place!\n\nPeggy is great with dogs a...


#### Applying Text Processing.

In [10]:
def clean_text(txt):
    
    # Compile Regex Information.
    # URL Regex.
    url_reg = re.compile(r'https?://(www.)?\w+\.\w+(/\w+)*/?')
    
    # Regex to treat "@mentions".
    mention_reg = re.compile(r'@(\w+)')

    # Remove hyperlinks.
    txt = url_reg.sub(' ', txt)
    
    # Remove text containing "@mentions".
    txt = mention_reg.sub(' ', txt)

    # Removing punctuations.
    txt = re.sub('[^a-zA-Z]', ' ', txt)

    # Convert to lower case.
    txt = txt.lower()

    # Remove tags.
    txt = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", txt)

    # Remove special characters and digits.
    txt = re.sub("(\\d|\\W)+", " ", txt)

    # Convert to list from string by splitting on "space" character.
    txt = txt.split(" ")

    # Remove empty strings.
    txt = [wrd for wrd in txt if wrd != ""]
    
    # Form sentences from words.
    txt = " ".join(txt)

    return txt

# Applying Text Processing to data.
data_df["Cleaned_Text"] = data_df["text"].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
data_file = data_df["Cleaned_Text"].values.tolist()

In [12]:
# Storing Training Data File.
file = open('training_file.txt','w')
for ele in data_file:
    try:
        file.write(str(ele) + '\n')
    except:
        pass
file.close()

In [13]:
data_label = data_df["stars"].values.tolist()

In [14]:
# Storing Training Label File.
file = open('training_label_file.txt','w')
for ele in data_label:
    try:
        file.write(str(ele) + '\n')
    except:
        pass
file.close()

#### Loading the stored training data.

In [15]:
# Loading training data.
training_file = open("training_file.txt","r") 
data_train = []
for line in training_file.readlines():
    data_train.append(line)

# Loading labels.
training_labl_file = open("training_label_file.txt","r") 
data_label = []
for line in training_labl_file.readlines():
    data_label.append(int(line.strip("\n")))

# Creating dataframe.
data_df = pd.DataFrame({"text":data_train,
                        "stars":data_label})
print("Training Data Shape:", data_df.shape)
data_df.head()

Training Data Shape: (100000, 2)


Unnamed: 0,text,stars
0,super simple place but amazing nonetheless it ...,5
1,small unassuming place that changes their menu...,5
2,lester s is located in a beautiful neighborhoo...,5
3,cycle pub las vegas was a blast got a groupon ...,5
4,love this place peggy is great with dogs and d...,5


#### Filtering Stopwords.

In [16]:
def filter_stopwords(txt):
   
    # Stop Words list from "NLTK" library.
    stop_wrds = ["out", "we", "was", "how", "myself", "for", "they", "about", "hasn't",
                 "then", "both", "so", "re", "don", "m", "as", "any", "mightn", "after",
                 "you", "wouldn", "why", "been", "where", "by", "isn't",
                 "yourself", "wasn", "a", "haven't", "did", "hadn't", "their", "hasn",
                 "doing", "be", "further", "ours", "now", "am", "her", "you'll",
                 "yourselves", "that", "my", "what", "to", "d", "not", "won't", "couldn't",
                 "own", "there", "this", "each", "all", "haven", "more", "me", "ve", "weren",
                 "which", "himself", "nor", "other", "shouldn't", "who", "should've", "same",
                 "at", "such", "t", "up", "than", "can", "you've", "too", "these", "while",
                 "wasn't", "ourselves", "before", "i", "he", "didn't", "our", "its", "but", "with",
                 "wouldn't", "those", "because", "the", "y", "shouldn", "it", "mustn", "hers", "just",
                 "doesn", "ain", "between", "over", "had", "aren", "mightn't", "does", "have", "and", 
                 "or", "some", "mustn't", "only", "won", "when", "needn", "below", "in", "if",
                 "theirs", "needn't", "aren't", "isn", "again", "his", "whom", "ll", "hadn",
                 "above", "should", "itself", "themselves", "until", "are", "she", "no", "from",
                 "into", "will", "your", "few","here", "is", "s", "don't", "shan't", "during", "she's",
                 "herself", "of", "has", "down", "were", "once", "ma", "having", "them", "under", "him",
                 "shan", "couldn", "do", "on", "an", "you\'d", "yours", "being", "off", "o", "that'll",
                 "very", "weren't", "didn", "through", "you're", "most", "against", "it's", "doesn't"]

    # Convert to list from string by splitting on "space" character.
    txt = txt.split(" ")

    # Words describing relationships. 
    rl_wrds = ['guy','spokesman','chairman',"men's",'men','him',"he's",'his',
               'boy','boyfriend','boyfriends','boys','brother','brothers','dad',
               'dads','dude','father','fathers','fiance','gentleman','gentlemen',
               'god','grandfather','grandpa','grandson','groom','he','himself',
               'husband','husbands','king','male','man','mr','nephew','nephews',
               'priest','prince','son','sons','uncle','uncles','waiter','widower',
               'widowers', 'heroine','spokeswoman','chairwoman',"women's",'actress',
               "she's",'her','aunt','aunts','bride','daughter','daughters','female',
               'fiancee','girl','girlfriend','girlfriends','girls','goddess',
               'granddaughter','grandma','grandmother','herself','ladies','lady',
               'lady','mom','moms','mother','mothers','mrs','ms','niece','nieces',
               'priestess','princess','queens','she','sister','sisters','waitress',
               'widow','widows','wife','wives','woman', 'women']

    # Words representing utterances.
    utterance_wrds = ["um", "huh"]

    # Complete Stop Word List.
    stop_wrds += rl_wrds + utterance_wrds

    # Removing stop words.
    txt = [word for word in txt if word not in stop_wrds]

    # Form sentences from words.
    txt = " ".join(txt)

    return txt

# Removing stop words.
data_df["Filtered_Text"] = data_df["text"].apply(filter_stopwords)

#### Stemming words.

In [17]:
def stemmer(txt):
    
    ps = PorterStemmer()
    # Convert to list from string by splitting on "space" character.
    txt = txt.split(" ")
    stemmed = []
    for ele in txt:
        stemmed += [ps.stem(ele)]

    # Form sentences from words.
    txt = " ".join(stemmed)
    return txt

# Apply stemming to identify root words.
data_df["Stemmed_Text"] = data_df["Filtered_Text"].apply(stemmer)

#### Computing Sentiment Score.

In [18]:
def sentiment_scr(txt):
    
    scr = TextBlob(txt).sentiment.polarity

    return scr

# Get Sentiment Score.
data_df["Sentiment_Score"] = data_df["Stemmed_Text"].apply(sentiment_scr)

#### Get Weighted Score using Rating.

In [19]:
data_df["Score"] = data_df["Sentiment_Score"] * data_df["stars"]

#### Labeling the data.

In [20]:
def label_threshold(scr):
    if (scr >= 0.5):
        label = 1
    else:
        label = 0
    return label

# Labeling based on threshold.
data_df["Label"] = data_df["Score"].apply(label_threshold)

#### Preparing data for ML Pipeline.

In [21]:
ml_df = data_df[["Stemmed_Text", "Label"]]
split = np.random.rand(len(ml_df)) < 0.8
train = ml_df[split]
test = ml_df[~split]
clean_train_corpus = train["Stemmed_Text"].values.tolist()
clean_test_corpus = test["Stemmed_Text"].values.tolist()
train_label = train["Label"].values.tolist()
test_label = test["Label"].values.tolist()

#### Applying Vectorizer and Classifier.

In [22]:
# Creating Vectorizer.
cv = CountVectorizer()
cv.fit(clean_train_corpus)

# Storing Vectorizer.
pickle.dump(cv, open("sentiment_analyzer_count_vector.pickle", "wb"))

# Fit transform with the Training data.
train_vctr = cv.fit_transform(clean_train_corpus)
print("Training Data", train_vctr.shape)

# Transform the testing data.
test_vctr = cv.transform(clean_test_corpus)
print("Testing Data", test_vctr.shape)

# Train Classifier.
clf = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1000, max_depth=24,
                                 min_samples_split=3, min_samples_leaf=3,
                                 max_features='sqrt', random_state=42)
# Train all Classifier on the training Data.
clf.fit(train_vctr, train_label)

# Storing model.
pickle.dump(clf, open("sentiment_analyzer_gbm.pickle", 'wb'))


# Predict Test Data.
predictions = clf.predict(test_vctr)

# Accuracy.
print("Classifier Accuracy", accuracy_score(predictions, test_label)*100, "%")

Training Data (80315, 60287)
Testing Data (19685, 60287)
Classifier Accuracy 86.30429260858521 %


In [23]:
print("\nTotal Runtime:", int(time.time()-strt_time), "seconds.")


Total Runtime: 755 seconds.
