In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
import string
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute


In [2]:
%%time
unfiltered_data = pd.read_csv('car-reviews.csv', sep=',', dtype=object)

Wall time: 40 ms


In [3]:
unfiltered_data.isnull().sum() # checking for missing rows

Sentiment    0
Review       0
dtype: int64

### Part 1
The method below takes in a file path as an input ad returns a pandas' DataFrame with tokenized, stemmed, and no puncutation words. The stemmer used is PorterStemmer(), and regex was used to remove punctuation and only include words and space. Tokenization is done using nltk.word_tokenize()

In [4]:
%%time
def filtering_porter(csv_file):
    # This method takes a csv file path as input and returns a pandas dataframe of the filtered text
    # This method will remove english stop words, punctuation using regex, and perform stemming using PorterStemmer()
    
    df2 = pd.read_csv(csv_file, sep=',', dtype=object) # loading the data into pandas data frame
    stop_words2 = stopwords.words('english') # creating stop words 
    stem = PorterStemmer() # initiating stemmer
    
    for i, row in df2.iterrows(): # iterating through the pandas rows a
        filtered = []
        review = row['Review']
        review = re.sub(r'[^\w\s]', '', review) # removing punctuation using regex
        words = nltk.word_tokenize(review) # tokenizing the words
        words = [w for w in words if not w in stop_words2] # removing stop words
        
        for word in words:
            filtered.append(stem.stem(word)) # stemming and appending to an empty list

        df2.loc[i, 'Review'] = filtered # updating the Review column rows with the filtered reviews
    
    return df2

filtered_data_porter = filtering_porter('car-reviews.csv')

Wall time: 10.6 s


### Punctuation and unnecessary words
In the code below, it will iterate over all the words in the returned DataFrame from previous method, and if it finds a punctuation that is included in string.punctuation, it will print it out. 

I will also print the first few rows of unprocessed and processed data to show how unncecessary words that do npt affect sentiment are removed.

In [5]:
%%time
punctuation = string.punctuation.split()

#checking for punctuation
for row in filtered_data_porter.head().itertuples():
    for word in row:
        if word in punctuation:
            print(word)
# showing that unnecessary words are removed
print('\033[1m' + "As an example, I will print the first few reviews of unprocessed data:\n\n" + '\033[0m' + f"{unfiltered_data.head()}")
print("\n\n")
print('\033[1m' + "words such as 'we', 'a', 'it', 'to', 'of', 'whom' and more have been removed in the processed data as seen below:\n\n" + '\033[0m' + f"{filtered_data_porter.head()}")



[1mAs an example, I will print the first few reviews of unprocessed data:

[0m  Sentiment                                             Review
0       Neg   In 1992 we bought a new Taurus and we really ...
1       Neg   The last business trip  I drove to San Franci...
2       Neg   My husband and I purchased a 1990 Ford F250 a...
3       Neg   I feel I have a thorough opinion of this truc...
4       Neg   AS a mother of 3  all of whom are still in ca...



[1mwords such as 'we', 'a', 'it', 'to', 'of', 'whom' and more have been removed in the processed data as seen below:

[0m  Sentiment                                             Review
0       Neg  [In, 1992, bought, new, tauru, realli, love, S...
1       Neg  [the, last, busi, trip, I, drove, san, francis...
2       Neg  [My, husband, I, purchas, 1990, ford, f250, no...
3       Neg  [I, feel, I, thorough, opinion, truck, compar,...
4       Neg  [AS, mother, 3, still, carseat, logic, thing, ...
Wall time: 12 ms


### Stemming
In the cell below, I will show examples of words that have been stemmed. 

    1- expected, expecting
    2- explored, exploring
    3- dreamed, dreaming 
    
Kindly scroll down in the output to see more.

In [6]:
%%time
print('\033[1m' + "1- Frist stem to demonstrate is 'expect'\n\n >>>The review below has the variation word 'Expected'\n\n" + '\033[0m' + f"{unfiltered_data['Review'][19]}\n\n" + '\033[1m' + ">>>and the review below has the variation word 'Expecting'\n\n"  +'\033[0m' + f"{unfiltered_data['Review'][2]}")
print("\n\n")
print('\033[1m' + ">>>Both variaitons have been stemmed to the same stem 'expect' as seen in the tokenized reviews below accordingly\n\n" + '\033[0m' + f"{filtered_data_porter['Review'][19]}\n\n" + f"{filtered_data_porter['Review'][2]}")
print("\n\n")
print('\033[1m' + "2- Second stem to demonstrate is 'Explor'\n\n >>>The review below has the variation word 'Explored'\n\n" + '\033[0m' + f"{unfiltered_data['Review'][1254]}\n\n" + '\033[1m' + ">>>and the review below has the variation word 'Exploring'\n\n"  +'\033[0m' + f"{unfiltered_data['Review'][127]}")
print("\n\n")
print('\033[1m' + ">>>Both variaitons have been stemmed to the same stem 'explor' as seen in the tokenized reviews below accordingly\n\n" + '\033[0m' + f"{filtered_data_porter['Review'][1254]}\n\n" + f"{filtered_data_porter['Review'][127]}")
print("\n\n")
print('\033[1m' + "3- Third stem to demonstrate is 'dream'\n\n >>>The review below has the variation word 'dreamed'\n\n" + '\033[0m' + f"{unfiltered_data['Review'][6]}\n\n" + '\033[1m' + ">>>and the review below has the variation word 'dreaming'\n\n"  +'\033[0m' + f"{unfiltered_data['Review'][76]}")
print("\n\n")
print('\033[1m' + ">>>Both variaitons have been stemmed to the same stem 'dream' as seen in the tokenized reviews below accordingly\n\n" + '\033[0m' + f"{filtered_data_porter['Review'][6]}\n\n" + f"{filtered_data_porter['Review'][76]}")

[1m1- Frist stem to demonstrate is 'expect'

 >>>The review below has the variation word 'Expected'

[0m Our big search for a larger vehicle started when I found out I was expecting another child  At the time  I was driving an old Volkswagon Bug and I had two children already  We looked at quite a few brands  The only one I didnt feel comfortable with was Fords  I had heard a lot of negative feedback on those cars  When we saw the Aerostar on the lot  my husband wanted it right then  It was clean and comfortable with a sporty look  I wanted to look a little more  I researched it online to make sure we were getting a good deal  We had looked at  drove  and researched over a dozen before thinking about narrowing down our options  When the dealer lowered his price to what I felt was an awesome deal  we decided to buy it  For not wanting a Ford to begin with  I would have to say that this vehicle has been one of the best we have owned  It is very dependable and comfortable  The air condi

### Vectorizing reviews, Case Sensitivity, Naive Bayes Model, Dealing with Unseen Data, and Confusion Matrix
The method below will implement the CountVectorizer() method to turn words into vectors and count number of appearance in each review. Afterwards, it splits the data into train and test data using train_test_split() with 80-20 split. The classifier does not take test data to train on, as the method fit() only used with x_train_cv and y_train. The test data only undergone transform() and predict() methods.

To handle case sensitivity, it applies str.lower() on all rows of Review column. 

It also includes the Multinomial Naive Bayes classifier from sklearn library, which fits and trains on the training data. Eventually, the code predicts the sentiments using .predict() method of the Multinomial Naive Bayes and returns it as the variable pred. To handle unseen data in the test data, I applied Laplace Smoothing which adds a value of 1 to all the words in the matrix by making alpha=1 in the Naive Bayes instanse.

In [7]:
%%time
def counter_vec_NB(df3):
    
    # This method takes a pandas data frame as input and returns pred for predicted values
    # The method will first jon tokenized words and reaasign Pos and Ned sentiments to 1 and 0
    # It will apply K-cross validation to check accuracy and error
    # It will convert the features to vectors using CounVectorizer() and then split the data into training and testing data
    # It will then initial MultiNomaial Naive Bayes classifier to predict the positive and negative reviews
    # At the end it will display the confusion matrix along with classification report
    
    df = df3
    df["Review"] = df.Review.map(lambda x: ' '.join(x)) # joining the tokenied words into a full sentence on each row
    df.loc[df["Sentiment"] == 'Pos', "Sentiment"] = 1 # replacing positive sentiments with 1
    df.loc[df["Sentiment"] == 'Neg', "Sentiment"] = 0 # replacing negative sentiments with 0

    df["Review"] = df["Review"].str.lower() # making sure all words and letters are lower case
    
    print('\033[1m' + "Review column examples after making all words lower case:\n\n" + '\033[0m' + f"{df['Review']}\n\n\n\n")
  
    x = df["Review"] # assigning reviews column to variable x
    y = df["Sentiment"] # assigning Sentiment column to variable y
    
    cross_validation = KFold(n_splits=10, shuffle=True, random_state=1) # initiating K-cross validation

    cv = CountVectorizer()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4) # splitting the data 80-20
    x_train_cv = cv.fit_transform(x_train) # fitting and transforming the training data into vectors 
    a = x_train_cv.toarray()
    counts = pd.DataFrame(a, columns=cv.get_feature_names())
    print('\033[1m' + "Below is the words converted to vectorized form using CounterVectorizer\n\n" + '\033[0m' + f"{counts}\n\n")
    print("\n\n")

    MN_nb = MultinomialNB(alpha=1)
    y_train = y_train.astype('int')
    scores = cross_val_score(MN_nb, x_train_cv, y_train, scoring='neg_mean_absolute_error',
                             cv=cross_validation, n_jobs=-1)
    print('\033[1m' + "RMSE is:\n\n" + '\033[0m' + f"{mean(absolute(scores))}\n\n")

    MN_nb.fit(x_train_cv, y_train)
    x_test_cv = cv.transform(x_test)
    y_test2 = y_test.astype('int')
    pred = MN_nb.predict(x_test_cv)
    actual = np.array(y_test)
    count = 0
    for i in range(len(pred)): # calculating accuracy
        if pred[i] == actual[i]:
            count += 1
    print('\033[1m' + "Accuracy of the model:\n\n" + '\033[0m' + f"{count/len(pred)}\n\n")
    confusion_m = confusion_matrix(y_test2, pred)
    print('\033[1m' + f"confusion matrix: \n\n" + '\033[0m' + f"{confusion_m}\n\n")    
    classification_r = classification_report(y_test2, pred)
    print("\n\n" + '\033[1m' + "Classification report: \n\n" + '\033[0m' + f"{classification_r}")
    return pred

output1 = counter_vec_NB(filtered_data_porter)

[1mReview column examples after making all words lower case:

[0m0       in 1992 bought new tauru realli love so 1999 d...
1       the last busi trip i drove san francisco i wen...
2       my husband i purchas 1990 ford f250 noth probl...
3       i feel i thorough opinion truck compar post ev...
4       as mother 3 still carseat logic thing trade 20...
                              ...                        
1377    in june bought soni limit edit focu se 4 dr se...
1378    after 140 000 mile decid replac wife 1990 toyo...
1379    the ford focu great littl record set car it fi...
1380    i need new car hyundai excel 91 i decid shop a...
1381    the 2000 ford focu se 4 door sedan spaciou int...
Name: Review, Length: 1382, dtype: object




[1mBelow is the words converted to vectorized form using CounterVectorizer

[0m      00  000  000k  000km  00a  00p  01  0110  02  03  ...  zoo  zoom  zt  \
0      0    0     0      0    0    0   0     0   0   0  ...    0     0   0   
1      0    

### Part 2

__IMPORTANT:__

__In order to run the cell below kindly comment the last line of the previous cell "output1 = counter_vec_NB(filtered_data_porter)" and uncomment the last line of the below cell "output2 = improved_counter_vec_NB(filtered_data_porter)"__  _I have tried to run them all at the same time but an error arises which I could not fix_

To improve the classifier, I have tried adjusting the Hyperparameters of CountVectorier() and choosing binary over number of appearance times, min_df=1, stop_Words='english', ngram_range=(1,2). I have kept the random state of train_test_split() to be 4 for Part 1 and Part 2 in order for fair comparison and making sure I am able to make an actual improvement and the lower error is not due to the different random states chosen by the funstion itself "apple to apple". 

However, there are more advanced ways to improve the classifier that I have not implemented. On one article in Stackoverflow, it states that Synonym finding, neutral words, feature selection, and The Fisher Method shouhld have a postivie impact on the accuracy of the model. ("Ways to improve the accuracy of a Naive Bayes Classifier?", 2010)

To have a true look at the performance, I have applied a K-cross validation. The root mean square of the error is displayed as RMSE. The lower the value the better the accuracy.

In this part, I will not print out examples of stemmed words, lower case sentences, etc. in order to focus on the confusion matrix and associated classification report.

In [8]:
def improved_counter_vec_NB(df4):
    
    # This method takes a pandas data frame as input and returns pred for predicted values
    # The method will first jon tokenized words and reaasign Pos and Ned sentiments to 1 and 0
    # It will apply K-cross validation to check accuracy and error
    # It will convert the features to vectors using CounVectorizer() and then split the data into training and testing data
    # It will then initial MultiNomaial Naive Bayes classifier to predict the positive and negative reviews
    # At the end it will display the confusion matrix along with classification report
    
    df = df4
    df["Review"] = df.Review.map(lambda j: ' '.join(j)) # joining the tokenied words into a full sentence on each row
    df.loc[df["Sentiment"] == 'Pos', "Sentiment"] = 1 # replacing positive sentiments with 1
    df.loc[df["Sentiment"] == 'Neg', "Sentiment"] = 0 # replacing negative sentiments with 0
    df["Review"] = df["Review"].str.lower()  # making sure all words and letters are lower case
    x = df["Review"] # assigning reviews column to variable x
    y = df["Sentiment"] # assigning sentiment column to variable y
    
    cross_validation = KFold(n_splits=10, shuffle=True, random_state=1) # initiating K-cross validation

    cv = CountVectorizer(binary=True, strip_accents='ascii', min_df=1, stop_words='english', ngram_range=(1, 2))
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4) # splitting the data 80-20
    x_train_cv = cv.fit_transform(x_train) # fitting and transforming the training data into vectors 
    a = x_train_cv.toarray()
    
    counts = pd.DataFrame(a, columns=cv.get_feature_names())
    
    MN_nb = MultinomialNB(alpha=1)
    y_train = y_train.astype('int')
    
    scores = cross_val_score(MN_nb, x_train_cv, y_train, scoring='neg_mean_absolute_error',
                             cv=cross_validation, n_jobs=-1)
    print('\033[1m' + "RMSE is:\n\n" + '\033[0m' + f"{mean(absolute(scores))}\n\n")
    
    MN_nb.fit(x_train_cv, y_train)
    x_test_cv = cv.transform(x_test)
    y_test2 = y_test.astype('int')
    pred = MN_nb.predict(x_test_cv)
    actual = np.array(y_test)
    count = 0
    for i in range(len(pred)):
        if pred[i] == actual[i]:
            count += 1
    print('\033[1m' + "Accuracy of the model:\n\n" + '\033[0m' + f"{count/len(pred)}\n\n")
    confusion_m = confusion_matrix(y_test2, pred)
    print('\033[1m' + f"confusion matrix: \n\n" + '\033[0m' + f"{confusion_m}\n\n")    
    classification_r = classification_report(y_test2, pred)
    print("\n\n" + '\033[1m' + "Classification report: \n\n" + '\033[0m' + f"{classification_r}")
    return pred

# output2 = improved_counter_vec_NB(filtered_data_porter)

### References

1- Stack Overflow. 2010. Ways to improve the accuracy of a Naive Bayes Classifier?. [online] Available at: <https://stackoverflow.com/questions/3473612/ways-to-improve-the-accuracy-of-a-naive-bayes-classifier> [Accessed 14 April 2021].

2- Statology. 2020. K-Fold Cross Validation in Python (Step-by-Step). [online] Available at: <https://www.statology.org/k-fold-cross-validation-in-python/> [Accessed 14 April 2021].

3- Scikit-learn.org. n.d. sklearn.feature_extraction.text.CountVectorizer — scikit-learn 0.24.1 documentation. [online] Available at: <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html> [Accessed 14 April 2021].

4- Stack Overflow. 2019. What is the difference between CountVectorizer(binary=True) and n CountVectorizer(binary=False) in sklearn. [online] Available at: <https://stackoverflow.com/questions/56773265/what-is-the-difference-between-countvectorizerbinary-true-and-n-countvectorize> [Accessed 14 April 2021].

5- Python and R Tips. 2018. How To Loop Through Pandas Rows? or How To Iterate Over Pandas Rows? - Python and R Tips. [online] Available at: <https://cmdlinetips.com/2018/12/how-to-loop-through-pandas-rows-or-how-to-iterate-over-pandas-rows/> [Accessed 14 April 2021].

6- Youtube.com. 2019. Porter Stemmer in Python | Natural Language Processing with Python and NLTK. [online] Available at: <https://www.youtube.com/watch?v=SOzy8NmV8Qk&ab_channel=KnowledgeCenter> [Accessed 14 April 2021].

7- Stack Overflow. 2016. Python stemming (with pandas dataframe). [online] Available at: <https://stackoverflow.com/questions/37443138/python-stemming-with-pandas-dataframe> [Accessed 14 April 2021].

8- Stack Overflow. 2013. strip punctuation with regex - python. [online] Available at: <https://stackoverflow.com/questions/18429143/strip-punctuation-with-regex-python> [Accessed 14 April 2021].

9- Python Daddy. 2020. How to Remove Punctuation from a Dataframe in Pandas and Python - Python Daddy. [online] Available at: <https://www.pythondaddy.com/python/how-to-remove-punctuation-from-a-dataframe-in-pandas-and-python/> [Accessed 14 April 2021].

10- Upadhyay, P., 2020. Removing stop words with NLTK in Python - GeeksforGeeks. [online] GeeksforGeeks. Available at: <https://www.geeksforgeeks.org/removing-stop-words-nltk-python/> [Accessed 14 April 2021].

11- Youtube.com. 2017. Machine Learning with Text - TFIDF Vectorizer MultinomialNB Sklearn (Spam Filtering example Part 2). [online] Available at: <https://www.youtube.com/watch?v=bPYJi1E9xeM&ab_channel=TheSemicolon> [Accessed 14 April 2021].