In [366]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re
import xgboost as xgb
import scikitplot as skplt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import sklearn.metrics
from joblib import dump, load
from scipy.sparse import save_npz, load_npz
import nltk
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import brown
from nltk.corpus import genesis
from nltk.stem.porter import PorterStemmer
from nltk.probability import ConditionalFreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.naive_bayes import MultinomialNB

# Paths of the combined per-split CSV files (the same locations PlaceInCsvs writes to).
# NOTE(review): no code visible in this notebook reads this dict; the functions below
# hard-code their own paths. Confirm whether it is still needed.
datasets = {'test' : './test/testing.csv', 'train' : './train/training.csv'}


In [109]:
# Fetch the NLTK data packages used by this notebook. Each call is a no-op when the
# package is already present locally (see the "already up-to-date" lines in the output).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('genesis')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jfear\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jfear\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jfear\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jfear\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\jfear\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\genesis.zip.


True

### Processing Dataset

In [371]:

def PlaceInCsvs():
    """Collect the raw per-review text files into one CSV per dataset split.

    For each of ``./test/`` and ``./train/`` the ``neg`` and ``pos`` sub-folders are
    scanned. Every file name is expected to look like ``<id>_<rating>.<ext>``; the
    rating is parsed from the name, the review text is read from the file and the
    sentiment label (0 for ``neg``, 1 for ``pos``) is taken from the folder.

    A rating that contradicts its folder (>= 5 under ``neg`` or <= 4 under ``pos``)
    is stored as ``pd.NA``. The result is written to ``./test/testing.csv`` and
    ``./train/training.csv`` with columns ``rating`` (nullable ``Int64``),
    ``review`` and ``sentiment``; the DataFrame index is written as well.

    Raises:
        IndexError / ValueError: if a file name does not follow the
            ``<id>_<rating>.<ext>`` pattern.
    """
    directories = ['./test/', './train/']
    subdirectories = ['neg', 'pos']
    fileNames = {directories[0]: 'testing', directories[1]: 'training'}

    for directory in directories:
        ratings = []
        reviews = []
        sentiments = []
        for subdirectory in subdirectories:
            fullDirectory = f'{directory}{subdirectory}'
            isPositive = subdirectory == 'pos'
            # Sorted so the row order does not depend on the file system's listing order.
            for file in sorted(os.listdir(fullDirectory)):
                # Parse the whole rating token. Taking only its first character
                # would turn "10" into 1 and then blank it as a mislabelled review.
                rating = int(file.split('_', 1)[1].split('.', 1)[0])
                if (isPositive and rating <= 4) or (not isPositive and rating >= 5):
                    rating = pd.NA

                # Context manager so the handle is closed even if reading fails.
                with open(f'{fullDirectory}/{file}', 'r', encoding='utf-8') as reviewFile:
                    reviews.append(reviewFile.read())
                ratings.append(rating)
                sentiments.append(1 if isPositive else 0)

        df = pd.DataFrame({
            # Nullable integer dtype keeps the ratings integral despite the pd.NA entries.
            'rating': pd.array(ratings, dtype='Int64'),
            'review': reviews,
            'sentiment': sentiments,
        })
        df.to_csv(f'{directory}{fileNames[directory]}.csv')
        
def RemoveHTML(dataframe, fileName):
    """Strip HTML-like tags from the ``review`` column and save the frame as CSV.

    Everything matching ``<.*?>`` (a ``<``, the shortest run of non-newline
    characters, then ``>``) is removed from each review. The ``review`` column of
    the passed ``dataframe`` is updated in place, and the whole frame is then
    written to ``fileName`` without the index.

    Args:
        dataframe: DataFrame with a ``review`` column of strings. Non-string
            entries (e.g. missing values) are left untouched.
        fileName: Destination path of the CSV file.
    """
    tagPattern = re.compile('<.*?>')

    def stripTags(text):
        # Leave missing/non-text cells alone instead of crashing in re.sub.
        return tagPattern.sub('', text) if isinstance(text, str) else text

    # One pass over the column; avoids a full-column comparison per row and
    # avoids mutating the column while iterating over it.
    dataframe['review'] = dataframe['review'].map(stripTags)
    dataframe.to_csv(fileName, index=False)
    
def AddTitleIds(dataframe):
    """Unfinished helper: walks the raw review files next to their ``urls_*.txt`` files.

    For every split (``./test/``, ``./train/``) and label folder (``neg``, ``pos``)
    it opens the matching ``urls_<label>.txt`` file, reads each review file, strips
    ``<...>`` tags from its text and looks that text up in ``dataframe``.

    NOTE(review): as written this function has no effect and cannot run to completion:
      * the lookup result on the last line is discarded, and nothing is written
        back to ``dataframe`` or returned;
      * it reads a ``reviews`` column, whereas the other functions in this file
        use ``review``;
      * ``urlFile`` is opened but never read;
      * neither ``urlFile`` nor ``reviewFile`` is ever closed.
    Presumably the goal was to attach a title id from the URL file to each review.
    TODO confirm the intent and finish or delete.

    Args:
        dataframe: DataFrame that is queried by review text.
    """
    directories = ['./test/', './train/']
    subdirectories = ['neg', 'pos']
    # Same tag-stripping pattern that RemoveHTML uses.
    regex = re.compile('<.*?>')
    

    for directory in directories:
        for subdirectory in subdirectories:
            # `directory` already ends in '/', so this path contains a double slash.
            # NOTE(review): the handle is never read from and never closed.
            urlFile = open(f'{directory}/urls_{subdirectory}.txt', 'r', encoding='utf-8')
            fullDirectory = f'{directory}{subdirectory}'
            for file in os.listdir(fullDirectory):
                # NOTE(review): handle is never closed.
                reviewFile = open(f'{fullDirectory}/{file}', 'r', encoding='utf-8')
                reviewText = re.sub(regex, '', reviewFile.read())
                # NOTE(review): expression statement only; the value is thrown away.
                dataframe.loc[dataframe.reviews == reviewText, 'reviews'].iloc[0]['reviews']
                
def AddPositiveNegativeLabel(filePath):
    """Add (or overwrite) a binary ``sentiment`` column in a CSV file, in place.

    The file is read with its ``index`` column as the DataFrame index. A row gets
    ``sentiment = 0`` when its ``ratings`` value is <= 4 and ``sentiment = 1``
    otherwise. A missing rating does not satisfy ``<= 4`` and is therefore
    labelled 1. The frame is written back to the same path.

    NOTE(review): this expects ``index`` and ``ratings`` columns, whereas
    PlaceInCsvs in this file writes an unnamed index and a ``rating`` column.
    Confirm which CSV layout this helper is meant for.

    Args:
        filePath: Path of the CSV file to read and overwrite.
    """
    dataframe = pd.read_csv(filePath, index_col='index')
    # Vectorised comparison: no Python-level row loop, and no label-based `.at`
    # lookups that break when index labels repeat.
    isLowRating = dataframe['ratings'] <= 4
    dataframe['sentiment'] = (~isLowRating).astype('int64')
    dataframe.to_csv(filePath)
    
def CreateNGrams(data):
    """Vectorise ``data`` into token counts using a vocabulary learned from the training set.

    A ``CountVectorizer`` is fitted on the ``review`` column of
    ``./train/training_new.csv`` every time this function is called, and the
    fitted vectorizer is then applied to ``data``. Training and test documents
    therefore share the same feature columns.

    Despite the function name, ``ngram_range=(1, 1)`` means only single tokens
    (unigrams) are counted. No TF-IDF weighting is applied.

    Args:
        data: Iterable of raw text documents to transform.

    Returns:
        The document-term count matrix produced by ``vectorizer.transform(data)``.
    """
    dataframe = pd.read_csv('./train/training_new.csv')
    vectorizer = CountVectorizer(lowercase=True, ngram_range=(1,1), min_df=5, max_df=0.5, binary=False, strip_accents='unicode')

    # astype('U') turns every cell into text, so a missing review becomes the string 'nan'.
    vectorizer.fit(dataframe['review'].values.astype('U'))
    # Return the transform result directly. Converting it to a dense array only to
    # discard it (as was done before) allocates documents x vocabulary cells for nothing.
    return vectorizer.transform(data)
    
    
def _CleanReview(review, stopWords, stemmer=None):
    """Normalise one review to a space-separated string of lower-case tokens.

    Steps: run ``contractions.fix``, replace every character that is neither a word
    character nor whitespace with a space, tokenise, drop tokens that are numeric
    or tagged ``NNP`` (proper noun), lower-case, drop stop words and, if a
    ``stemmer`` is given, stem what is left.

    Args:
        review: Raw review text.
        stopWords: Set of lower-case stop words to remove.
        stemmer: Optional object with a ``stem(word)`` method.

    Returns:
        The cleaned review; an empty string if no token survives.
    """
    review = contractions.fix(review)
    review = re.sub(r'[^\w\s]', ' ', review)
    tokens = word_tokenize(review)

    # The tagger sees the case-preserved tokens without stop words.
    tagged = nltk.pos_tag([token for token in tokens if token not in stopWords])
    # Every occurrence of a flagged word is removed. Filtering tokens (rather than
    # replacing ' word ' in the string) also catches words at the very start or end
    # of the text, next to a newline, or directly after another removed word.
    dropped = {word for word, tag in tagged if word.isnumeric() or tag == 'NNP'}

    kept = []
    for token in tokens:
        if token in dropped:
            continue
        token = token.lower()
        if token in stopWords:
            continue
        # Stem only after the stop-word check: a stemmed word may no longer
        # equal its stop-word spelling.
        kept.append(stemmer.stem(token) if stemmer is not None else token)
    return ' '.join(kept)


def CleanTextFurther(stem=False):
    """Clean the review text of both dataset CSVs and write ``*_new.csv`` copies.

    Reads ``./test/testing.csv`` and ``./train/training.csv``, drops rows without
    a rating or without review text, cleans every review with ``_CleanReview``,
    drops reviews that end up empty and writes the result (without the index) to
    ``./test/testing_new.csv`` and ``./train/training_new.csv``.

    Args:
        stem: When True, apply Porter stemming to the surviving tokens. The
            default is False because stemmed text was previously computed and
            then overwritten by the unstemmed tokens, so existing ``*_new.csv``
            files contain unstemmed text.
    """
    csvs = ['./test/testing', './train/training']
    # Built once: membership tests against a set are constant time.
    stopWords = frozenset(stopwords.words('english'))
    stemmer = PorterStemmer() if stem else None

    for path in csvs:
        dataframe = pd.read_csv(f'{path}.csv').dropna(subset=['rating', 'review'])
        cleanedReviews = [_CleanReview(review, stopWords, stemmer) for review in dataframe['review']]
        # assign() returns a fresh frame, so the filtered frame is not written into.
        dataframe = dataframe.assign(review=cleanedReviews)
        # An empty review would be written as an empty CSV field and read back as a
        # missing value, so drop it here.
        dataframe = dataframe[dataframe['review'] != '']
        dataframe.to_csv(f'{path}_new.csv', index=False)
                 
    
def TrainModel(model, random_state=42):
    """Train a sentiment classifier on ``./train/training_new.csv`` and save it to disk.

    Features come from ``CreateNGrams`` applied to the ``review`` column; the
    target is the ``sentiment`` column.

    * ``'xgb'``: a stratified 75 % / 25 % train/validation split is made and an
      ``XGBClassifier`` is fitted with both parts passed as ``eval_set``, then
      saved to ``xgb_model.json``.
    * ``'mnb'``: a ``MultinomialNB`` is fitted on the complete training data (no
      validation split) and saved to ``mnb_model.sav``.

    Args:
        model: ``'xgb'`` or ``'mnb'``.
        random_state: Seed for the train/validation split so repeated runs use
            the same partition. Pass ``None`` for a different split on every call.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Fail fast: a misspelt name used to fall through both branches and train nothing.
    if model not in ('xgb', 'mnb'):
        raise ValueError(f"Unknown model '{model}'; expected 'xgb' or 'mnb'")

    dataframe = pd.read_csv('./train/training_new.csv')
    features = CreateNGrams(dataframe['review'].values.astype('U'))
    labels = dataframe['sentiment'].values

    if model == 'xgb':
        # Only this branch uses the split, so it is made here.
        xTrain, xValidate, yTrain, yValidate = train_test_split(
            features, labels, train_size=0.75, stratify=labels, random_state=random_state)
        classifier = xgb.XGBClassifier(n_estimators=10000, max_depth=4, learning_rate=0.2, tree_method='gpu_hist', early_stopping_rounds=50)
        # NOTE(review): per the XGBoost docs, early stopping tracks the last eval_set
        # entry (the validation part here). Confirm for the installed version.
        classifier.fit(xTrain, yTrain, eval_set=[(xTrain, yTrain), (xValidate, yValidate)], verbose=100)
        classifier.save_model('xgb_model.json')
    else:
        classifier = MultinomialNB(fit_prior=False, alpha=0.25)
        classifier.fit(features, labels)
        dump(classifier, 'mnb_model.sav')
    
    
def TestModel(model):
    """Score a previously saved classifier on ``./test/testing_new.csv`` and print its F1.

    The test reviews are vectorised with ``CreateNGrams``, the saved model is
    loaded (``xgb_model.json`` for ``'xgb'``, ``mnb_model.sav`` for ``'mnb'``),
    and the F1 score of its predictions against the ``sentiment`` column is
    printed after an ``F1-Score: `` line.

    Args:
        model: ``'xgb'`` or ``'mnb'``; must match a model saved by ``TrainModel``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Fail fast: an unknown name used to leave the prediction as None and surface
    # as an obscure error inside f1_score.
    if model not in ('xgb', 'mnb'):
        raise ValueError(f"Unknown model '{model}'; expected 'xgb' or 'mnb'")

    testDataframe = pd.read_csv('./test/testing_new.csv')
    features = CreateNGrams(testDataframe['review'].values.astype('U'))
    labels = testDataframe['sentiment'].values

    if model == 'xgb':
        # NOTE(review): learning_rate here (0.1) differs from TrainModel (0.2). It
        # presumably has no effect because the trees come from the saved file;
        # confirm against the XGBoost load_model documentation.
        classifier = xgb.XGBClassifier(n_estimators=10000, max_depth=4, learning_rate=0.1, tree_method='gpu_hist', early_stopping_rounds=50)
        classifier.load_model('xgb_model.json')
    else:
        # joblib.load unpickles the file: only load model files you created yourself.
        classifier = load('mnb_model.sav')

    prediction = classifier.predict(features)

    print('F1-Score: ')
    print(sklearn.metrics.f1_score(labels, prediction))
    
    
    
    
    
    
    
            

In [372]:
# Preprocessing is left disabled here. TrainModel and TestModel read the *_new.csv
# files that CleanTextFurther writes, so it must have been run at least once before
# this cell works on a fresh checkout.
#CleanTextFurther()
# Train the XGBoost model (saved to xgb_model.json), then print its F1 on the test split.
TrainModel('xgb')
TestModel('xgb')

[0]	validation_0-logloss:0.65714	validation_1-logloss:0.65771
[100]	validation_0-logloss:0.32825	validation_1-logloss:0.40233
[200]	validation_0-logloss:0.25438	validation_1-logloss:0.36883
[300]	validation_0-logloss:0.21233	validation_1-logloss:0.35515
[400]	validation_0-logloss:0.18242	validation_1-logloss:0.34801
[500]	validation_0-logloss:0.15946	validation_1-logloss:0.34179
[600]	validation_0-logloss:0.14078	validation_1-logloss:0.33979
[700]	validation_0-logloss:0.12666	validation_1-logloss:0.33890
[800]	validation_0-logloss:0.11500	validation_1-logloss:0.33763
[832]	validation_0-logloss:0.11117	validation_1-logloss:0.33792
F1-Score: 
0.8081110506260206
