# Re-sampling Imbalanced Training Corpus

https://medium.com/analytics-vidhya/re-sampling-imbalanced-training-corpus-for-sentiment-analysis-c9dc97f9eae1

## Import Required Packages

In [25]:
import pandas as pd
import numpy as np
import sklearn as sk

from sklearn import model_selection, preprocessing, metrics, linear_model, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import BorderlineSMOTE, SMOTE, ADASYN, SMOTENC, RandomOverSampler
from imblearn.under_sampling import (RandomUnderSampler, 
                                    NearMiss, 
                                    InstanceHardnessThreshold,
                                    CondensedNearestNeighbour,
                                    EditedNearestNeighbours,
                                    RepeatedEditedNearestNeighbours,
                                    AllKNN,
                                    NeighbourhoodCleaningRule,
                                    OneSidedSelection,
                                    TomekLinks)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline
import pandas as pd, numpy, string
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
#Remove Special Charactors
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from imblearn.over_sampling import RandomOverSampler

## Load Training Data

In [5]:
train = pd.read_csv("./Data/train_data.csv")
train = train.set_index("Index")
train.head()

  train = pd.read_csv("./Data/train_data.csv")


Unnamed: 0_level_0,Venue Index,Name,Date,Rating,Rating Sent,Text,Aspect,Accessibility,Term1,Term2,Term3
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18594,10,coffeeshop la grotte,2019,5 stars,positive,2nd year coming here and can t say enough good...,0,0,,,
373579,451,pata negra,2019,5 stars,positive,excellent,0,0,,,
128693,112,bagels & beans ijdock,2021,5 stars,positive,"location, very nice and welcoming, with a spl...",0,0,,,
120681,17,plum,2020,5 stars,positive,went to this place on saturday. i had the gril...,0,0,,,
162003,504,caf‚àö¬© thijssen,2021,5 stars,positive,was wer very tasty,0,0,,,


In [8]:
# SPLITTING DATA

# Split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['Text'],train['Accessibility'])

# Label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [10]:
# DATA DISTRIBUTION

# Percentage of Positive/Negative
print("1: ", train.Accessibility.value_counts()[0]/len(train)*100,"%")
print("0: ", train.Accessibility.value_counts()[1]/len(train)*100,"%")

1:  99.52676918166836 %
0:  0.47323081833164127 %


## TF-IDF

In [12]:
# Word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000)
tfidf_vect.fit(train['Text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

## Evaluation

In [16]:
# Return the f1 Score
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    return metrics.f1_score(valid_y,predictions)

## Classifier

In [17]:
# ACCURARY

accuracyORIGINAL = train_model(linear_model.LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial'),xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR Baseline, WordLevel TFIDF: ", accuracyORIGINAL)
accuracyORIGINAL = train_model(svm.LinearSVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM Baseline, WordLevel TFIDF: ", accuracyORIGINAL)
#LR Baseline, WordLevel TFIDF:  0.7912087912087912
#SVM Baseline, WordLevel TFIDF:  0.9031446540880502

LR Baseline, WordLevel TFIDF:  0.7912087912087912
SVM Baseline, WordLevel TFIDF:  0.9031446540880502


## Re-Sampling

In [27]:
#Random Over Sampling
ros = RandomOverSampler(random_state=777)
ros_xtrain_tfidf, ros_train_y = ros.fit_resample(xtrain_tfidf, train_y)
accuracyROS = train_model(linear_model.LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial'),ros_xtrain_tfidf, ros_train_y, xvalid_tfidf)
print ("LR ORIGINAL, WordLevel TFIDF: ", accuracyROS)
accuracyROS = train_model(svm.LinearSVC(),ros_xtrain_tfidf, ros_train_y, xvalid_tfidf)
print ("SVM ROS, WordLevel TFIDF: ", accuracyROS)
#LR ROS, WordLevel TFIDF:  0.851063829787234
#SVM ROS, WordLevel TFIDF:  0.880093131548312

LR ORIGINAL, WordLevel TFIDF:  0.851063829787234
SVM ROS, WordLevel TFIDF:  0.880093131548312


In [30]:
#SMOTE
sm = SMOTE(random_state=777)#, ratio = 1.0)
sm_xtrain_tfidf, sm_train_y = sm.fit_resample(xtrain_tfidf, train_y)
accuracySMOTE = train_model(linear_model.LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial'),sm_xtrain_tfidf, sm_train_y, xvalid_tfidf)
print ("LR SMOTE, WordLevel TFIDF: ", accuracySMOTE)
accuracySMOTE = train_model(svm.LinearSVC(),sm_xtrain_tfidf, sm_train_y, xvalid_tfidf)
print ("SVC SMOTE, WordLevel TFIDF: ", accuracySMOTE)
#LR SMOTE, WordLevel TFIDF:  0.6848436246992782
#SVC SMOTE, WordLevel TFIDF:  0.693288020390824

LR SMOTE, WordLevel TFIDF:  0.8515709642470205
SVC SMOTE, WordLevel TFIDF:  0.8787878787878787
