In [4]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [5]:
def load_file(filepath):
    """
    Read a pickled object back from disk.

    Parameters
    ----------
    filepath: str
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code, so only open trusted files.
    with open(filepath, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def save_file(filepath, data):
    """
    Serialize an object to disk with pickle.

    Args:
        filepath: Destination path; an existing file at this path is overwritten.
        data: The object to pickle.
    Returns:
        None
    """
    # The context manager closes (and therefore flushes) the file handle
    # deterministically, even if pickling raises part-way through.
    with open(filepath, "wb") as f:
        pickle.dump(data, f)

In [6]:
# Load the object pickled in imdb_data.pkl; later cells read its `review` and
# `sentiment` attributes.
# NOTE(review): absolute, machine-specific path — this cell only runs where that
# exact directory exists.
imdb_data = load_file("C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\imdb_data.pkl")

In [14]:
# Load previously saved classifier results (a table of classifier names and
# F1 scores, per the output below) and display them.
traditional_twitter = load_file(r"C:\Users\Ukachi\PycharmProjects\paper_codes\files\Airline_Twitter_traditional_classifier_results.pkl")
traditional_twitter

Unnamed: 0,Classifiers,F1 Score
0,Logistic Regression,0.708738
1,KNN,0.593486
2,SVM_l2,0.791667
3,SGD_l2,0.791574
4,SVM_l1,0.777416
5,SGD_l1,0.762955
6,MultinomialNB,0.71932
7,BernoulliNB,0.744479
8,ComplementNB,0.723153
9,DecisionTree,0.6625


In [15]:
# Load the saved results stored in Twitter_Logistic_Regression_Word2Vec_results.pkl.
airline_twitter_w2v_results = load_file(r"C:\Users\Ukachi\PycharmProjects\paper_codes\files\Twitter_Logistic_Regression_Word2Vec_results.pkl")

In [16]:
# Display the loaded results (a dict mapping run name to a float score, per the output).
airline_twitter_w2v_results

{'Twitter_Logistic_Regression_glove_small': 0.631055900621118,
 'Twitter_Logistic_Regression_glove_small_tfidf': 0.5964912280701754,
 'Twitter_Logistic_Regression_glove_big': 0.7437070938215103,
 'Twitter_Logistic_Regression_glove_big_tfidf': 0.734090909090909,
 'Twitter_Logistic_Regression_w2v': 0.5301837270341208,
 'Twitter_Logistic_Regression_w2v_tfidf': 0.6127450980392157}

In [20]:
# Load and display the results stored in
# Airline_Twitter_Logistic_Regression_Word2Vec_results.pkl (same dict layout as
# the previous results file, with "Airline_Twitter_"-prefixed keys).
airline_twitter_w2v_results_2 = load_file(r"C:\Users\Ukachi\PycharmProjects\paper_codes\files\Airline_Twitter_Logistic_Regression_Word2Vec_results.pkl")
airline_twitter_w2v_results_2

{'Airline_Twitter_Logistic_Regression_glove_small': 0.7161997563946407,
 'Airline_Twitter_Logistic_Regression_glove_small_tfidf': 0.7146341463414635,
 'Airline_Twitter_Logistic_Regression_glove_big': 0.5809768637532133,
 'Airline_Twitter_Logistic_Regression_glove_big_tfidf': 0.568758344459279,
 'Airline_Twitter_Logistic_Regression_w2v': 0.55359565807327,
 'Airline_Twitter_Logistic_Regression_w2v_tfidf': 0.6360248447204969}

In [26]:
# Text-classification pipeline: TF-IDF features feeding a logistic regression,
# wrapped as a single estimator so fit/predict accept raw text.
# NOTE(review): tol=0.1 is far looser than scikit-learn's documented default
# (1e-4) — confirm this stopping tolerance is intentional.
lr_clf = Pipeline(steps=[
    (
        'tfidf_vec',
        TfidfVectorizer(stop_words="english", sublinear_tf=True),
    ),
    (
        'clf',
        LogisticRegression(tol=0.1, max_iter=10000),
    ),
])

In [27]:
# Model inputs (the `review` attribute) and labels (the `sentiment` attribute)
# taken from the loaded IMDB object.
X = imdb_data.review
y = imdb_data.sentiment

In [28]:
# Convert inputs and labels to NumPy arrays before splitting.
data = np.array(X)
target = np.array(y)

In [29]:
def split_data(data, label, percentage, random_state=None):
    """
    Split samples and labels into train and test partitions.

    Args:
        data: The samples to split.
        label: The labels aligned with ``data``.
        percentage: Passed to ``train_test_split`` as ``test_size`` (the share of
            samples held out for testing).
        random_state: Optional seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original unseeded behaviour; pass an int to make
            the split (and every metric derived from it) reproducible.
    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=percentage, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


In [30]:
# Hold out 20% of the samples for testing.
# NOTE(review): the split is not seeded, so the partition — and the scores
# reported below — change on every run.
train_data, test_data, train_label, test_label = split_data(data, target, 0.2)

In [31]:
# Fit the TF-IDF + logistic-regression pipeline on the training partition.
lr_clf.fit(train_data, train_label)



Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...e, penalty='l2', random_state=None, solver='warn',
          tol=0.1, verbose=0, warm_start=False))])

In [36]:
# Evaluation: predict on the held-out partition and print per-class
# precision / recall / F1.
# NOTE(review): target_names assumes the first (lowest-sorted) label means
# "Negative" and the second "Positive" — TODO confirm against the label encoding.
predicted = lr_clf.predict(test_data)
print(metrics.classification_report(test_label, predicted, target_names=["Negative", "Positive"]))

              precision    recall  f1-score   support

    Negative       0.91      0.88      0.90      5020
    Positive       0.89      0.91      0.90      4980

   micro avg       0.90      0.90      0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [43]:
# Overall accuracy and F1 on the held-out partition, from the predictions
# computed in the evaluation cell.
# NOTE(review): f1_score is called with its default averaging; presumably the
# labels are binary 0/1 — confirm.
score = metrics.accuracy_score(test_label, predicted)
f1_score = metrics.f1_score(test_label, predicted)
print(score)
print(f1_score)

0.8988
0.8999406762902907


In [47]:
# Grid Search
from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


In [45]:
# Pipeline tuned by the grid search: token counts -> TF-IDF weighting -> linear
# classifier trained with SGD. The step names ('vect', 'tfidf', 'clf') are the
# prefixes used by the keys of the parameter grid.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

In [46]:
# Search space for GridSearchCV. Keys follow the "<step name>__<parameter>"
# convention, so each one targets a step of `pipeline`.
# 'clf__max_iter' used to be listed twice; a dict literal keeps only the last
# value for a repeated key, so the earlier (20,) entry was silently discarded.
# The single entry below carries the value that was actually in effect.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (10, 50, 80),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}


In [None]:
# Exhaustive search over `parameters` for `pipeline`; n_jobs=-1 requests all
# available cores. The grid holds 3*4*2*2*2*3*2*2 = 1152 combinations, each of
# which is fitted once per cross-validation fold, so this cell is expensive.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
# Time only the fit itself.
t0 = time()
grid_search.fit(train_data, train_label)
print("done in %0.3fs" % (time() - t0))
print()

# Report the best cross-validated score and, for every searched parameter,
# the value chosen in the best estimator.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

### The other datasets

In [5]:
# Load the object pickled in amazon_data.pkl.
# NOTE(review): absolute, machine-specific path.
amazon_data = load_file("C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\amazon_data.pkl")

In [8]:
# Preview the first rows of the Amazon frame.
amazon_data.head()

Unnamed: 0,reviews.rating,reviews.text,sentiment,review
0,3,I order 3 of them and one of the item is bad q...,0,I order 3 one item bad quality . Is missing ba...
1,4,Bulk is always the less expensive way to go fo...,0,Bulk always less expensive way go products like
2,5,Well they are not Duracell but for the price i...,1,Well Duracell price happy .
3,5,Seem to work as well as name brand batteries a...,1,Seem work well name brand batteries much bette...
4,5,These batteries are very long lasting the pric...,1,These batteries long lasting price great .


In [7]:
# Normalise column names to the `review` / `sentiment` schema used by the
# modelling cells. rename() returns a new frame, so the result is rebound
# instead of mutating the loaded object with inplace=True.
amazon_data = amazon_data.rename(columns={"processed": "review", "class": "sentiment"})

In [10]:
# Persist the frame. NOTE: this writes to the same path amazon_data was loaded
# from, so the original pickle is overwritten.
save_file("C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\amazon_data.pkl", amazon_data)

In [11]:
# Load the object pickled in twitter_data.pkl.
# NOTE(review): absolute, machine-specific path.
twitter_data = load_file("C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\twitter_data.pkl")

# Preview the first rows (columns `clean_text` and `class`, per the output below).
twitter_data.head()

# Column renaming and re-saving of this frame happen in a separate cell further down.



Unnamed: 0,clean_text,class
0,pope john paul ii kiss ground arrival country ...,0
1,christina grimmie audition x factor vote win c...,0
2,going somber set tonight curtis painter 's nic...,0
3,"saeed : nowplaying : bep , ricky martin talkin...",0
4,"good morning , yuki~ ! oh , . snow ? riko real...",1


In [12]:
# Number of rows in the Twitter frame.
len(twitter_data)

4436

In [14]:
# Normalise column names to the `review` / `sentiment` schema used by the
# modelling cells. rename() returns a new frame, so the result is rebound
# instead of mutating the loaded object with inplace=True.
twitter_data = twitter_data.rename(columns={"clean_text": "review", "class": "sentiment"})

# Persist the renamed frame. NOTE: this is the same path twitter_data was
# loaded from, so the original pickle is overwritten.
save_file("C:\\Users\\Ukachi\\PycharmProjects\\paper_codes\\files\\twitter_data.pkl", twitter_data)