In [14]:
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

import re
import pandas as pd
import pickle
import nltk
import preprocessor as p


In [15]:

# Load the Corona NLP training CSV, keep only the raw tweet text and its label,
# drop exact duplicate rows, then derive the two working columns:
#   tweet     - OriginalTweet passed through tweet-preprocessor's p.clean
#               (the displayed preview shows @mentions and URLs removed)
#   sentiment - copy of the Sentiment label column
DATASET_ENCODING = "ISO-8859-1"

dataset = (
    pd.read_csv('./Corona_NLP_train.csv', delimiter=',', encoding=DATASET_ENCODING)
    [['OriginalTweet', 'Sentiment']]
    .drop_duplicates()
    .assign(
        tweet=lambda frame: frame['OriginalTweet'].apply(p.clean),
        sentiment=lambda frame: frame['Sentiment'],
    )
)

# Alphanumeric-run tokenizer; handed to TfidfVectorizer when the pipeline is built.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

dataset.head()


Unnamed: 0,OriginalTweet,Sentiment,tweet,sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,and and,Neutral
1,advice Talk to your neighbours family to excha...,Positive,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"Me, ready to go at supermarket during the outb...",Extremely Negative


In [16]:

# Features are the cleaned tweet text; targets are the sentiment labels.
X, y = dataset['tweet'], dataset['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.2,
)

X_train.shape, X_test.shape


((32925,), (8232,))

In [21]:

# Text-classification pipeline: TF-IDF features over unigrams and bigrams,
# followed by an XGBoost classifier.
# `token_pattern=None` is passed because a custom `tokenizer` is supplied; with the
# default pattern left in place scikit-learn emits a "token_pattern will not be
# used" UserWarning on every fit, which floods the output of the grid search.
# NOTE(review): recent xgboost releases are reported to reject string class labels
# - confirm the installed version accepts the sentiment strings in y_train, or
# encode them before fitting.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), tokenizer=token.tokenize, token_pattern=None)),
    ('clf', XGBClassifier(random_state=42)),
])


In [22]:

# Search space: 2 vocabulary sizes x 4 learning rates = 8 candidate settings.
parameters = dict(
    tfidf__max_features=(10000, 20000),
    clf__learning_rate=(0.01, 0.001, 0.1, 0.0001),
)

# Exhaustive search over the grid, each candidate scored with 5-fold cross-validation.
clf = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5)

clf.fit(X_train, y_train)










































































































































KeyboardInterrupt: 

In [None]:

# Predict sentiment for the held-out tweets using the fitted grid search object.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))


                    precision    recall  f1-score   support

Extremely Negative       0.55      0.42      0.47      1083
Extremely Positive       0.53      0.50      0.51      1368
          Negative       0.43      0.50      0.46      1958
           Neutral       0.58      0.50      0.53      1538
          Positive       0.43      0.48      0.45      2285

          accuracy                           0.48      8232
         macro avg       0.50      0.48      0.49      8232
      weighted avg       0.49      0.48      0.48      8232



In [None]:

# Headline result: best mean cross-validation score and the parameters that produced it.
print("Best: %f using %s" % (clf.best_score_, clf.best_params_))

# Pull the per-candidate mean score, score std-dev and parameter dict out of cv_results_.
means, stds, params = map(
    clf.cv_results_.get,
    ('mean_test_score', 'std_test_score', 'params'),
)

# One line per candidate: mean (std) with: {params}
for summary in zip(means, stds, params):
    print("%f (%f) with: %r" % summary)


Best: 0.473106 using {'clf__alpha': 1, 'clf__fit_prior': False, 'tfidf__max_features': 20000}
0.467821 (0.003047) with: {'clf__alpha': 1, 'clf__fit_prior': False, 'tfidf__max_features': 10000}
0.473106 (0.003482) with: {'clf__alpha': 1, 'clf__fit_prior': False, 'tfidf__max_features': 20000}
0.446925 (0.003609) with: {'clf__alpha': 1, 'clf__fit_prior': True, 'tfidf__max_features': 10000}
0.430676 (0.002815) with: {'clf__alpha': 1, 'clf__fit_prior': True, 'tfidf__max_features': 20000}
0.461230 (0.002739) with: {'clf__alpha': 0.1, 'clf__fit_prior': False, 'tfidf__max_features': 10000}
0.463022 (0.002974) with: {'clf__alpha': 0.1, 'clf__fit_prior': False, 'tfidf__max_features': 20000}
0.468914 (0.003437) with: {'clf__alpha': 0.1, 'clf__fit_prior': True, 'tfidf__max_features': 10000}
0.467274 (0.003878) with: {'clf__alpha': 0.1, 'clf__fit_prior': True, 'tfidf__max_features': 20000}
0.460228 (0.004661) with: {'clf__alpha': 0.01, 'clf__fit_prior': False, 'tfidf__max_features': 10000}
0.454366

In [None]:

# Export the fitted grid-search object (vectorizer + classifier) with pickle.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; the original bare open() call left the handle dangling.
# NOTE(review): the file name says 'mnb' although the pipeline's classifier is
# XGBClassifier - confirm whatever loads this path expects that model.
with open('./models/mnb_pipeline_grid', 'wb') as model_file:
    pickle.dump(clf, model_file)