In [1]:
import joblib
from nltk.corpus import stopwords
import string
from sklearn.linear_model import LogisticRegression
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import re
import pandas as pd
import pickle
import nltk
import numpy as np


nltk.download('stopwords')

stemmer = SnowballStemmer("english", ignore_stopwords=True)
token = RegexpTokenizer(r'[a-zA-Z0-9]+')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Enes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# importing the dataset
DATASET_ENCODING = "ISO-8859-1"
# DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "tweet"]
# dataset = pd.read_csv('./training.1600000.processed.noemoticon.csv', delimiter=',', encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

dataset = pd.read_csv('./IMDB Dataset.csv', delimiter=',',
                      encoding=DATASET_ENCODING)
# dataset = pd.read_csv('./Corona_NLP_train.csv',
#   delimiter=',', encoding=DATASET_ENCODING)
dataset_dir = 'imdb'
# dataset_dir = 'coronaNLP'
# dataset_dir = 'sentiment140'
model_dir = './models/'+dataset_dir
vector_dir = './vectors/'+dataset_dir

# removing the unnecessary columns and duplicates
# dataset = dataset[['OriginalTweet','Sentiment']]
# dataset = dataset[['tweet','sentiment']]
dataset = dataset[['review', 'sentiment']]
dataset = dataset.drop_duplicates()

# dataset.head()


# Preprocessing


def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)


def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)


def remove_mention(text):
    return re.sub("@[A-Za-z0-9]+", "", text)


def stem_tweets(tweet):
    tokens = tweet.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)


# remove stopwords


stop = set(stopwords.words("english"))


def remove_stopwords(text):
    stop = set(stopwords.words("english"))

    filtered_words = [word.lower()
                      for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)


def preprocess_tweets(tweet):
    tweet = remove_mention(tweet)
    tweet = remove_URL(tweet)
    tweet = remove_punct(tweet)
    tweet = stem_tweets(tweet)
    tweet = remove_stopwords(tweet)
    return tweet



In [3]:

X = dataset['review']
# X = dataset['tweet']
# X = dataset['OriginalTweet']

X = X.apply(preprocess_tweets)

y = dataset['sentiment']
# y = dataset['Sentiment']


In [4]:
X

0        one review mention watch 1 oz episod youll hoo...
1        wonder littl product br br film techniqu unass...
2        thought wonder way spend time hot summer weeke...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job wasnt creativ orig...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    im go disagre previous comment side maltin one...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object

In [5]:

tfidf = pickle.load(open(f"./vectors/{dataset_dir}/tfidf_mnb_87", "rb"))


0        one review mention watch 1 oz episod youll hoo...
1        wonder littl product br br film techniqu unass...
2        thought wonder way spend time hot summer weeke...
3        basic famili littl boy jake think zombi closet...
4        petter mattei love time money visual stun film...
                               ...                        
49995    thought movi right good job wasnt creativ orig...
49996    bad plot bad dialogu bad act idiot direct anno...
49997    cathol taught parochi elementari school nun ta...
49998    im go disagre previous comment side maltin one...
49999    one expect star trek movi high art fan expect ...
Name: review, Length: 50000, dtype: object

In [6]:

X = tfidf.transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [7]:
X_train, y_train

(<40000x20000 sparse matrix of type '<class 'numpy.float64'>'
 	with 3542310 stored elements in Compressed Sparse Row format>,
 39087    negative
 30893    negative
 45278    positive
 16398    negative
 13653    negative
            ...   
 11284    positive
 44732    positive
 38158    negative
 860      positive
 15795    positive
 Name: sentiment, Length: 40000, dtype: object)

In [8]:

X_train.shape, X_test.shape


((40000, 20000), (10000, 20000))

In [11]:

# creating our pipeline that will return an estimator
pipeline = Pipeline([('clf', LogisticRegression(random_state=42, multi_class='ovr', n_jobs=4, verbose=1))])


In [19]:

parameters = {
    'clf__penalty': ('l2', 'none'),
    'clf__solver': ('sag', 'saga')
}

clf = GridSearchCV(pipeline, param_grid=parameters, cv=5, verbose=1)

clf = clf.fit(X_train, y_train)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Usi

convergence after 18 epochs took 2 seconds


[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    1.8s finished


In [21]:

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

print("Best: %f using %s" % (clf.best_score_,
                             clf.best_params_))
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
params = clf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

clf.best_estimator_

              precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

[[4336  625]
 [ 475 4564]]
Best: nan using {'clf__penalty': 'l2', 'clf__solver': 'sag'}
nan (nan) with: {'clf__penalty': 'l2', 'clf__solver': 'sag'}
nan (nan) with: {'clf__penalty': 'l2', 'clf__solver': 'saga'}
nan (nan) with: {'clf__penalty': 'none', 'clf__solver': 'sag'}
nan (nan) with: {'clf__penalty': 'none', 'clf__solver': 'saga'}


Pipeline(steps=[('clf',
                 LogisticRegression(multi_class='ovr', n_jobs=4,
                                    random_state=42, solver='sag',
                                    verbose=1))])

In [None]:

acc = int(accuracy_score(y_test, y_pred)*100)

# exporting the pipeline
# pickle.dump(clf, open(f'{model_dir}/LRG_model_{acc}', 'wb'))
# pickle.dump(pipeline['tfidf'], open(f'{vector_dir}/tfidf_lrg_{acc}', 'wb'))

joblib.dump(clf.best_estimator_, f'{model_dir}/LRG_model_{acc}.pkl')
