In [1]:
import pandas as pd 
import re
import string

import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Load the pre-cleaned tweet dataset and discard the 'tag' column in one chained
# expression, so the cell builds `df` without mutating a frame in place.
df = pd.read_csv('my_cleaned.csv').drop(columns='tag')

In [3]:
# Encode the string sentiment labels as the integer classes used for training.
le = {'positive':1, 'neutral':0, 'negative':-1}

# Values that are not keys of `le` pass through unchanged, so re-running this cell on
# an already-encoded column is a no-op. A plain `Series.map(le)` would turn every
# already-encoded label into NaN on the second run.
df['class'] = df['class'].map(lambda label: le.get(label, label))

# Fail loudly on labels outside the mapping (including missing values) instead of
# letting them flow into the train/test split.
unexpected_labels = set(df['class'].unique()) - set(le.values())
assert not unexpected_labels, f"Unmapped sentiment labels in 'class': {unexpected_labels}"

In [4]:
class CleanText(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw tweet text.

    ``transform`` runs every text through these steps, in order:
    remove @mentions, remove #hashtags, replace line breaks with spaces, remove
    URLs, remove emoji, replace ASCII punctuation with spaces, lower-case,
    drop English stop words (keeping negations) and one-character tokens,
    and lemmatise every token as a verb.

    The NLTK ``stopwords``, ``wordnet`` and tokenizer resources are used by the
    stop-word and lemmatising steps and must already be available to NLTK.
    """

    # Patterns and tables are built once at class-definition time rather than on
    # every call; the per-row helpers below are invoked once per document.
    _MENTION_RE = re.compile(r'@\w+')
    _HASHTAG_RE = re.compile(r'#([^\s]+)')
    _NEWLINE_RE = re.compile(r'[\r\n]+')
    _URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    # Explicit emoji/symbol ranges only. A single catch-all range from U+24C2 to
    # U+1F251 would also delete CJK, Hangul, fullwidth forms and every other
    # script located between those code points.
    _EMOJI_RE = re.compile(
        "["
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # geometric shapes extended
        "\U0001F800-\U0001F8FF"  # supplemental arrows-C
        "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        "\U0001FA00-\U0001FA6F"  # chess symbols
        "\U0001FA70-\U0001FAFF"  # symbols and pictographs extended-A
        "\U000024C2"             # circled latin capital letter M
        "\U000025A0-\U000025FF"  # geometric shapes
        "\U00002600-\U000027BF"  # miscellaneous symbols + dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U0000FE00-\U0000FE0F"  # variation selectors
        "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
        "]+")
    _PUNCT_TABLE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # Negations carry sentiment, so they survive stop-word removal.
    # NOTE(review): "n't" can never match here because `transform` strips
    # punctuation before stop words are removed ("don't" -> "don t"), so
    # contracted negations are currently lost. Left as is to avoid silently
    # changing the model's features.
    _STOPWORD_WHITELIST = frozenset(["n't", "not", "no"])
    _stopword_cache = None  # filled lazily on first use, see remove_stopwords

    def remove_mentions(self, input_text):
        """Return ``input_text`` with every ``@name`` token removed."""
        return self._MENTION_RE.sub('', input_text)

    def remove_hashtags(self, input_text):
        """Return ``input_text`` with every ``#tag`` (marker and word) removed."""
        return self._HASHTAG_RE.sub('', input_text)

    def remove_newlines(self, input_text):
        """Return ``input_text`` with each run of line breaks replaced by a space.

        A space (rather than nothing) keeps the last word of one line from being
        fused with the first word of the next, and keeps the following line from
        being glued onto a trailing URL, which ``remove_urls`` would then delete.
        """
        return self._NEWLINE_RE.sub(' ', input_text)

    def remove_urls(self, input_text):
        """Return ``input_text`` without ``www.`` and ``http(s)://`` URLs."""
        return self._URL_RE.sub('', input_text)

    def emoji_oneword(self, input_text):
        """Return ``input_text`` with emoji and pictographic symbols removed."""
        return self._EMOJI_RE.sub('', input_text)

    def remove_puntuaction(self, input_text):
        """Return ``input_text`` with each ASCII punctuation character replaced by a space."""
        return input_text.translate(self._PUNCT_TABLE)

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    remove_punctuation = remove_puntuaction

    def to_lower(self, input_text):
        """Return ``input_text`` lower-cased."""
        return input_text.lower()

    def remove_stopwords(self, input_text):
        """Return ``input_text`` without English stop words and one-character tokens.

        Whitelisted negations are kept even though NLTK lists them as stop
        words. Tokens are split on whitespace and re-joined with single spaces.
        """
        if CleanText._stopword_cache is None:
            # Load the corpus once and use a set: the original re-read the list
            # and scanned it linearly for every token of every row.
            CleanText._stopword_cache = (
                frozenset(stopwords.words('english')) - CleanText._STOPWORD_WHITELIST
            )
        dropped = CleanText._stopword_cache
        kept = [word for word in input_text.split() if len(word) > 1 and word not in dropped]
        return ' '.join(kept)

    def lemmatizing(self, input_text):
        """Return ``input_text`` with every token lemmatised as a verb (``pos='v'``)."""
        lemmatizer = WordNetLemmatizer()
        tokens = word_tokenize(input_text)
        return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in tokens)

    def fit(self, X, y=None, **fit_params):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """Clean every text in ``X`` and return the result.

        Parameters
        ----------
        X : pandas.Series
            Raw texts; each element is passed through the string helpers above.

        Returns
        -------
        pandas.Series
            Cleaned texts, same index as ``X``.
        """
        steps = (
            self.remove_mentions,
            self.remove_hashtags,
            self.remove_newlines,
            self.remove_urls,
            self.emoji_oneword,
            self.remove_puntuaction,
            self.to_lower,
            self.remove_stopwords,
            self.lemmatizing,
        )

        def clean_one(text):
            # Apply the whole pipeline to one document, in order.
            for step in steps:
                text = step(text)
            return text

        # One pass over the Series instead of nine chained `.apply` calls.
        return X.apply(clean_one)

In [5]:
class Train_and_test:
    """Train and evaluate a Multinomial Naive Bayes sentiment classifier on tweets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``tweet_text`` column (raw text) and a ``class`` column
        (labels). The frame is only read, never modified.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    save_model : bool, default False
        When true, the fitted classifier is dumped with ``joblib`` after training.
    vectorizer : {'count', 'tfidf'}, default 'count'
        Which text vectorizer (and matching classifier instance) to use.
    seed : int, default 42
        ``random_state`` for the train/test split.

    Attributes
    ----------
    x_test : pandas.DataFrame or None
        Cleaned test texts (column ``tweet_text``) after ``train_and_test``.
    y_test : pandas.DataFrame or None
        True test labels (column ``class``) after ``train_and_test``.
    predict : pandas.DataFrame or None
        Predicted test labels (column ``class``) after ``train_and_test``.
    """

    def __init__(self, df, test_size=0.3, save_model=False, vectorizer='count', seed=42):
        self._df = df
        self._test_size = test_size
        self._save_model = save_model
        self._vectorizer = vectorizer
        self._seed = seed
        self._text_cleaner = CleanText()
        self._countvect = CountVectorizer()
        self._tfidf_vect = TfidfVectorizer()
        self._nb_tf_model = MultinomialNB()
        self._nb_count_model = MultinomialNB()
        self.x_test = None
        self.y_test = None
        self.predict = None

    def _active_components(self):
        """Return ``(vectorizer, model, display_name, model_path)`` for the configured
        vectorizer, or ``None`` when the configured name is not supported."""
        if self._vectorizer == 'count':
            return self._countvect, self._nb_count_model, 'CountVectorizer', 'nb_count_model.pkl'
        if self._vectorizer == 'tfidf':
            return self._tfidf_vect, self._nb_tf_model, 'TfidfVectorizer', 'nb_tf_model.pkl'
        return None

    def train_and_test(self):
        """Clean the text, split, fit the selected model, and report test scores.

        Side effects: writes ``y_test.csv``, ``X_test.csv`` and ``prediction.csv``
        (row-aligned, no index) to the working directory, prints the macro F1
        and accuracy on the test split, and optionally dumps the classifier.
        With an unsupported ``vectorizer`` only a message is printed.
        """
        components = self._active_components()
        if components is None:
            # Reject the configuration before any cleaning or file output happens.
            print("Invalid vectorizer. Choose between 'count' and 'tfidf' ")
            return
        vectorizer, model, display_name, model_path = components

        # Work on the cleaned Series directly. Writing it back into the caller's
        # DataFrame would make a second call clean already-cleaned text and
        # would silently change a frame the rest of the notebook still uses.
        X = self._text_cleaner.fit_transform(self._df.tweet_text)
        y = self._df['class']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self._test_size, random_state=self._seed, stratify=y
        )

        self.y_test = y_test.to_frame(name='class')
        self.y_test.to_csv('y_test.csv', index=False)

        # Store the cleaned test texts. Wrapping the vectorizer's SciPy sparse
        # output in `pd.DataFrame(..., columns=['class'])` does not produce a
        # feature table, so the exported X_test.csv was unusable.
        self.x_test = X_test.to_frame(name='tweet_text')
        self.x_test.to_csv('X_test.csv', index=False)

        train_features = vectorizer.fit_transform(X_train)
        model.fit(train_features, y_train)
        predictions = model.predict(vectorizer.transform(X_test))

        self.predict = pd.DataFrame(predictions, columns=['class'])
        self.predict.to_csv('prediction.csv', index=False)

        print(f'The scores for the model using {display_name}...')
        # Macro-averaged F1 for both vectorizers so the two are comparable;
        # micro-averaged F1 on single-label multiclass data equals accuracy.
        print("F1 Score: ", f1_score(y_test, predictions, average='macro'))
        print("Accuracy Score: ", accuracy_score(y_test, predictions))

        if self._save_model:
            # NOTE(review): only the classifier is persisted. Scoring new text
            # from the pickle also needs the fitted vectorizer, which is not saved.
            joblib.dump(model, model_path)
            print(f"Model saved as '{model_path}'!")

    def user_predict(self, input_text):
        """Print the predicted sentiment of a single text.

        ``input_text`` is wrapped in a ``pandas.Series``, cleaned and vectorized
        with the already fitted vectorizer, so ``train_and_test`` must have been
        run first. The prediction is read with ``.item()``, which requires that
        exactly one text was supplied.
        """
        components = self._active_components()
        if components is None:
            print("Invalid vectorizer. Choose between 'count' and 'tfidf' ")
            return
        vectorizer, model, _, _ = components

        clean_text = self._text_cleaner.transform(pd.Series(input_text))
        predict = model.predict(vectorizer.transform(clean_text)).item()

        if predict == 1:
            print('The input text is positive.')
        elif predict == 0:
            print('The input text is neutral.')
        else:
            print('the input text is negative.')

In [6]:
# Build the trainer with its default settings: 30% test split, CountVectorizer
# features, seed 42, and no model persistence.
my_class = Train_and_test(df)

In [7]:
# Clean the tweets, fit the classifier and print its scores on the held-out split.
# Also writes y_test.csv, X_test.csv and prediction.csv to the working directory.
my_class.train_and_test()

The scores for the model using CountVectorizer...
F1 Score:  0.7804102286075719
Accuracy Score:  0.847457627118644


In [8]:
# my_class.user_predict('it was bad')