In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from joblib import dump, load

# Get data

In [2]:
# Training split. The file is parsed with header=None, so the four column
# labels are attached explicitly right after reading.
df_train = (
    pd.read_csv('twitter_training.csv', header=None)
    .set_axis(['#', 'refers to', 'sentiment', 'text'], axis=1)
)
df_train.head()

Unnamed: 0,#,refers to,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# Validation split, parsed and labelled exactly like the training split.
df_test = (
    pd.read_csv('twitter_validation.csv', header=None)
    .set_axis(['#', 'refers to', 'sentiment', 'text'], axis=1)
)
df_test.head()

Unnamed: 0,#,refers to,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


# Feature engineering

In [4]:
# Show (rows, columns) for the validation frame, then for the training frame.
[df_test.shape, df_train.shape]

[(1000, 4), (74682, 4)]

In [5]:
# Missing values per column: validation frame first, training frame second.
print(df_test.isna().sum(), df_train.isna().sum(), sep=' \n\n ')

#            0
refers to    0
sentiment    0
text         0
dtype: int64 

 #              0
refers to      0
sentiment      0
text         686
dtype: int64


In [6]:
# Drop every training row that contains a missing value (the null count above
# reports them in the `text` column). The frame is rebound rather than mutated
# with inplace=True so the cell reads as "new frame from old frame".
df_train = df_train.dropna(axis=0)

In [7]:
# Exclude four rows by index label, then renumber the remaining rows 0..n-1.
# NOTE(review): the reason these particular rows are excluded is not visible
# here - confirm and document it.
# NOTE(review): because the index is renumbered afterwards, running this cell a
# second time would remove four *different* rows; run it once per fresh frame.
ids_to_remove = [1826, 10454, 32186, 68078]
df_train = df_train.loc[~df_train.index.isin(ids_to_remove)].reset_index(drop=True)

In [8]:
from sklearn import preprocessing
# Encode the sentiment strings as integers. The encoder is fitted on the
# training labels only and that same mapping is reused for the validation
# labels, so a given integer always means the same class in both splits.
le = preprocessing.LabelEncoder()

X_train = df_train['text']
y_train = le.fit_transform(df_train['sentiment'])

X_test = df_test['text']
# transform, not fit_transform: re-fitting on the validation labels would
# silently renumber the classes if that split lacked (or added) a label.
# An unseen validation label now fails loudly with a ValueError instead.
y_test = le.transform(df_test['sentiment'])

In [9]:
# Integer codes present in the validation labels, then their class names.
present_codes = np.unique(y_test)
print(present_codes)
print(le.inverse_transform(present_codes))

[0 1 2 3]
['Irrelevant' 'Negative' 'Neutral' 'Positive']


In [10]:
import nltk
# The corpus reader is imported under its own alias so that the notebook-level
# name `stopwords` only ever refers to the resulting set of words. Previously
# the set overwrote the reader it had just been built from.
from nltk.corpus import stopwords as nltk_stopwords
nltk.download('stopwords')
# English stopword list as a set; later cells (Preprocessor default, dump) read
# it through the name `stopwords`.
stopwords = set(nltk_stopwords.words('english'))
len(stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ppadr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [54]:
# Persist the stopword set to 'stopwords.data' with joblib.
# NOTE(review): execution count 54 is out of sequence, and the same dump is
# repeated in a later cell - one of the two is redundant.
dump(stopwords, 'stopwords.data') 

['stopwords.data']

In [11]:
import spacy
# Load the spaCy pipeline "en_core_web_sm"; Preprocessor.lemmatize reads this
# notebook-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import regex as re

In [55]:
# Persist the spaCy pipeline and the stopword set with joblib.
# NOTE(review): execution count 55 is out of sequence; on a fresh kernel this
# cell only works because `nlp` and `stopwords` are defined in the cells above.
# NOTE(review): joblib files are pickle-based - only load them from trusted
# sources. The stopwords dump duplicates the earlier dump cell.
dump(nlp, 'nlp.path') 
dump(stopwords, 'stopwords.data') 

['stopwords.data']

In [12]:
class Preprocessor:
    """Text-cleaning pipeline that ends in a TF-IDF vectorizer.

    ``transform`` drops null and duplicated texts, lowercases, strips URLs,
    replaces every character outside ``a-z`` and space with a space, collapses
    runs of spaces, removes stopwords and digit-only tokens, lemmatizes with
    spaCy and finally vectorizes the result with ``self.vectorizer``.

    Parameters
    ----------
    stopwords : collection of str
        Words dropped by ``remove_stopwords``. Defaults to the notebook-level
        ``stopwords`` object (bound when the class is defined).
    nlp_model : spaCy pipeline, optional
        Pipeline used by ``lemmatize``. When omitted, the notebook-level
        ``nlp`` object is looked up each time ``lemmatize`` runs.

    Attributes
    ----------
    vectorizer : TfidfVectorizer
        Fitted by the first ``transform(..., mode='train')`` call.
    vectorizer_fitted : bool
        True once ``vectorizer.fit`` has completed successfully.
    train_idx, test_idx : pandas.Index
        Index labels of the rows kept by the most recent ``transform`` call in
        the corresponding mode; callers use them to align the labels.
    """

    def __init__(self, stopwords=stopwords, nlp_model=None):
        # lowercase=False because transform() lowercases the text itself.
        self.vectorizer = TfidfVectorizer(lowercase=False, max_features=8000,
                                          min_df=10, ngram_range=(1, 3),
                                          tokenizer=None)
        self.stopwords = stopwords
        self.nlp_model = nlp_model
        self.vectorizer_fitted = False

    def remove_urls(self, texts):
        """Return ``texts`` with URL-like substrings deleted.

        NOTE(review): the first alternative ends in ``.+``, so a match that
        starts at ``something.com/`` runs to the end of the line - confirm
        that dropping the remainder of the text is intended.
        """
        print('Removing URLs...')
        # Raw string: same pattern as before, without invalid "\w" / "\S"
        # escape sequences in a normal string literal.
        pattern = re.compile(r'(\w+\.com ?/ ?.+)|(http\S+)')
        return [pattern.sub('', text) for text in texts]

    def remove_double_space(self, texts):
        """Collapse every run of spaces in each text into a single space."""
        print('Removing double space...')
        pattern = re.compile(' +')
        return [pattern.sub(' ', text) for text in texts]

    def remove_punctuation(self, texts):
        """Replace every character other than ``a-z`` and space with a space.

        Uppercase letters are replaced too, so callers lowercase first.
        """
        print('Removing Punctuation...')
        pattern = re.compile('[^a-z ]')
        return [pattern.sub(' ', text) for text in texts]

    def remove_stopwords(self, texts):
        """Split each text on single spaces and drop tokens found in ``self.stopwords``.

        Returns a list of token lists (not strings); ``remove_numbers`` joins
        them back together.
        """
        print('Removing stopwords...')
        return [[w for w in text.split(' ') if w not in self.stopwords] for text in tqdm(texts)]

    def remove_numbers(self, texts):
        """Drop digit-only tokens and join each token list into one string.

        Expects the list-of-token-lists produced by ``remove_stopwords``.
        """
        print('Removing numbers...')
        return [' '.join([w for w in text if not w.isdigit()]) for text in tqdm(texts)]

    def remove_emojis(self, texts):
        """Delete characters in the listed emoji / pictograph code-point ranges."""
        print('Removing emojis...')
        pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags=re.UNICODE)
        return [pattern.sub(r'', text) for text in texts]

    def lemmatize(self, texts):
        """Return each text rebuilt from the space-joined lemmas of its spaCy tokens."""
        print('Lemmatizing...')
        pipeline = getattr(self, 'nlp_model', None)
        if pipeline is None:
            # Fall back to the notebook-level pipeline, as the original code did.
            pipeline = nlp
        # tqdm cannot infer a length from the pipe() generator, so pass it on.
        total = len(texts) if hasattr(texts, '__len__') else None
        # Language.pipe processes the texts in batches, which is spaCy's
        # recommended way to run a pipeline over many documents.
        return [' '.join([token.lemma_ for token in doc])
                for doc in tqdm(pipeline.pipe(texts), total=total)]

    def transform(self, X, y=None, mode='train'):
        """Clean ``X`` and return its TF-IDF matrix.

        Parameters
        ----------
        X : pandas.Series of str
            Raw texts. Null and duplicated entries are dropped, so the result
            can have fewer rows than ``X``; the surviving index labels are
            stored in ``self.train_idx`` (``mode == 'train'``) or
            ``self.test_idx`` (any other mode).
        y : ignored
            Accepted for call-signature compatibility; never read.
        mode : str
            ``'train'`` fits the vectorizer on the first call. Any other value
            only transforms and requires an already fitted vectorizer.

        Returns
        -------
        Sparse matrix produced by ``self.vectorizer.transform``.

        Raises
        ------
        RuntimeError
            If called with a non-``'train'`` mode before the vectorizer has
            been fitted. Previously the vocabulary was silently fitted on
            whatever data arrived first, including validation data.
        """
        # Fail before the (slow) cleaning steps rather than after them.
        if not self.vectorizer_fitted and mode != 'train':
            raise RuntimeError(
                "Preprocessor.transform was called with mode=%r before the "
                "vectorizer was fitted; call transform(X, mode='train') first."
                % (mode,)
            )

        X = X.copy()

        print('Removing Nans...')
        X = X[~X.isnull()]
        X = X[~X.duplicated()]

        if mode == 'train':
            self.train_idx = X.index
        else:
            self.test_idx = X.index

        # The former per-text count of upper-case tokens was computed here but
        # never used or returned, so it has been removed.

        print('Lowering...')
        X = [text.lower() for text in X]

        X = self.remove_urls(X)
        X = self.remove_punctuation(X)
        X = self.remove_double_space(X)
        # remove_punctuation has already replaced every non a-z character, so
        # the emoji and digit filters below find nothing left to remove;
        # remove_numbers is still required to join the token lists.
        X = self.remove_emojis(X)
        X = self.remove_stopwords(X)
        X = self.remove_numbers(X)
        X = self.lemmatize(X)

        if not self.vectorizer_fitted:
            print('Fitting vectorizer...')
            self.vectorizer.fit(X)
            # Set the flag only after fit() succeeded, so a failed fit can be
            # retried instead of leaving an unfitted vectorizer marked fitted.
            self.vectorizer_fitted = True

        print('Vectorizing...')
        X = self.vectorizer.transform(X)

        return X

In [13]:
# Single Preprocessor instance shared by the training and validation
# transforms below, so both use the same vectorizer.
pr = Preprocessor()

In [14]:
# Clean and vectorise the training tweets (on a fresh Preprocessor this first
# call also fits the TF-IDF vectorizer), wrapping the sparse result in a frame
# whose columns are the vectorizer's feature names.
data_train = pd.DataFrame.sparse.from_spmatrix(
    pr.transform(X_train),
    columns=pr.vectorizer.get_feature_names_out(),
)
# Keep only the labels whose rows survived the preprocessor's null/duplicate
# filtering, then give them the same index as the feature frame.
y_train = pd.DataFrame(y_train)
y_train = y_train.loc[y_train.index.isin(pr.train_idx)].set_axis(data_train.index, axis=0)

Removing Nans...
Counting capitalized...
Lowering...
Removing URLs...
Removing Punctuation...
Removing double space...
Removing emojis...
Removing stopwords...


100%|█████████████████████████████████████████████████████████████████████████| 69488/69488 [00:01<00:00, 37269.60it/s]


Removing numbers...


100%|████████████████████████████████████████████████████████████████████████| 69488/69488 [00:00<00:00, 142066.48it/s]


Lemmatizing...


100%|████████████████████████████████████████████████████████████████████████████| 69488/69488 [24:18<00:00, 47.65it/s]


Fitting vectorizer...
Vectorizing...


In [15]:
# From here on X_train is the TF-IDF feature frame rather than the raw text.
X_train = data_train
[X_train.shape, y_train.to_numpy().ravel().shape]

[(69488, 8000), (69488,)]

In [16]:
# Clean and vectorise the validation tweets with the already configured
# preprocessor, using the vectorizer's feature names as column labels.
data_test = pd.DataFrame.sparse.from_spmatrix(
    pr.transform(X_test, mode='test'),
    columns=pr.vectorizer.get_feature_names_out(),
)
# Keep only the labels whose rows survived the preprocessor's null/duplicate
# filtering, then give them the same index as the feature frame.
y_test = pd.DataFrame(y_test)
y_test = y_test.loc[y_test.index.isin(pr.test_idx)].set_axis(data_test.index, axis=0)

Removing Nans...
Counting capitalized...
Lowering...
Removing URLs...
Removing Punctuation...
Removing double space...
Removing emojis...
Removing stopwords...


100%|█████████████████████████████████████████████████████████████████████████████| 999/999 [00:00<00:00, 49924.46it/s]


Removing numbers...


100%|████████████████████████████████████████████████████████████████████████████| 999/999 [00:00<00:00, 166399.65it/s]


Lemmatizing...


100%|████████████████████████████████████████████████████████████████████████████████| 999/999 [00:19<00:00, 50.38it/s]


Vectorizing...


In [17]:
# From here on X_test is the TF-IDF feature frame rather than the raw text.
X_test = data_test
[data_test.shape, y_test.to_numpy().ravel().shape]

[(999, 8000), (999,)]

In [39]:
# Persist the preprocessor's TF-IDF vectorizer to 'vectorizer.model'.
# NOTE(review): execution count 39 is out of sequence with the neighbouring cells.
dump(pr.vectorizer, 'vectorizer.model') 

['vectorizer.model']

# Classification 

In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features; the labels are flattened from a
# one-column frame to a 1-D array before fitting.
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train, y_train.values.ravel()
)

# Persist the fitted model so later cells can reload it.
dump(log_reg, 'log_reg.model')

['log_reg.model']

In [21]:
# Reload the persisted logistic-regression model and predict the validation set.
# NOTE: `clf` and `y_pred` are reused by the later model cells.
clf = load('log_reg.model') 
y_pred  = clf.predict(X_test)

In [22]:
from sklearn.metrics import classification_report

# Report against the encoder's full class list. Passing `labels` together with
# `target_names` keeps rows and names aligned even if a class is missing from
# y_test or appears only in y_pred; deriving the names from np.unique(y_test)
# alone would mismatch in that case.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

              precision    recall  f1-score   support

  Irrelevant       0.90      0.78      0.84       172
    Negative       0.81      0.90      0.85       266
     Neutral       0.88      0.79      0.83       285
    Positive       0.82      0.89      0.85       276

    accuracy                           0.84       999
   macro avg       0.85      0.84      0.84       999
weighted avg       0.85      0.84      0.84       999



In [23]:
from sklearn.tree import DecisionTreeClassifier

## Criterion: entropy or gini
classifier = DecisionTreeClassifier(max_depth=600, criterion='gini', random_state = 42) 
# Pass the labels as a flat 1-D array, as the other model cells do, instead of
# the one-column DataFrame.
classifier.fit(X_train, y_train.values.ravel())

# Persist the fitted tree so later cells can reload it.
dump(classifier, 'tree.model') 

['tree.model']

In [24]:
# Reload the persisted decision tree and predict the validation set
# (overwrites the previous `clf` / `y_pred`).
clf = load('tree.model') 
y_pred  = clf.predict(X_test)

In [25]:
# Report against the encoder's full class list. Passing `labels` together with
# `target_names` keeps rows and names aligned even if a class is missing from
# y_test or appears only in y_pred; deriving the names from np.unique(y_test)
# alone would mismatch in that case.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

              precision    recall  f1-score   support

  Irrelevant       0.93      0.91      0.92       172
    Negative       0.90      0.93      0.92       266
     Neutral       0.90      0.91      0.90       285
    Positive       0.92      0.90      0.91       276

    accuracy                           0.91       999
   macro avg       0.91      0.91      0.91       999
weighted avg       0.91      0.91      0.91       999



In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features; the labels are flattened from a
# one-column frame to a 1-D array before fitting.
clf = RandomForestClassifier(random_state=42, max_depth=600).fit(
    X_train, y_train.values.ravel()
)

# Persist the fitted forest so later cells can reload it.
dump(clf, 'forest.model')

['forest.model']

In [27]:
# Reload the persisted random forest and predict the validation set
# (overwrites the previous `clf` / `y_pred`).
clf = load('forest.model') 
y_pred  = clf.predict(X_test)

In [28]:
# Report against the encoder's full class list. Passing `labels` together with
# `target_names` keeps rows and names aligned even if a class is missing from
# y_test or appears only in y_pred; deriving the names from np.unique(y_test)
# alone would mismatch in that case.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

              precision    recall  f1-score   support

  Irrelevant       0.99      0.92      0.95       172
    Negative       0.94      0.96      0.95       266
     Neutral       0.96      0.94      0.95       285
    Positive       0.92      0.96      0.94       276

    accuracy                           0.95       999
   macro avg       0.95      0.95      0.95       999
weighted avg       0.95      0.95      0.95       999



# All together

In [29]:
# Reload the three persisted classifiers, then predict the validation features
# with each of them.
log_reg, tree, forest = (
    load(model_path)
    for model_path in ('log_reg.model', 'tree.model', 'forest.model')
)
y_lr, y_tree, y_forest = (
    fitted.predict(X_test) for fitted in (log_reg, tree, forest)
)

In [30]:
from statistics import mode

# Hard majority vote: for every validation row take the most common of the
# three model predictions.
# NOTE(review): with three voters and four classes a 1-1-1 split has no
# majority; confirm how statistics.mode resolves it on the Python version used.
y_pred = [
    mode([lr_vote, tree_vote, forest_vote])
    for lr_vote, tree_vote, forest_vote in zip(y_lr, y_tree, y_forest)
]

In [31]:
# Report against the encoder's full class list. Passing `labels` together with
# `target_names` keeps rows and names aligned even if a class is missing from
# y_test or appears only in y_pred; deriving the names from np.unique(y_test)
# alone would mismatch in that case.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

              precision    recall  f1-score   support

  Irrelevant       0.99      0.93      0.96       172
    Negative       0.95      0.95      0.95       266
     Neutral       0.96      0.94      0.95       285
    Positive       0.91      0.96      0.93       276

    accuracy                           0.95       999
   macro avg       0.95      0.95      0.95       999
weighted avg       0.95      0.95      0.95       999



In [32]:
# Sanity check: run the three-model majority vote on a single validation row.
# ("prueba" is Spanish for "test".)
prueba = data_test[0:1].values

x_std = prueba

log_reg = load('log_reg.model') 
y_lr  = log_reg.predict(x_std)

tree = load('tree.model') 
y_tree  = tree.predict(x_std)

forest = load('forest.model') 
# Predict the same single row as the other two models. This previously scored
# the whole validation frame (X_test) and only happened to read element 0.
y_forest  = forest.predict(x_std)

r = [y_lr[0], y_tree[0], y_forest[0]]

mode(r)



0

In [33]:
# Encoded label of the first validation row, for comparison with the vote above.
y_test[0:1]

Unnamed: 0,0
0,0


In [40]:
# Reload the persisted TF-IDF vectorizer from 'vectorizer.model'.
# NOTE(review): this rebinds `r`, which an earlier cell used for the list of
# model votes; execution count 40 is also out of sequence.
r = load('vectorizer.model') 