In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

import spacy

import warnings
warnings.filterwarnings('ignore')

In [2]:
col = ['id','country','Label','Text']
data = pd.read_csv("twitter_training.csv", names=col)

In [3]:
data.head()


Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
data.tail()

Unnamed: 0,id,country,Label,Text
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [5]:
data.shape

(74682, 4)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   Label    74682 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [7]:
data['Label'].value_counts()

Label
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [8]:
print(f"{data['Text'][2]} -> {data['Label'][2]}")

im getting on borderlands and i will kill you all, -> Positive


In [9]:
data.dropna(inplace=True)
# Preprocess Function
nlp = spacy.load("en_core_web_sm") 
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 
data['Preprocessed Text'] = data['Text'].apply(preprocess) 

In [10]:
data

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [11]:
le = LabelEncoder()
data['Label'] = le.fit_transform(data['Label'])

In [12]:
data

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,3,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,3,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,3,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,3,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [13]:
x_train, x_test, y_train, y_test = train_test_split(data['Preprocessed Text'], data['Label'], 
                                                    test_size=0.2, random_state=42, stratify=data['Label'])

In [14]:
x_test.shape

(14800,)

In [15]:
x_train.shape

(59196,)

In [16]:
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (MultinomialNB()))         
])

In [17]:
clf.fit(x_train, y_train)

In [18]:
y_pred = clf.predict(x_test)

In [19]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7312837837837838
              precision    recall  f1-score   support

           0       0.95      0.46      0.62      2575
           1       0.65      0.90      0.76      4472
           2       0.84      0.63      0.72      3622
           3       0.71      0.81      0.76      4131

    accuracy                           0.73     14800
   macro avg       0.79      0.70      0.71     14800
weighted avg       0.77      0.73      0.72     14800



In [20]:
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (RandomForestClassifier()))         
])

In [21]:
clf.fit(x_train, y_train)

In [22]:
y_pred = clf.predict(x_test)

In [23]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9131756756756757
              precision    recall  f1-score   support

           0       0.97      0.86      0.91      2575
           1       0.93      0.93      0.93      4472
           2       0.94      0.89      0.92      3622
           3       0.85      0.94      0.90      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.91      0.91     14800
weighted avg       0.92      0.91      0.91     14800



In [24]:
#pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.2/124.9 MB 5.0 MB/s eta 0:00:26
   ---------------------------------------- 0.7/124.9 MB 7.5 MB/s eta 0:00:17
   ---------------------------------------- 1.3/124.9 MB 8.9 MB/s eta 0:00:14
    --------------------------------------- 1.7/124.9 MB 10.0 MB/s eta 0:00:13
    --------------------------------------- 2.1/124.9 MB 10.2 MB/s eta 0:00:13
    --------------------------------------- 2.6/124.9 MB 9.7 MB/s eta 0:00:13
    --------------------------------------- 2.9/124.9 MB 9.6 MB/s eta 0:00:13
    --------------------------------------- 3.0/124.9 MB 9.0 MB/s eta 0:00:14
   - -------------------------------------- 3.6/124.9 MB 9.0 MB/s eta 0:00:14
   - -------------------------------------- 3.8/124.9 MB 8.6 MB/s eta 0:00:15


[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
from xgboost import XGBClassifier

In [26]:

clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (XGBClassifier()))         
])

In [27]:
clf.fit(x_train, y_train)

In [28]:
y_pred = clf.predict(x_test)

In [29]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6462162162162162
              precision    recall  f1-score   support

           0       0.74      0.35      0.47      2575
           1       0.59      0.84      0.69      4472
           2       0.71      0.58      0.64      3622
           3       0.66      0.68      0.67      4131

    accuracy                           0.65     14800
   macro avg       0.67      0.61      0.62     14800
weighted avg       0.66      0.65      0.64     14800



In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:

clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (GradientBoostingClassifier()))         
])

In [32]:
clf.fit(x_train, y_train)

In [33]:
y_pred = clf.predict(x_test)

In [34]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5350675675675676
              precision    recall  f1-score   support

           0       0.66      0.16      0.26      2575
           1       0.47      0.82      0.60      4472
           2       0.62      0.44      0.52      3622
           3       0.59      0.54      0.56      4131

    accuracy                           0.54     14800
   macro avg       0.59      0.49      0.48     14800
weighted avg       0.57      0.54      0.51     14800



In [35]:
test_data = pd.read_csv('twitter_validation.csv', names=col)

In [36]:
test_txt = test_data['Text'][25]
print(f"{test_txt} ===> {test_data['Label'][25]}")

#gtc20 -  nice, motivational, and very accessible Nvidia/AI product fair + related tech talks
nvidia.com/en-us/gtc/keyn…
interesting interaction/social activities: braindates, dinner with strangers, ...  and free attendance for universities: reg.rainfocus.com/flow/nvidia/gt… ===> Neutral


In [37]:
# Apply preprocess

test_txt_processed = [preprocess(test_txt)]
test_txt_processed

['gtc20   nice motivational accessible Nvidia AI product fair + related tech talk \n nvidia.com/en-us/gtc/keyn \n interesting interaction social activity braindate dinner stranger   free attendance university reg.rainfocus.com/flow/nvidia/gt']

In [38]:
# Get Prediction

test_txt = clf.predict(test_txt_processed)
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {test_data['Label'][25]}")
print(f'Predict Label: {classes[test_txt[0]]}')

True Label: Neutral
Predict Label: Negative
