In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score

In [2]:
train_data = pd.read_csv("twitter_training.csv" , names=["Id", "About", "labels", "text"])
test_data = pd.read_csv("twitter_validation.csv" , names=["Id", "About", "labels", "text"])

In [3]:
train_data.drop('Id', inplace=True, axis=1)
train_data.drop('About', inplace=True, axis=1)
test_data.drop('Id', inplace=True, axis=1)
test_data.drop('About', inplace=True, axis=1)

train_data.drop(train_data[train_data['labels'] == 'Irrelevant'].index , inplace = True)
test_data.drop(test_data[test_data['labels'] == 'Irrelevant'].index , inplace = True)

In [4]:
train_data['labels'].replace(['Negative' , 'Neutral' , 'Positive'] , [-1 , 0 , 1] , inplace = True)
test_data['labels'].replace(['Negative' , 'Neutral' , 'Positive'] , [-1 , 0 , 1] , inplace = True)

In [5]:
train_data.reset_index()
test_data.reset_index()

Unnamed: 0,index,labels,text
0,1,0,BBC News - Amazon boss Jeff Bezos rejects clai...
1,2,-1,@Microsoft Why do I pay for WORD when it funct...
2,3,-1,"CSGO matchmaking is so full of closet hacking,..."
3,4,0,Now the President is slapping Americans in the...
4,5,-1,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...
823,993,-1,Please explain how this is possible! How can t...
824,994,1,Good on Sony. As much as I want to see the new...
825,997,1,Today sucked so it’s time to drink wine n play...
826,998,1,Bought a fraction of Microsoft today. Small wins.


In [6]:
train_X , test_X , train_Y , test_Y = [] , [] , [] , []  
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
          
def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F" 
                               u"\U0001F300-\U0001F5FF"  
                               u"\U0001F680-\U0001F6FF"  
                               u"\U0001F1E0-\U0001F1FF"  
                               u"\U00002500-\U00002BEF"  
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f" 
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
    
def pre_process(text):
    if isinstance(text , str) :
        text = remove_emojis(text)
        text = text.lower()
        text = re.sub(r'\d+' , "" , text)
        text = re.sub(r"http\S+|www\S+|https\S+" , " " , text , flags = re.MULTILINE)
        text = text.translate(str.maketrans("" , "" , string.punctuation))
        text = re.sub(r'\@\w+|\#\w+' , "" , text)
    
        words = word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words]
                                 
        stemmed_words = [ps.stem(word) for word in filtered_words]
                                 
        lemmatized_words = [lemmatizer.lemmatize(word , pos = 'a') for word in stemmed_words]
        
        return " ".join(lemmatized_words)
    
    
for ind in train_data.index :
    text = train_data['text'][ind]
    label = train_data['labels'][ind]
    if isinstance(text , str):
        temp = pre_process(text)
        if temp == "" or temp is None :
            continue
        train_X.append(temp)
        train_Y.append(label)
                    
for ind in test_data.index :
    text = test_data['text'][ind]
    label = test_data['labels'][ind]
    if isinstance(text , str):
        temp = pre_process(text)
        if temp == "" or temp is None :
            continue
        test_X.append(pre_process(text))
        test_Y.append(label)
    

In [7]:
Tfidf_vector = TfidfVectorizer(ngram_range = (2,2))
traindata = Tfidf_vector.fit_transform(train_X)
testdata = Tfidf_vector.transform(test_X)

In [8]:
multiNB = MultinomialNB()
multiNB.fit(traindata , train_Y)

In [9]:
def evaluation() :
    predictions = multiNB.predict(testdata)
    cm_matrix = confusion_matrix(test_Y , predictions)
    score = accuracy_score(test_Y , predictions)
    report = classification_report(test_Y , predictions)
    print("--------------->CONFUSION MATRIX<----------------")
    print(cm_matrix)
    print("--------------->ACCURACY SCORE<----------------")
    print(score)
    print("--------------->CLASSIFICATION REPORT<----------------")
    print(report)
    return

In [10]:
def test() :
   
    while True :
        txt = input(" ENTER YOUR TWEET , EXIT FOR EXITING ")
        txt = txt.lower()
        if txt == "exit" :
            break
        text = []
        text.append(pre_process(txt))
        test = Tfidf_vector.transform(text)
        prediction = multiNB.predict(test)
        if prediction == 1 :
            print("Tweet is POSITIVE")
        elif prediction == 0 :
            print("Tweet is NEUTRAL")
        else :
            print("Tweet is NEGATIVE")
            
    return

In [11]:
def main() :
    choice = -1
    while True :
        choice = int(input(" 1 : evaluation \n 2 : test  \n anything other : exit \n"))
        if choice == 1 :
            evaluation()
        elif choice == 2:
            test()
        else :
            break
            
    return

In [12]:
main()

 1 : evaluation 
 2 : test  
 anything other : exit 
 1


--------------->CONFUSION MATRIX<----------------
[[264   0   1]
 [  9 273   3]
 [ 14   1 262]]
--------------->ACCURACY SCORE<----------------
0.966142684401451
--------------->CLASSIFICATION REPORT<----------------
              precision    recall  f1-score   support

          -1       0.92      1.00      0.96       265
           0       1.00      0.96      0.98       285
           1       0.98      0.95      0.97       277

    accuracy                           0.97       827
   macro avg       0.97      0.97      0.97       827
weighted avg       0.97      0.97      0.97       827



 1 : evaluation 
 2 : test  
 anything other : exit 
 2
 ENTER YOUR TWEET , EXIT FOR EXITING  He is a bad influence 


Tweet is NEGATIVE


 ENTER YOUR TWEET , EXIT FOR EXITING  exit
 1 : evaluation 
 2 : test  
 anything other : exit 
 0
