## Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\MY
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [2]:
df_train = pd.read_csv('../data/pre_process/twitter_training.csv')
df_test = pd.read_csv('../data/test/twitter_validation.csv')

## Explore The Data
**Train Data**

In [None]:
df_train.head()

**Test Data**

In [None]:
df_test.head()

In [3]:
df_test.columns = ['TweetId', 'Entity', 'Labels', 'Text']
df_train.columns = ['TweetId', 'Entity', 'Labels', 'Text']

In [None]:
df_train.head()

In [None]:
df_test.head()

In [9]:
df_test.shape

(999, 4)

In [10]:
df_train.shape

(74681, 4)

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   TweetId  74681 non-null  int64 
 1   Entity   74681 non-null  object
 2   Labels   74681 non-null  object
 3   Text     73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   TweetId  999 non-null    int64 
 1   Entity   999 non-null    object
 2   Labels   999 non-null    object
 3   Text     999 non-null    object
dtypes: int64(1), object(3)
memory usage: 31.3+ KB


In [13]:
df_train.duplicated().sum()

np.int64(2700)

In [14]:
df_test.duplicated().sum()

np.int64(0)

In [15]:
df_train.isnull().sum()

TweetId      0
Entity       0
Labels       0
Text       686
dtype: int64

In [16]:
df_test.isnull().sum()

TweetId    0
Entity     0
Labels     0
Text       0
dtype: int64

## Cleaning the Data

In [4]:
df_train.dropna(inplace=True)
df_train.isnull().sum()

TweetId    0
Entity     0
Labels     0
Text       0
dtype: int64

In [5]:
df_train.drop_duplicates(inplace=True)
df_train.duplicated().sum()

np.int64(0)

**Drop Unneeded Columns**

In [6]:
unneed = ['TweetId', 'Entity']
df_train.drop(columns=unneed, inplace=True)
df_test.drop(columns=unneed, inplace=True)

In [7]:
df_train.head()

Unnamed: 0,Labels,Text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


## Preprocessing the Data

In [8]:
def clean_tweet(tweet):    
    tweet = re.sub(r'(http\S+|www\S+|https\S+|\@\w+\'\w?|\#)', '', tweet)  # Also handle @username's (optional apostrophe)
    tweet = re.sub(r'[^A-Za-z\s]', ' ', tweet)  # Keep only letters and spaces
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

In [64]:
# def remove_stopwords(tweet):
#     new_tweet=[]
#     for word in tweet.split():
#         if word in stopwords.words('english'):
#             new_tweet.append('')
#         else:
#             new_tweet.append(word)
            
#     x=new_tweet[:]
#     new_tweet.clear()
#     return " ".join(x)

# text="I wasn't sure that this might happened"
# remove_stopwords(text)

'I  sure   might happened'

In [9]:
df_train['Text'] = df_train['Text'].str.lower()
df_test['Text'] = df_test['Text'].str.lower()

In [10]:
# Clean the training data
df_train['Text'] = df_train['Text'].apply(clean_tweet)

# Clean the test data
df_test['Text'] = df_test['Text'].apply(clean_tweet)

In [67]:
# # Clean the training data
# df_train['Text'] = df_train['Text'].apply(remove_stopwords)

# # Clean the test data
# df_test['Text'] = df_test['Text'].apply(remove_stopwords)

In [11]:
df_train.head()

Unnamed: 0,Labels,Text
0,Positive,i am coming to the borders and i will kill you...
1,Positive,im getting on borderlands and i will kill you all
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands and i will murder yo...
4,Positive,im getting into borderlands and i can murder y...


In [12]:
df_test.head()

Unnamed: 0,Labels,Text
0,Neutral,bbc news amazon boss jeff bezos rejects claims...
1,Negative,microsoft why do i pay for word when it functi...
2,Negative,csgo matchmaking is so full of closet hacking ...
3,Neutral,now the president is slapping americans in the...
4,Negative,hi eahelp i ve had madeleine mccann in my cell...


In [13]:
df_train = df_train.dropna(subset=['Text'])
df_test = df_test.dropna(subset=['Text'])

# Drop rows with just whitepace
df_train = df_train[df_train['Text'].str.strip() != '']
df_test = df_test[df_test['Text'].str.strip() != '']

In [14]:
df_train.duplicated().sum()

np.int64(5335)

In [15]:
df_train = df_train.drop_duplicates(subset=['Text'])

In [16]:
df_train.shape

(65748, 2)

## Split the Data

In [20]:
# Separate features and labels for training data
train_texts = df_train['Text'].values
train_labels = df_train['Labels'].values

# Separate features and labels for test data
test_texts = df_test['Text'].values
test_labels = df_test['Labels'].values

In [18]:
inputs = df_train['Text']
target = df_train['Labels']

----------

In [75]:
train_texts

array([' coming borders  kill', 'im getting borderlands kill',
       'im coming borderlands murder', ...,
       ' realized windows partition mac years behind nvidia drivers  idea notice',
       ' realized windows partition mac like years behind nvidia drivers cars  fucking idea  ever notice',
       ' like windows partition mac like years behind drivers  idea  notice'],
      dtype=object)

In [22]:
import math;

def train_naive_bayes(D, C):
    # Calculate P(c) terms
    Ndoc = len(D)
    logprior = {}
    loglikelihood = {}
    V = set()
    
    bigdoc = {c: [] for c in C}
    Nc = {c: 0 for c in C}

    # Count documents per class
    for doc, doc_class in D:
        Nc[doc_class] += 1
        bigdoc[doc_class].extend(doc)
        V.update(doc)
    
    # Calculate log P(c) for each class
    for c in C:
        logprior[c] = math.log(Nc[c] + 1 / Ndoc)
    
    # Calculate log P(w|c) with add-1 smoothing
    for c in C:
        word_count = {}
        total_words_in_class = len(bigdoc[c])
        for word in V:
            word_count[word] = bigdoc[c].count(word)
        loglikelihood[c] = {}
        for word in V:
            loglikelihood[c][word] = math.log((word_count[word] + 1) / (total_words_in_class + len(V)))

    return logprior, loglikelihood, V

def test_naive_bayes(testdoc, logprior, loglikelihood, C, V):
    sum_class = {}
    for c in C:
        sum_class[c] = logprior[c]
        for word in testdoc:
            if word in V:
                sum_class[c] += loglikelihood[c].get(word, 0)
    
    # Return the class with the highest sum
    return max(sum_class, key=sum_class.get)


In [23]:
C = ['Positive', 'Neutral', 'Negative', 'Irrelevant']

D = []

for text in train_texts:
    i = 0
    D.append((text.split(), train_labels[i]))
    i += 1

# D = [
#     (['i', 'am', 'coming', 'to', 'the', 'borders', 'and', 'i', 'will', 'kill', 'you', 'all'], 'Positive'),
#     (['im', 'getting', 'into', 'borderlands', 'and', 'i', 'can', 'murder', 'you', 'all'], 'Positive'),
#     (['for', 'the', 'biggest', 'male', 'dissappoinment', 'in', 'my', 'life', 'came', 'hanging', 'out', 'a', 'year', 'time', 'ago', 'fuck', 'borderlands'], 'Negative'),
#     (['amazon', 'prime', 'usa', 'day', 'is', 'still', 'on', 'delayed', 'but', 'now', 'delayed', 'rocketcitynow', 'com', 'article', 'news', 'h'], 'Negative'),
#     (['watch', 'neighbors', 'stunned', 'as', 'police', 'officer', 'picks', 'up', 'black', 'amazon', 'driver', 'for', 'wrong', 'parking', 'here', 'smartnews', 'link', 'a', 'qyjn', 'or', 'you', 'can', 'access', 'the', 'contents', 'directly', 'by', 'clicking', 'this', 'link', 'here', 'smartnews', 'link', 'w', 'rdea'], 'Neutral'),
#     (['d', 'be', 'be', 'sure', 'to', 'enter', 'the', 'awesome'], 'Neutral'),
#     (['high', 'water', 'fuk', 'yall', 'toxic', 'people'], 'Irrelevant'),
#     (['i', 'couldnt', 'hear', 'it', 'well', 'through', 'the', 'fancams', 'but', 'tys', 'gta', 'verses', 'are', 'sick'], 'Irrelevant')
# ]


In [24]:
# Train the model
logprior, loglikelihood, V = train_naive_bayes(D, C)

In [29]:
len(V)

29249

In [27]:
# Test document
testdoc = ['d', 'be', 'be', 'sure', 'to', 'enter', 'the', 'awesome']


# Predict the class
predicted_class = test_naive_bayes(testdoc, logprior, loglikelihood, C, V)

print("Predicted class:", predicted_class)

Predicted class: Positive


In [87]:
print(train_texts[655].split())


['borderlands', 'please', 'get', 'big', 'hot', 'fix', 'whole', 'factory', 'reset', 'system', 'wipe', 'whenever', 'play', 'borderlands', 'farm', 'crashes']


In [86]:
train_labels[655]

'Negative'

In [88]:
testdoc = ['borderlands', 'please', 'get', 'big', 'hot', 'fix', 'whole', 'factory', 'reset', 'system', 'wipe', 'whenever', 'play', 'borderlands', 'farm', 'crashes']


# Predict the class
predicted_class = test_naive_bayes(testdoc, logprior, loglikelihood, C, V)

print("Predicted class:", predicted_class)

Predicted class: Positive
