In [10]:
import pandas as pd
import numpy as np

In [14]:
import re
from collections import defaultdict
import math

In [12]:
# Load the labelled training tweets and preview the first rows.
df = pd.read_csv('./train.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [17]:
def preprocess(text):
    """Turn raw text into a list of lowercase tokens.

    The text is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is deleted, and
    the remainder is split on whitespace. Because punctuation is deleted
    rather than replaced by a space, contractions and URLs collapse into a
    single token (e.g. "I`d" -> "id").

    Args:
        text: The value to tokenise. Anything that is not a ``str``
            (for example ``None`` or a float NaN) yields an empty list.

    Returns:
        list[str]: The tokens in their original order.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Delete punctuation/symbols; underscores survive because \w matches them.
        cleaned = re.sub(r'[^\w\s]', '', lowered)
        return cleaned.split()

    # Non-string input has nothing to tokenise.
    return []


In [18]:
# Tokenise every tweet; non-string entries become an empty token list.
df['words'] = df['text'].apply(preprocess)

In [19]:
# Preview the frame to confirm the new `words` column holds token lists.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,words
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,"[id, have, responded, if, i, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,"[sooo, sad, i, will, miss, you, here, in, san,..."
2,088c60f138,my boss is bullying me...,bullying me,negative,"[my, boss, is, bullying, me]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,"[what, interview, leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,"[sons, of, why, couldnt, they, put, them, on, ..."


In [28]:
class NaiveBayesClassifier:
    """Multinomial Naive Bayes over tokenised documents with add-one smoothing.

    A document is an iterable of hashable tokens (e.g. a list of words) and a
    label is any hashable class identifier.

    Attributes set by ``fit``:
        classes: set of the distinct labels seen during training.
        class_count: label -> number of training documents with that label.
        word_count: label -> token -> occurrences of the token in that label.
        total_words: label -> total number of tokens seen for that label.
        vocab: set of every distinct token seen during training.
    """

    def fit(self, X, y):
        """Collect per-class document and token counts.

        Args:
            X: Iterable of tokenised documents (each an iterable of tokens).
            y: Iterable of labels, paired positionally with ``X``.

        Both iterables are traversed exactly once, so generators and other
        one-shot iterators are supported. Pairing follows ``zip``: items
        beyond the shorter of the two iterables are ignored.
        """
        self.class_count = defaultdict(int)
        self.word_count = defaultdict(lambda: defaultdict(int))
        self.total_words = defaultdict(int)
        self.vocab = set()

        for words, label in zip(X, y):
            self.class_count[label] += 1

            for word in words:
                self.word_count[label][word] += 1
                self.total_words[label] += 1
                # Build the vocabulary in the same pass instead of re-reading X.
                self.vocab.add(word)

        # Derived from the counts so that y never has to be iterated twice.
        self.classes = set(self.class_count)

    def predict(self, X):
        """Return the most probable label for every document in ``X``.

        Each class is scored as ``log P(class) + sum(log P(token | class))``
        with ``P(token | class) = (count + 1) / (class_tokens + |vocab|)``.
        Tokens never seen in training therefore contribute the smoothed
        minimum probability of each class rather than being skipped.

        Args:
            X: Iterable of tokenised documents.

        Returns:
            list: One predicted label per document, in input order. On an
            exact tie the class that appeared first in the training data wins.

        Raises:
            ValueError: If the model was fitted on zero training examples.
        """
        n_docs = sum(self.class_count.values())
        if n_docs == 0:
            raise ValueError(
                "NaiveBayesClassifier has no training data; "
                "call fit() with at least one example before predict()"
            )

        vocab_size = len(self.vocab)

        # Per-class constants are hoisted out of the document/token loops.
        # Iterating class_count (insertion-ordered) instead of the `classes`
        # set keeps tie-breaking reproducible across interpreter runs.
        # `.get` is used throughout so that prediction never inserts keys
        # into the fitted defaultdicts.
        class_params = [
            (
                label,
                math.log(doc_count / n_docs),
                self.word_count.get(label, {}),
                self.total_words.get(label, 0) + vocab_size,
            )
            for label, doc_count in self.class_count.items()
        ]

        preds = []

        for words in X:
            # Materialise once: the tokens are re-read for every class.
            tokens = tuple(words)
            scores = {}

            for label, log_prior, counts, denom in class_params:
                log_prob = log_prior
                for word in tokens:
                    log_prob += math.log((counts.get(word, 0) + 1) / denom)
                scores[label] = log_prob

            preds.append(max(scores, key=scores.get))

        return preds
                    

In [21]:
# Features are the token lists; targets are the sentiment labels.
X, y = df['words'], df['sentiment']


In [29]:
# Train the classifier on every labelled row of the training frame.
nb = NaiveBayesClassifier()
nb.fit(X, y)

In [30]:
# NOTE: this scores the model on the same rows it was fitted on, so the
# figure is training accuracy and will overstate performance on new data.
predictions = nb.predict(X)
n_correct = sum(p == t for p, t in zip(predictions, y))
accuracy = n_correct / len(y)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.84


In [31]:
# Load the held-out test tweets and preview the first rows.
df_test = pd.read_csv('./test.csv')
df_test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [32]:
# Tokenise the test tweets with the same preprocess() used for training,
# so both sets share one token scheme.
df_test['words'] = df_test['text'].apply(preprocess)
df_test

Unnamed: 0,textID,text,sentiment,words
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,"[last, session, of, the, day, httptwitpiccom67..."
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,"[shanghai, is, also, really, exciting, precise..."
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,"[recession, hit, veronique, branquinho, she, h..."
3,01082688c6,happy bday!,positive,"[happy, bday]"
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,"[httptwitpiccom4w75p, i, like, it]"
...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,"[its, at, 3, am, im, very, tired, but, i, cant..."
3530,416863ce47,All alone in this old house again. Thanks for...,positive,"[all, alone, in, this, old, house, again, than..."
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,"[i, know, what, you, mean, my, little, dog, is..."
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,"[_sutra, what, is, your, next, youtube, video,..."


In [33]:
# Predict a sentiment label for every test tweet.
# NOTE(review): the bare expression below echoes the entire prediction list;
# consider slicing it and comparing against df_test['sentiment'] to get a
# held-out accuracy figure.
predictions_X_test = nb.predict(df_test['words'])
predictions_X_test

['positive',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'positive',
 'negative',
 'neutral',
 'neutral',
 'positive',
 'positive',
 'negative',
 'negative',
 'neutral',
 'positive',
 'negative',
 'negative',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'negative',
 'positive',
 'positive',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'negative',
 'negative',
 'neutral',
 'negative',
 'positive',
 'negative',
 'negative',
 'neutral',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'n