In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


import random
import re                                  
import string  

import nltk

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
simplified_emotions = pd.read_csv(r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\simplified_emotions.csv")

In [3]:
simplified_emotions

Unnamed: 0,sentence,emotion
0,What?,other
1,Hey!,happiness
2,Where?!,other
3,"No, I know!",other
4,Well! Well! Well! Joey Tribbiani! So you came ...,other
...,...,...
96725,I am glad that you have been happy with my per...,other
96726,That sounds fair .,other
96727,"Lindsay , of course , I will be more than happ...",other
96728,It's hard for us to believe it . Our instrumen...,other


In [4]:
happiness_df = simplified_emotions[simplified_emotions['emotion'] == 'happiness']
other_df = simplified_emotions[simplified_emotions['emotion'] != 'happiness']

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\domin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\domin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def sentence_processor(sentence):
    sentence = re.sub(r'http\S+', '', sentence)

    sentence = sentence.replace('#', '')

    tokenizer = word_tokenize

    processed_sentence = tokenizer(sentence)

    stopwords_english = set(stopwords.words('english'))
    processed_sentence = [word for word in processed_sentence if word.lower() not in stopwords_english]

    processed_sentence = [word for word in processed_sentence if word.lower() not in string.punctuation]

    stemmer = PorterStemmer()
    processed_sentence = [stemmer.stem(word) for word in processed_sentence]

    return processed_sentence

In [7]:
sentence_processor("Ok I get that, like I said, different morals")

['ok', 'get', 'like', 'said', 'differ', 'moral']

In [8]:
def sentence_processor_df(df):
    processed_sentences_list = []
    for sentence in df['sentence']:
        processed_sentence = sentence_processor(sentence)
        processed_sentences_list.append(processed_sentence)
    return processed_sentences_list

In [9]:
from sklearn.model_selection import train_test_split

happy_sentences_tr, happy_sentences_te = train_test_split(happiness_df, test_size=0.2, random_state=42)

happy_sentences_tr = sentence_processor_df(happy_sentences_tr)
happy_sentences_te = sentence_processor_df(happy_sentences_te)

other_sentences_tr, other_sentences_te = train_test_split(other_df, test_size=0.2, random_state=42)

other_sentences_tr = sentence_processor_df(other_sentences_tr)
other_sentences_te = sentence_processor_df(other_sentences_te)

In [10]:
vocabulary_all = other_sentences_tr + happy_sentences_tr

def getUniqueTokens(vocabulary_all):
    unique_vocabulary = []
    for tweet_tokens in vocabulary_all:
        for token in tweet_tokens:
            if token not in unique_vocabulary:
                unique_vocabulary.append(token)
    return unique_vocabulary

vocabulary = getUniqueTokens(vocabulary_all)
vocabulary.sort()

print(len(vocabulary))
print(vocabulary[:50])

17689
["''", "'1000", "'14", "'76", "'albino", "'all", "'alon", "'aloof", "'at", "'belli", "'between", "'big", "'blanket", "'bout", "'breast", "'brother", "'bug", "'build", "'bungl", "'buri", "'caus", "'cause-oh", "'cha", "'confid", "'cool", "'d", "'dear", "'despair", "'did", "'do", "'doe", "'down", "'eddi", "'em", "'emot", "'etern", "'excus", "'favourit", "'flaign", "'for", "'friend", "'galaxy-brain", "'gershwin", "'gift", "'good", "'good-by", "'ha", "'hate", "'have", "'heav"]


In [11]:
freqs = {}

for token in vocabulary:
    count_positive = sum(1 for tweet_tokens in happy_sentences_tr + happy_sentences_te if token in tweet_tokens)
    count_negative = sum(1 for tweet_tokens in other_sentences_tr + other_sentences_te if token in tweet_tokens)
    freqs[token] = [count_positive, count_negative]

In [12]:
df = pd.DataFrame.from_dict(freqs, orient='index', columns=['count(w_i, +)', 'count(w_i, -)'])
df.head(10)

Unnamed: 0,"count(w_i, +)","count(w_i, -)"
'',273,1283
'1000,1,1
'14,1,1
'76,1,1
'albino,0,2
'all,0,2
'alon,0,2
'aloof,0,2
'at,0,2
'belli,0,2


In [13]:
total_positive = df['count(w_i, +)'].sum()
total_negative = df['count(w_i, -)'].sum()

df['P(w_i|+)'] = df['count(w_i, +)'] / total_positive
df['P(w_i|-)'] = df['count(w_i, -)'] / total_negative

print(df.head())

         count(w_i, +)  count(w_i, -)  P(w_i|+)  P(w_i|-)
''                 273           1283  0.001861  0.002865
'1000                1              1  0.000007  0.000002
'14                  1              1  0.000007  0.000002
'76                  1              1  0.000007  0.000002
'albino              0              2  0.000000  0.000004


In [14]:
smooth_factor = len(vocabulary)

df['P(w_i|+) smooth'] = (df['count(w_i, +)'] + 1) / (total_positive + smooth_factor)
df['P(w_i|-) smooth'] = (df['count(w_i, -)'] + 1) / (total_negative + smooth_factor)

print(df.head())

         count(w_i, +)  count(w_i, -)  P(w_i|+)  P(w_i|-)  P(w_i|+) smooth  \
''                 273           1283  0.001861  0.002865         0.001667   
'1000                1              1  0.000007  0.000002         0.000012   
'14                  1              1  0.000007  0.000002         0.000012   
'76                  1              1  0.000007  0.000002         0.000012   
'albino              0              2  0.000000  0.000004         0.000006   

         P(w_i|-) smooth  
''              0.002758  
'1000           0.000004  
'14             0.000004  
'76             0.000004  
'albino         0.000006  


In [15]:
# YOUR CODE HERE #
Ndoc = len(vocabulary)


# df["P(+)"] = (df['count(w_i, +)'] + 1) / Ndoc 
# df["P(-)"] = (df['count(w_i, -)'] + 1) / Ndoc 
# print(df.head())

p_pos = len(happy_sentences_tr ) / Ndoc
p_neg = len(other_sentences_tr ) / Ndoc

print(p_pos,p_neg)

1.1079201763808015 3.266719430154333


In [16]:
tw = other_sentences_te[3]
print(tw)

['realli', 'worri', 'nisa', 'translat', 'point', "n't", 'care', 'long', 'get', 'english', 'version']


In [17]:

prob_pos = 1
prob_neg = 1

for token in tw:

    if token in df.index:
      
        prob_pos *= df.loc[token, 'P(w_i|+) smooth']
       
        prob_neg *= df.loc[token, 'P(w_i|-) smooth']

In [18]:
if prob_pos > prob_neg:
    print('Class positive')
else:
    print('Class negative')

Class negative


In [19]:
df['log(P(w_i|+) smooth)'] = np.log(df['P(w_i|+) smooth'])
df['log(P(w_i|-) smooth)'] = np.log(df['P(w_i|-) smooth'])

print(df.head())

         count(w_i, +)  count(w_i, -)  P(w_i|+)  P(w_i|-)  P(w_i|+) smooth  \
''                 273           1283  0.001861  0.002865         0.001667   
'1000                1              1  0.000007  0.000002         0.000012   
'14                  1              1  0.000007  0.000002         0.000012   
'76                  1              1  0.000007  0.000002         0.000012   
'albino              0              2  0.000000  0.000004         0.000006   

         P(w_i|-) smooth  log(P(w_i|+) smooth)  log(P(w_i|-) smooth)  
''              0.002758             -6.396674             -5.893123  
'1000           0.000004            -11.316655            -12.357712  
'14             0.000004            -11.316655            -12.357712  
'76             0.000004            -11.316655            -12.357712  
'albino         0.000006            -12.009802            -11.952246  


In [20]:
y_test = []
y_preds = []

# YOUR CODE HERE #

for sentence in happy_sentences_te + other_sentences_te:
    prob_pos = 1
    prob_neg = 1
    
    for token in sentence:
        if token in df.index:
            prob_pos *= df.loc[token, 'P(w_i|+)']
            prob_neg *= df.loc[token, 'P(w_i|-)']
        # else:
        #     prob_pos *= 1 / (total_positive + smooth_factor)
        #     prob_neg *= 1 / (total_negative + smooth_factor)
        
    if prob_pos > prob_neg:
        y_preds.append(1)  
    else:
        y_preds.append(0)  

# Assign true labels for the test set
y_test = [1] * len(happy_sentences_te) + [0] * len(other_sentences_te)
    
y_preds = np.array(y_preds)
y_test = np.array(y_test)

In [21]:
sum(y_preds == y_test)/len(y_test)

0.6801571303044399

In [22]:
kaggle_test = pd.read_csv(r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\test.csv", sep = "\t")

In [23]:
def sentence_processor_df(df):
    processed_sentences_list = []
    for sentence in df['sentence']:
        processed_sentence = sentence_processor(sentence)
        processed_sentences_list.append(processed_sentence)
    return processed_sentences_list

In [24]:
kaggle_test =  sentence_processor_df(kaggle_test)

In [25]:
y_test = []
y_preds = []

# YOUR CODE HERE #

for sentence in kaggle_test:
    prob_pos = 1
    prob_neg = 1
    
    for token in sentence:
        if token in df.index:
            prob_pos *= df.loc[token, 'P(w_i|+)']
            prob_neg *= df.loc[token, 'P(w_i|-)']
        # else:
        #     prob_pos *= 1 / (total_positive + smooth_factor)
        #     prob_neg *= 1 / (total_negative + smooth_factor)
        
    if prob_pos > prob_neg:
        y_preds.append(1)  
    else:
        y_preds.append(0)  

    
y_preds = np.array(y_preds)


In [26]:
import csv

submission_data = [(i, 'other' if pred == 0 else 'happiness') for i, pred in enumerate(y_preds)]

with open(r'C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\submission.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'emotion']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for row in submission_data:
        writer.writerow({'id': row[0], 'emotion': row[1]})