In [1]:
import re
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled training data; later cells read its 'sentence' and
# 'emotion' columns.
# NOTE: this is an absolute, machine-specific path and must be adjusted when
# the notebook is run anywhere else.
simplified_emotions = pd.read_csv(
    r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\simplified_emotions.csv"
)

In [3]:
# Baseline experiment: bag-of-words counts of the raw sentences (no text
# preprocessing) fed into a logistic regression.
y = simplified_emotions['emotion']

# The vocabulary is fitted on every sentence before the split, so it also
# contains words that only occur in the held-out rows.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(simplified_emotions['sentence'])

# 80/20 hold-out split with a fixed seed so the printed scores are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report accuracy plus macro-averaged F1 (every class weighted equally).
accuracy = accuracy_score(y_test, y_pred)
test_f1_score = f1_score(y_test, y_pred, average='macro')
print("Accuracy:", accuracy)
print("Test f1 score:", test_f1_score)

Accuracy: 0.693838519590613
Test f1 score: 0.4844570930963803


In [4]:


def sentence_processor(sentence):
    """Clean, tokenize, filter and stem a single sentence.

    Steps, in order:
      1. remove every run of non-whitespace characters starting at "http";
      2. delete '#' characters;
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop tokens whose lowercase form is an English stop word;
      5. drop tokens whose lowercase form occurs inside ``string.punctuation``;
      6. stem the remaining tokens with the Porter stemmer.

    Note that step 5 is a substring test against the punctuation string, so
    a multi-character token such as "..." (which is not a contiguous run of
    that string) is kept.

    Args:
        sentence: The raw sentence as a ``str``.

    Returns:
        list[str]: The stemmed tokens that survived filtering.
    """
    without_urls = re.sub(r'http\S+', '', sentence)
    tokens = word_tokenize(without_urls.replace('#', ''))

    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        lowered = token.lower()
        if lowered in english_stopwords:
            continue
        if lowered in string.punctuation:
            continue
        kept_tokens.append(token)

    porter = PorterStemmer()
    return [porter.stem(token) for token in kept_tokens]

def sentence_processor_df(df):
    """Run ``sentence_processor`` over every entry of ``df['sentence']``.

    Args:
        df: Object indexable by ``'sentence'`` (a DataFrame in this
            notebook) whose ``'sentence'`` entries are raw sentences.

    Returns:
        list: One processed token list per sentence, in iteration order.
    """
    return [sentence_processor(raw_sentence) for raw_sentence in df['sentence']]


# Second experiment: same model as the baseline, but trained on the
# preprocessed (filtered + stemmed) sentences.
processed_sentences = sentence_processor_df(simplified_emotions)

# CountVectorizer expects strings, so glue each token list back together.
processed_sentences_str = list(map(" ".join, processed_sentences))

y = simplified_emotions['emotion']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_sentences_str)

# Same split parameters as the baseline experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report accuracy plus macro-averaged F1 (every class weighted equally).
accuracy = accuracy_score(y_test, y_pred)
test_f1_score = f1_score(y_test, y_pred, average='macro')
print("Accuracy:", accuracy)
print("Test f1 score:", test_f1_score)


Accuracy: 0.7056239015817223
Test f1 score: 0.4884640867639952


In [5]:

# Third experiment: bag-of-words counts of the preprocessed sentences plus
# three VADER sentiment scores per sentence.
processed_sentences = sentence_processor_df(simplified_emotions)
processed_sentences_str = [" ".join(sentence) for sentence in processed_sentences]

vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(processed_sentences_str)
y = simplified_emotions['emotion']

# VADER is scored on the *processed* text. Any code that later feeds new
# sentences to this model must build the sentiment columns the same way.
sia = SentimentIntensityAnalyzer()
sentiment_features = []
for sentence_text in processed_sentences_str:
    scores = sia.polarity_scores(sentence_text)
    sentiment_features.append([scores['pos'], scores['neg'], scores['compound']])

X_sentiment = np.array(sentiment_features)

# Append the three sentiment columns after the count columns while keeping
# the matrix sparse. Calling X_text.toarray() would allocate a dense
# (n_sentences x vocabulary_size) array only to add three columns.
X_combined = sparse.hstack(
    [X_text, sparse.csr_matrix(X_sentiment)],
    format='csr',  # row-sliceable, which train_test_split needs
)

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Report accuracy plus macro-averaged F1 (every class weighted equally).
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
test_f1_score = f1_score(y_test, y_pred, average='macro')
print("Test f1 score:", test_f1_score)


Accuracy: 0.7044867156001241
Test f1 score: 0.4959836372551889


In [6]:
# Load the unlabelled Kaggle test set (tab-separated).
# NOTE: this is an absolute, machine-specific path and must be adjusted when
# the notebook is run anywhere else.
kaggle_test = pd.read_csv(
    r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\test.csv",
    sep="\t",
)

In [7]:
# Display the loaded test frame to check its columns before building features.
kaggle_test

Unnamed: 0,id,sentence
0,0,I get sad when I see a sick animal.
1,1,Getting punched in the face like that makes my...
2,2,I get anxious near the end of the block.
3,3,being awoken to a shadowy figure at the edge o...
4,4,I am annoyed that my brother didn't wash his d...
...,...,...
1431,1431,We are so glad you found a new job.
1432,1432,Lisa's heart swelled with joy as her dog eager...
1433,1433,The view is so amazing that I don't know what ...
1434,1434,The scent of fresh flowers always brings her joy.


In [8]:

# Re-read the Kaggle test set (same tab-separated file that an earlier cell
# already loaded) so this cell starts from a fresh copy.
# NOTE: this is an absolute, machine-specific path and must be adjusted when
# the notebook is run anywhere else.
kaggle_test = pd.read_csv(
    r"C:\Users\domin\Desktop\Year 2 Block C\2023-24c-fai2-adsai-DominikSzewczyk224180\Datasets\test.csv",
    sep="\t",
)


def sentence_processor(sentence):
    """Turn one raw sentence into a list of stemmed, filtered tokens.

    This redefines the function of the same name from the earlier
    preprocessing cell with the same steps; the two copies must stay
    identical so training and test sentences are processed alike.

    Processing order: strip "http..." runs, delete '#', tokenize with
    ``word_tokenize``, drop English stop words, drop tokens whose lowercase
    form occurs inside ``string.punctuation`` (a substring test, so a token
    like "..." is kept), then Porter-stem what is left.

    Args:
        sentence: The raw sentence as a ``str``.

    Returns:
        list[str]: The stemmed tokens that survived filtering.
    """
    text = re.sub(r'http\S+', '', sentence)
    text = text.replace('#', '')
    tokens = word_tokenize(text)
    stop_set = set(stopwords.words('english'))

    def is_kept(token):
        # Keep a token only if it is neither a stop word nor punctuation.
        lowered = token.lower()
        return lowered not in stop_set and lowered not in string.punctuation

    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens if is_kept(token)]

# Sentiment analysis
def calculate_sentiment_features(sentences):
    """Compute VADER sentiment scores for each sentence.

    Args:
        sentences: Iterable of strings to score.

    Returns:
        numpy.ndarray: One row per sentence holding the analyzer's
        ``'pos'``, ``'neg'`` and ``'compound'`` scores, in that order.
    """
    analyzer = SentimentIntensityAnalyzer()
    score_keys = ('pos', 'neg', 'compound')
    rows = []
    for text in sentences:
        polarity = analyzer.polarity_scores(text)
        rows.append([polarity[key] for key in score_keys])
    return np.array(rows)

# Preprocessing Kaggle test sentences
processed_sentences = [sentence_processor(sentence) for sentence in kaggle_test['sentence']]
processed_sentences_str = [" ".join(sentence) for sentence in processed_sentences]

# Score the *processed* text, not the raw sentences. The model was trained
# on VADER scores of the processed (stop-word-stripped, stemmed) sentences,
# so scoring raw text here would feed it sentiment columns from a different
# distribution than it saw during training.
sentiment_features = calculate_sentiment_features(processed_sentences_str)

# Reuse the vectorizer fitted during training (transform only, no refit) so
# the count columns line up with the columns the model was trained on.
X_text = vectorizer.transform(processed_sentences_str)
X_sentiment = sentiment_features  # already an ndarray, no re-wrapping needed

# Same layout as in training: count columns first, then pos/neg/compound.
X_combined = np.concatenate((X_text.toarray(), X_sentiment), axis=1)

predictions = model.predict(X_combined)




In [9]:
# Pair each test id with its predicted emotion and write the submission file
# (no index column) into the current working directory.
output_df = pd.DataFrame(
    {
        'id': kaggle_test['id'],
        'emotion': predictions,
    }
)
output_df.to_csv("kaggle_test_predictions.csv", index=False)