In [1]:
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy
import pandas as pd

# Load the tweet dataset.
# NOTE(review): the column labels used below ('Positive' and a full tweet
# sentence) look like values from a data record, so the CSV presumably has no
# header row and pandas consumed its first sample as the header. Confirm, and
# if so load with header=None plus explicit column names.
df = pd.read_csv('./twitter_training.csv')

df.info()

# Load the Spacy NLP model for English
nlp = spacy.load('en_core_web_sm')

import re


def remove_punct_num(text):
    """Return `text` without punctuation and digits.

    Every character that is neither a word character nor whitespace is
    removed, and so is every run of digits.
    """
    return re.sub(r'[^\w\s]|[\d]+', '', text)


# Blank out missing tweets before the string cast: astype(str) on its own
# would turn a NaN into the literal word "nan", which would then be treated
# as real text by the vectoriser.
corpus = df['im getting on borderlands and i will murder you all ,'].fillna('').astype(str)

corpus_cleaned = corpus.apply(remove_punct_num)

sentiment = df['Positive']

# Hold out 10% of the tweets for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    corpus_cleaned, sentiment, test_size=0.1, random_state=42
)

# Texts and labels must stay aligned after the split. (The former bare
# expressions in the middle of this cell displayed nothing, so the length
# check is now an explicit assertion.)
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

def preprocess(text):
    """Lemmatise `text` with the module-level spaCy pipeline `nlp`.

    Stop-word and punctuation tokens are discarded; the lemmas of the
    remaining tokens are lower-cased and joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)

# Lemmatise and stop-word-filter both splits with the `preprocess` helper.
X_train = [preprocess(text) for text in X_train]
X_test = [preprocess(text) for text in X_test]

# The TF-IDF vocabulary and weights are learned from the training split only
# and then re-applied to the held-out split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# `multi_class` is deprecated since scikit-learn 1.5, and with the lbfgs
# solver a multiclass target is already fitted as a single multinomial model,
# so the argument is omitted. max_iter is raised above the default of 100
# because lbfgs often stops before converging on sparse, high-dimensional
# text features.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

# Weighted averaging scores each class separately and weights it by its
# support in y_test.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

2023-02-19 14:51:55.159990: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# With TensorFlow (Keras Conv1D + LSTM classifier)

In [67]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import spacy
import re


# Reload the tweet dataset so this cell does not depend on earlier ones.
# NOTE(review): the column labels used in this notebook look like values from
# a data record, so the CSV presumably has no header row and its first sample
# is being consumed as the header. Confirm before relying on these names.
df = pd.read_csv('./twitter_training.csv')


def remove_punct_num(text):
    """Return `text` without punctuation and digits.

    Every character that is neither a word character nor whitespace is
    removed, and so is every run of digits.
    """
    return re.sub(r'[^\w\s]|[\d]+', '', text)


# Blank out missing tweets before the string cast: astype(str) on its own
# would turn a NaN into the literal word "nan".
corpus = df['im getting on borderlands and i will murder you all ,'].fillna('').astype(str)
corpus_cleaned = corpus.apply(remove_punct_num)

# English spaCy pipeline used by `preprocess` for lemmas and stop-word flags.
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    """Return the lower-cased lemmas of `text` as one space-separated string.

    The text is parsed with the module-level spaCy pipeline `nlp`; tokens
    flagged as stop words or punctuation are left out.
    """
    lemmas = (
        tok.lemma_.lower()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    )
    return " ".join(lemmas)

# Run every cleaned tweet through the spaCy-based `preprocess` helper.
corpus_cleaned = list(map(preprocess, corpus_cleaned))

# Build the word index from the whole corpus; num_words=5000 restricts the
# encoded vocabulary to the most frequent words.
# NOTE(review): the vocabulary is fitted before the train/test split, so it
# also sees the held-out tweets.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(corpus_cleaned)

# Encode each tweet as a list of integer word ids, then pad or truncate every
# list to exactly 100 ids.
sequences = tokenizer.texts_to_sequences(corpus_cleaned)
X = pad_sequences(sequences, maxlen=100)






In [75]:
from sklearn.preprocessing import LabelEncoder

# Map the sentiment labels to consecutive integer ids and overwrite the column
# with the encoded values; `le` keeps the fitted mapping.
# NOTE(review): because the column is overwritten, re-running this cell refits
# the encoder on the already-encoded values.
le = LabelEncoder().fit(df['Positive'])
df['Positive'] = le.transform(df['Positive'])

In [76]:
# Target vector: the sentiment column of `df`.
y = df.loc[:, 'Positive']

In [92]:
# Hold out 10% of the padded sequences for evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# The token-id matrices are handed to the network exactly as produced by
# pad_sequences. The model's first layer is an Embedding, which looks ids up
# in a table and takes a 2-D (samples, sequence_length) integer input, so the
# former trailing singleton axis and float32 cast were unnecessary and have
# been removed.
assert X_train.ndim == 2 and X_test.ndim == 2
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

In [93]:
# Fold label id 3 into label id 2 in both target vectors (in place), leaving
# only ids 0, 1 and 2.
# NOTE(review): this merges two encoded sentiment classes into one; if four
# distinct classes are intended, the network's output layer should have four
# units instead. Confirm which classes ids 2 and 3 stand for.
for labels in (y_train, y_test):
    labels.loc[labels == 3] = 2

In [94]:

# Architecture: token embedding -> 1-D convolution -> max-pooling -> LSTM ->
# 3-way softmax over the label ids.
keras_layers = tf.keras.layers

model = tf.keras.models.Sequential()
# 5000-entry vocabulary, 64-dimensional vectors, sequences of 100 token ids.
model.add(keras_layers.Embedding(5000, 64, input_length=100))
model.add(keras_layers.Conv1D(64, 5, activation='relu'))
model.add(keras_layers.MaxPooling1D(pool_size=4))
model.add(keras_layers.LSTM(64))
model.add(keras_layers.Dense(3, activation='softmax'))

# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)




In [95]:
import numpy as np
# Sanity check: show which distinct label ids occur in the training targets.
train_label_ids = np.unique(y_train)
print(train_label_ids)

[0 1 2]


In [96]:
# Fit the network on the training split: 10 passes over the data in
# mini-batches of 32 samples.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc9f03e0df0>

In [98]:
# Score the held-out sequences with the trained network.
y_pred = model.predict(x=X_test)



In [101]:
# For each row of y_pred, take the index of its largest entry as the
# predicted label id.
category_predictions = y_pred.argmax(axis=1)

In [102]:
# Echo the predicted label ids as this cell's output.
category_predictions

array([0, 2, 1, ..., 1, 2, 0])

In [103]:

# Evaluate the predicted label ids against the held-out targets. Accuracy
# takes no averaging mode; the other three scores use support-weighted
# averaging across classes.
print("Accuracy:", accuracy_score(y_test, category_predictions))

weighted_metrics = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for label, metric in weighted_metrics:
    print(label, metric(y_test, category_predictions, average='weighted'))


Accuracy: 0.8570089704110323
Precision: 0.8571459325082399
Recall: 0.8570089704110323
F1 Score: 0.8561287280150716


# This is how an untuned model significantly increases our accuracy on unseen data!