In [None]:
import pandas as pd
import numpy as np
import string 

# The CSV has no header row, so the six column names are supplied at read
# time; only the label ("target") and the tweet body ("text") are kept.
raw_columns = ["target", "ids", "date", "flag", "user", "text"]
df = (
    pd.read_csv(
        'training.1600000.processed.noemoticon.csv',
        sep=',',
        encoding='latin-1',
        header=None,
        names=raw_columns,
    )
    .drop(columns=["ids", "date", "flag", "user"])
)
df.head()


### Zaawansowane przetwarzanie i czyszczenie danych (2 pkt)
- Wykorzystanie technik takich jak lematyzacja, stemming, uwzględnienie negacji.
- Obsługa emotikonów, skrótów, błędów ortograficznych.

In [None]:
import spacy
from nltk.tokenize.casual import EMOTICON_RE
import re

# spaCy pipeline used for tokenisation, POS tags and lemmas; the parser, NER
# and text-categoriser components are switched off because the code below
# only reads `lemma_`, `pos_` and `is_punct`.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

# Stop-word list, extended with apostrophe-free variants (e.g. "don't" -> "dont").
# na_filter=False keeps entries such as "null", "nan" or "NA" as plain strings;
# with the default NA parsing they would become float NaN and break `.replace`.
stop_words = pd.read_csv('stop_words.txt', na_filter=False)
stop_words = stop_words.loc[:, 'word'].tolist()
stopwords_simple = [word.replace("'", "") for word in stop_words]
stop_words = set(stop_words + stopwords_simple)

# Misspelling -> correction lookup. A multi-character separator is treated as
# a regular expression, which only the python engine supports; requesting it
# explicitly avoids the ParserWarning pandas emits when it falls back silently.
misspelled_words = pd.read_csv(
    'misspelled_words.txt', sep='->', engine='python', na_filter=False
)
misspelled_words = dict(zip(misspelled_words['miss'], misspelled_words['target']))

def replace_consecutive_chars(s):
    """Collapse runs of repeated punctuation in ``s``.

    * Three or more consecutive dots become exactly three dots.
    * A run of ``!`` / ``?`` characters (optionally separated by whitespace)
      is replaced by the first character of that run.

    Returns the normalised string.
    """
    def _first_char(match):
        # Keep only the character that opened the run of '!' / '?'.
        return match.group(0)[0]

    dots_collapsed = re.sub(r'([\.])\1{2,}', r'\1\1\1', s)
    return re.sub(r'[!?]+(?:\s*[!?]+)*', _first_char, dots_collapsed)

def preprocess_text(text):
    """Clean one tweet and derive token and part-of-speech features.

    Steps:
      1. Collapse repeated punctuation via ``replace_consecutive_chars``.
      2. Replace whitespace-separated words found (case-insensitively) in
         ``misspelled_words`` with their correction.
      3. Run the spaCy pipeline ``nlp`` on the corrected text.
      4. Keep lower-cased lemmas that are alphabetic ASCII, plus emoticons
         and the tokens ``!``, ``?`` and ``...``; then drop stop words and
         anything starting with ``@``, ``http``, ``https`` or ``www.``.

    Returns a dict with:
      * ``"tokens"``: list of kept tokens,
      * ``"freq_<pos>"``: count of each tracked POS tag divided by the number
        of non-punctuation tokens (0 when there are none),
      * ``"avg_word_length"``: mean character length of the non-punctuation
        tokens (0 when there are none).
    """
    text = replace_consecutive_chars(text)
    corrected = [misspelled_words.get(word.lower(), word) for word in text.split()]
    doc = nlp(' '.join(corrected))

    # Feature engineering.
    # The average is taken over the same non-punctuation tokens that are
    # counted in the denominator; summing over every token (punctuation
    # included) would inflate the value.
    word_tokens = [t for t in doc if not t.is_punct]
    word_count = len(word_tokens)
    avg_word_length = sum(len(t.text) for t in word_tokens) / word_count if word_count else 0
    pos_counts = {pos: 0 for pos in ["NOUN", "VERB", "ADJ", "ADV", "PRON", "CCONJ", "ADP", "PROPN", "PUNCT"]}
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1

    # Lemmatise and lower-case; keep alphabetic ASCII lemmas, emoticons and
    # the sentiment-bearing punctuation tokens.
    words = [token.lemma_.lower() for token in doc if
             (token.lemma_.isalpha() and token.lemma_.isascii() and not token.is_punct)
             or EMOTICON_RE.match(token.text) or token.text in ['!', '?', '...']]
    # Remove stop words, nicknames and links.
    words = [
        word for word in words
        if word not in stop_words
        and not word.startswith(('@', 'http', 'https', 'www.'))
    ]
    return {
        "tokens": words,
        **{f"freq_{pos.lower()}": pos_counts[pos] / word_count if word_count else 0 for pos in pos_counts},
        "avg_word_length": avg_word_length,
    }

In [None]:
import os.path
import swifter
# Preprocessing the whole corpus is slow, so the result is cached on disk and
# reused on later runs.
fname = 'tokenized.gzip'
if not os.path.isfile(fname):
    print("Preprocessing text")
    features = pd.DataFrame(
        df['text'].swifter.apply(lambda x: preprocess_text(x)).tolist(),
        index=df.index,
    )
    # Map the positive label 4 to 1 so that the target is 0/1.
    df.loc[df['target'] == 4, 'target'] = 1
    df = pd.concat([df, features], axis=1)
    # Save to file
    df.to_pickle(fname, compression='gzip')
else:
    print("Loading saved file")
    # NOTE(review): unpickling can execute arbitrary code - only load a cache
    # file that this notebook produced.
    df = pd.read_pickle(fname, compression='gzip')
df.head()

In [None]:
from wordcloud import WordCloud
from matplotlib.pyplot import subplots
def _tokens_for_label(label):
    """Return all tokens of the rows whose target equals ``label`` as one flat list."""
    token_lists = df.loc[df['target'] == label, 'tokens'].values.tolist()
    return [word for words in token_lists for word in words]


positive_words = _tokens_for_label(1)
negative_words = _tokens_for_label(0)

# Both clouds share the same rendering options.
wc_options = dict(max_words=100, width=1600, height=800, collocations=False)
pos_wc = WordCloud(**wc_options).generate(' '.join(positive_words))
neg_wc = WordCloud(**wc_options).generate(' '.join(negative_words))

# Positive cloud on top, negative cloud below.
fig, ax = subplots(nrows=2, ncols=1)
ax[0].imshow(pos_wc)
ax[1].imshow(neg_wc)

### Inżynieria cech i reprezentacja tekstu (1,5 pkt)
- Wykorzystanie embeddingu słów GloVe.
- Tworzenie dodatkowych cech: analiza części mowy, średnia długość słów (podczas przetwarzania danych).

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
)

Wektoryzacja 

In [None]:
import os

# Only the TensorFlow backend supports string inputs.
# This assignment has to stay above the `import keras` line further down in
# this cell: the backend is chosen when the package is first imported.
os.environ["KERAS_BACKEND"] = "tensorflow"

import pathlib
import numpy as np
import tensorflow.data as tf_data
import keras
from keras import layers
from keras.layers import Embedding

In [None]:
# Longest token list in the training set; every sequence is padded to this length.
max_length = train_df['tokens'].apply(len).max().item()
print(max_length)

# The tokens are already lower-cased and cleaned by `preprocess_text`.
# standardize=None disables the layer's default "lower_and_strip_punctuation"
# step, which would otherwise delete the emoticons and the '!', '?' and '...'
# tokens that preprocessing deliberately kept.
vectorizer = layers.TextVectorization(
    max_tokens=20000,
    output_sequence_length=max_length,
    standardize=None,
)
text_ds = tf_data.Dataset.from_tensor_slices(
    train_df['tokens'].apply(' '.join).tolist()
).batch(128)
vectorizer.adapt(text_ds)

# Vocabulary learned from the training tweets and its token -> index lookup.
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [None]:
# GloVe embedding
# Download https://nlp.stanford.edu/data/glove.twitter.27B.zip
# and extract glove.twitter.27B.100d.txt file
path_to_glove_file = "glove.twitter.27B.100d.txt"

# Maps each token to its embedding vector (float32 array).
embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # Split on the first ASCII space only. A bare str.split() splits on
        # any Unicode whitespace, so a token that is itself a whitespace
        # character would be swallowed and the first coefficient mistaken
        # for the word.
        word, separator, coefs = line.rstrip("\n").partition(" ")
        if not word or not separator:
            # Blank or malformed line: nothing to store.
            continue
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 100

# Prepare embedding matrix.
# Rows start as zeros, so vocabulary entries without a GloVe vector keep an
# all-zero embedding.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
missed = []
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        missed.append(token)
        continue
    embedding_matrix[row] = vector

misses = len(missed)
hits = len(word_index) - misses
print("Converted %d words (%d misses)" % (hits, misses))

### Implementacja zaawansowanego modelu (2 pkt)
- Zastosowanie modeli takich jak sieci neuronowe (LSTM, CNN) lub transformatorów (BERT).
- Uzasadnienie wyboru modelu i opis architektury.

### Uzasadnienie architektury CNN

1. **Warstwa Embedding**: 
   - Służy do zamiany sekwencji indeksów słów na wektory osadzenia (embedding vectors), co pozwala na reprezentację semantyczną słów.
   - Wykorzystuje wstępnie wytrenowaną macierz osadzeń (`embedding_matrix`).

2. **Warstwy Conv1D**:
   - Wykrywają lokalne wzorce w sekwencjach (np. n-gramy).
   - Aktywacja `ReLU` wprowadza nieliniowość, umożliwiając modelowi naukę złożonych reprezentacji.

3. **MaxPooling1D**:
   - Redukuje rozmiar danych, zachowując najistotniejsze cechy, zapobiegając nadmiernemu dopasowaniu.

4. **GlobalMaxPooling1D**:
   - Wydobywa najważniejszą cechę z całej sekwencji, co jest przydatne w klasyfikacji tekstów.

5. **Warstwa Dense**:
   - Przekształca reprezentacje w bardziej złożone cechy.
   - `ReLU` wprowadza nieliniowość.

6. **Dropout**:
   - Zapobiega nadmiernemu dopasowaniu (overfitting), poprawiając generalizację modelu.

7. **Wyjście**:
   - Warstwa `sigmoid` służy do klasyfikacji binarnej, generując wynik w przedziale [0, 1].

8. **Kompilacja**:
   - Funkcja straty: `binary_crossentropy` (klasyfikacja binarna).
   - Optymalizator: `RMSprop`, często stosowany w zadaniach NLP.


### Optymalizacja i tuning hiperparametrów (1 pkt)
- Przeprowadzenie tuningu hiperparametrów Grid Search.
- Uzasadnienie wyboru optymalnych parametrów.

In [None]:
def getX(df):
    """Build the stacked model input for ``df``.

    Channel 0 holds the token ids produced by ``vectorizer`` for each row's
    tokens. Channel 1 holds the extra feature columns (every column from
    position 3 onwards) written into the leading positions of a zero array of
    the same shape as the token ids.

    Returns an array of shape ``(rows, sequence_length, 2)``.
    """
    joined_texts = np.array([[" ".join(tokens)] for tokens in df["tokens"]])
    token_ids = vectorizer(joined_texts).numpy()

    extra_features = df.iloc[:, 3:]
    padded_features = np.zeros(token_ids.shape)
    padded_features[:, : extra_features.shape[1]] = extra_features.values

    return np.stack([token_ids, padded_features], axis=2)


# Stacked (token ids, extra features) inputs for both splits.
x_train, x_test = getX(train_df), getX(test_df)

# Matching binary labels.
y_train, y_test = (split["target"].to_numpy() for split in (train_df, test_df))

In [None]:
import os
import keras
from keras import layers
from keras.models import Model
from keras.layers import Embedding
from sklearn.model_selection import RandomizedSearchCV
from scikeras.wrappers import KerasClassifier
import numpy as np
import tensorflow as tf

# Custom function to merge inputs
def prepare_inputs(x_embedding, x_features):
    """Return ``[embedding_array, feature_array]``, each converted with ``np.array``."""
    return [np.array(part) for part in (x_embedding, x_features)]

from scikeras.wrappers import KerasClassifier
from keras.saving import register_keras_serializable

# Per-sample shape of the stacked input: (sequence length, channels).
input_shape = tuple(x_train.shape[axis] for axis in (1, 2))
# Number of engineered feature columns (everything from column position 3 on).
additional_features_size = len(df.columns[3:])

@register_keras_serializable()
def extract_vector_input(x):
    """Select channel 0 of the stacked input, i.e. the token-id sequence."""
    token_ids = x[:, :, 0]
    return token_ids
@register_keras_serializable()
def extract_feature_input(x):
    """Select the first ``additional_features_size`` positions of channel 1.

    Those positions carry the engineered features; the rest of channel 1 is
    zero padding.
    """
    engineered = x[:, :additional_features_size, 1]
    return engineered

# Per-feature mean and variance of the engineered training features.
# A `Normalization` layer that is neither adapted nor given statistics keeps
# mean 0 / variance 1 and therefore passes its input through unchanged, so
# the statistics are computed once here and handed to the layer explicitly.
# NOTE(review): they are taken over all of `x_train`, including the rows that
# serve as validation folds during cross-validation.
_train_feature_values = x_train[:, :additional_features_size, 1]
_FEATURE_MEAN = _train_feature_values.mean(axis=0)
_FEATURE_VARIANCE = _train_feature_values.var(axis=0)
del _train_feature_values


# Function that builds the model
def create_model(filters, kernel_size, dense_units, dropout_rate,features_dense_units):
    """Build and compile the CNN with an auxiliary engineered-feature branch.

    Args:
        filters: number of filters in each of the three Conv1D layers.
        kernel_size: kernel width of each Conv1D layer.
        dense_units: width of the dense layer after the two branches are merged.
        dropout_rate: dropout rate applied after that dense layer.
        features_dense_units: width of the dense layer in the feature branch.

    Returns:
        A compiled Keras ``Model`` taking the stacked ``input_shape`` tensor
        and producing one sigmoid output.
    """
    # Frozen embedding initialised from the prepared embedding matrix.
    embedding_layer = Embedding(
        num_tokens,
        embedding_dim,
        trainable=False,
    )
    embedding_layer.build((1,))
    embedding_layer.set_weights([embedding_matrix])

    # Combined input
    combi_input = layers.Input(input_shape)
    vectors = layers.Lambda(extract_vector_input, name="lambda_1")(combi_input)
    embedded_sequences = embedding_layer(vectors)
    x = layers.Conv1D(filters, kernel_size, activation="relu")(embedded_sequences)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(filters, kernel_size, activation="relu")(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(filters, kernel_size, activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)

    # Additional feature input, standardised with the training statistics.
    additional_features_input = layers.Lambda(extract_feature_input, name="lambda_2")(combi_input)
    normalization = layers.Normalization(
        mean=_FEATURE_MEAN, variance=_FEATURE_VARIANCE
    )(additional_features_input)
    y = layers.Dense(features_dense_units, activation="relu")(normalization)
    y = layers.Dense(1, activation="sigmoid")(y)

    # Combine embedding and additional features
    combined = layers.concatenate([x, y])
    z = layers.Dense(dense_units, activation="relu")(combined)
    z = layers.Dropout(dropout_rate)(z)
    preds = layers.Dense(1, activation="sigmoid")(z)

    model = Model(combi_input, preds)
    model.compile(
        loss="binary_crossentropy", optimizer='rmsprop', metrics=["acc"]
    )
    return model

In [None]:
# Load a previously saved model, or tune and train a new one.
model_path = "cnn_model_optimized.keras"
if os.path.isfile(model_path):
    model = keras.models.load_model(model_path)
else:
    # Wrap the model builder so that scikit-learn's random search can drive it.
    model = KerasClassifier(
        model=create_model,
        filters=64,
        kernel_size=3,
        dense_units=128,
        dropout_rate=0.5,
        verbose=0,
        batch_size=64,
        features_dense_units=16,
    )

    # Hyperparameter ranges to sample from.
    param_distributions = {
        "filters": [32, 64, 128],
        "kernel_size": [2,3,4],
        "dense_units": [32, 64, 128, 256],
        "dropout_rate": [0.3, 0.5, 0.7],
        "batch_size": [32, 64],
        "epochs": [3],
        "features_dense_units":[16,32,64]
    }

    # Run the random search.
    # refit=False: the best configuration is retrained explicitly below, so
    # letting the search refit on the full training set as well would train
    # the final model twice. `best_params_` and `best_score_` remain available
    # for single-metric scoring.
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_jobs=-1,
        n_iter=2,
        cv=3,
        verbose=3,
        random_state=42,
        refit=False,
    )

    # Fit the search on the training data.
    search_result = search.fit(x_train, y_train)

    best_params = search_result.best_params_
    print("Najlepsze parametry:", best_params)
    print("Najlepszy wynik:", search_result.best_score_)

    # Train the final model with the best configuration and persist it.
    model = KerasClassifier(
        model=create_model,
        **best_params,
        verbose=1,
    )

    model.fit(x_train, y_train)
    model.model_.save(model_path)  # `model_.save` for scikeras wrapper

# `model` is either a scikeras wrapper (exposes the Keras model as `model_`)
# or a plain Keras model loaded from disk.
if getattr(model, "model_", None) is not None:
    model.model_.summary()
else:
    model.summary()

### Porównanie z prostym modelem (0,5 pkt)
- Analiza różnic w wynikach między modelami.
- Wnioski dotyczące wpływu zaawansowanych technik.

Model CNN

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from sklearn.metrics import roc_auc_score
    
# `model` is a plain Keras model when it was loaded from disk and a scikeras
# KerasClassifier when it was trained in this session. The wrapper's
# `predict` returns hard class labels, which would make the ROC AUC below a
# label-based score, so the underlying Keras model is always used to obtain
# probabilities.
keras_model = getattr(model, "model_", model)
y_prob = np.asarray(keras_model.predict(x_test)).ravel()
roc_auc = roc_auc_score(y_test, y_prob)

# Threshold the probabilities at 0.5 to obtain class labels.
y_pred = (y_prob >= 0.5).astype(int)
print("Dokładność modelu:", accuracy_score(y_test, y_pred))
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred))
print(f"ROC_AUC {roc_auc}")

Prosty model regresji logistycznej

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    """Identity function: return ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both tokenizer and preprocessor so that
    the already tokenised documents are used as they are.
    """
    return doc

# The documents are already token lists, so tokenising and preprocessing are
# replaced by the identity function; unigrams and bigrams are used as features.
tfidf_options = dict(
    ngram_range=(1,2),
    max_features=10000,
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)
tfidf = TfidfVectorizer(**tfidf_options)

Y_train_linear, Y_test_linear = train_df['target'], test_df['target']
# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_linear = tfidf.fit_transform(train_df['tokens'])
X_test_linear = tfidf.transform(test_df['tokens'])

clf = LogisticRegression(random_state=0, max_iter = 10000, n_jobs=-1)
clf.fit(X_train_linear, Y_train_linear)
# Training accuracy, shown as the cell output.
clf.score(X_train_linear, Y_train_linear)

In [None]:
# Hard labels for accuracy and the report; positive-class probabilities for ROC AUC.
y_pred = clf.predict(X_test_linear)
positive_proba = clf.predict_proba(X_test_linear)[:, 1]
roc_auc = roc_auc_score(Y_test_linear, positive_proba)

test_accuracy = accuracy_score(Y_test_linear, y_pred)
test_report = classification_report(Y_test_linear, y_pred)
print("Dokładność modelu:", test_accuracy)
print("Raport klasyfikacji:\n", test_report)
print(f"ROC_AUC {roc_auc}")