In [12]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import PCA
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Define text cleaning function
def clean(text):
    """Normalize a raw phrase into lowercase word tokens separated by single spaces.

    Processing order: drop URLs, decode a few HTML entities, expand the chat
    abbreviation "u" to "you", replace every non-word character with a space,
    collapse whitespace, then lowercase and strip.

    Args:
        text: The raw phrase. Non-string values are coerced with ``str()``,
            except missing values (``None`` / float NaN), which yield ``""``.

    Returns:
        The cleaned phrase as a ``str``.
    """
    # Missing values must not become the literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r'https?://\S+|www\.\S+', '', str(text))  # Remove URLs
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"&quot;", "\"", text)
    text = re.sub('&#39;', "'", text)  # &#39; is the apostrophe entity, not a double quote
    text = re.sub('\n', " ", text)
    # Expand a stand-alone "u"/"U". Lookarounds (instead of literal spaces) also
    # catch it at the start/end of the phrase and in runs such as "u u".
    text = re.sub(r'(?<!\S)u(?!\S)', 'you', text, flags=re.IGNORECASE)
    text = re.sub('`', "", text)
    text = re.sub(r"(!)\1+", r"!", text)
    text = re.sub(r"(\?)\1+", r"?", text)
    text = re.sub('&amp;', 'and', text)
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Substitute multiple spaces with a single space
    return text.lower().strip()  # Convert to lowercase and strip whitespace

# Load and clean data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')
# Fill missing phrases with '' before the cast: astype(str) would otherwise turn
# NaN into the literal string 'nan', which survives cleaning as a real token.
train_data['Phrase'] = train_data['Phrase'].fillna('').astype(str).apply(clean)
val_data['Phrase'] = val_data['Phrase'].fillna('').astype(str).apply(clean)

# Define stop words for Bag of Words
stop_words = list(stopwords.words('english'))

# Vectorization function with Bag of Words (BoW)
def phrase_vectorize(train_data):
    """Fit a binary Bag-of-Words vectorizer on the 'Phrase' column.

    Args:
        train_data: Object indexable by ``'Phrase'`` that yields the phrases to
            learn the vocabulary from.

    Returns:
        The fitted ``CountVectorizer`` (binary counts, ``min_df=5``, using the
        module-level ``stop_words`` list).
    """
    bow_options = {'binary': True, 'min_df': 5, 'stop_words': stop_words}
    bow = CountVectorizer(**bow_options)
    bow.fit(train_data['Phrase'])
    return bow

# Unsupervised learning with K-Means
def unsup_learn(K, X_train_unlabeled, X_train_labeled, y_train_labeled):
    """Pseudo-label the unlabeled rows with K-Means and append them to the labeled rows.

    The unlabeled features are reduced with PCA (200 components), clustered into
    ``K`` clusters, and each cluster is matched to a class label by solving an
    assignment problem whose cost is the number of labeled rows in the cluster
    that do NOT carry that label.

    Args:
        K: Number of clusters to fit.
        X_train_unlabeled: Feature matrix of the unlabeled rows.
        X_train_labeled: Feature matrix of the labeled rows.
        y_train_labeled: Labels of the labeled rows (any label values).

    Returns:
        ``(X_combined, y_combined)``: labeled rows stacked above unlabeled rows,
        and true labels followed by the remapped pseudo-labels.
    """
    pca = PCA(n_components=200, random_state=42)
    X_train_unlabeled_reduced = pca.fit_transform(X_train_unlabeled)
    X_train_labeled_reduced = pca.transform(X_train_labeled)

    learn_model = KMeans(n_clusters=K, random_state=42)
    pseudo_labels = learn_model.fit_predict(X_train_unlabeled_reduced)

    # Work with the label values that actually occur instead of assuming 0..K-1.
    y_labeled = np.asarray(y_train_labeled)
    classes = np.unique(y_labeled)
    # Cluster membership of the labeled rows does not change inside the loop.
    labeled_clusters = learn_model.predict(X_train_labeled_reduced)

    # Match clusters to labels
    cost_matrix = np.zeros((K, len(classes)))
    for cluster_id in range(K):
        labeled_in_cluster = y_labeled[labeled_clusters == cluster_id]
        for col, label in enumerate(classes):
            cost_matrix[cluster_id, col] = np.sum(labeled_in_cluster != label)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    cluster_to_label_map = {row: classes[col] for row, col in zip(row_ind, col_ind)}
    # With more clusters than classes the assignment leaves clusters unmatched;
    # give those the cheapest label rather than failing with a KeyError below.
    for cluster_id in range(K):
        if cluster_id not in cluster_to_label_map:
            cluster_to_label_map[cluster_id] = classes[np.argmin(cost_matrix[cluster_id])]
    pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])

    X_combined = vstack([X_train_labeled, X_train_unlabeled])
    y_combined = np.concatenate((y_labeled, pseudo_labels_remapped))
    return X_combined, y_combined

# Supervised learning with LogisticRegression for L1 and L2 regularization
def sup_learn(X_combined, y_combined, X_val, y_val, regularization):
    """Train a multinomial logistic regression and score it on the validation set.

    Args:
        X_combined: Training feature matrix.
        y_combined: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.
        regularization: ``"L1"`` (saga solver) or ``"L2"`` (lbfgs solver).

    Returns:
        ``(val_accuracy, f1)`` where ``f1`` is the macro-averaged F1 score.

    Raises:
        ValueError: If ``regularization`` is neither ``"L1"`` nor ``"L2"``.
    """
    # NOTE(review): recent scikit-learn releases deprecate `multi_class`;
    # confirm against the installed version before removing the argument.
    if regularization == "L1":
        # saga is stochastic: pin the seed so repeated runs give the same scores.
        learn_model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000,
                                         penalty='l1', random_state=42)  # L1 regularization
    elif regularization == "L2":
        learn_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000,
                                         penalty='l2', random_state=42)  # L2 regularization
    else:
        raise ValueError("Invalid regularization method. Use 'L1' or 'L2'.")

    learn_model.fit(X_combined, y_combined)
    y_pred_val = learn_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    f1 = f1_score(y_val, y_pred_val, average="macro")
    return val_accuracy, f1

# Main training function for testing L1 and L2 regularization only
def final_training():
    """Run the semi-supervised pipeline and print validation scores for L1 and L2.

    Reads the module-level ``train_data``, ``labeled_train``, ``labeled_val`` and
    ``unlabeled_train`` frames: fits the vectorizer on ``train_data``, densifies
    the three splits, pseudo-labels the unlabeled split via ``unsup_learn`` and
    evaluates ``sup_learn`` once per regularization. Returns nothing.
    """
    my_vectorizer = phrase_vectorize(train_data)

    def densify(frame):
        # Vectorize a split's phrases and convert the sparse result to a dense array.
        return my_vectorizer.transform(frame['Phrase']).toarray()

    X_train_labeled = densify(labeled_train)
    X_val_labeled = densify(labeled_val)
    X_unlabeled = densify(unlabeled_train)

    y_train_labeled, y_val = labeled_train['Sentiment'], labeled_val['Sentiment']
    # One cluster per distinct label present in the labeled training split.
    K = len(np.unique(y_train_labeled))
    X_combined, y_combined = unsup_learn(K, X_unlabeled, X_train_labeled, y_train_labeled)

    # Only iterate over L1 and L2 regularization methods
    for reg in ("L1", "L2"):
        print(f"Regularization: {reg}")
        val_accuracy, f1 = sup_learn(X_combined, y_combined, X_val_labeled, y_val, reg)
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(f"F1 Score on Validation Set: {f1:.4f}\n")

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
unlabeled_train = train_data[train_data['Sentiment'] == -100]
labeled_val = val_data[val_data['Sentiment'] != -100]

# Run final training
final_training()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Regularization: L1




Validation Accuracy: 0.6969
F1 Score on Validation Set: 0.6983

Regularization: L2




Validation Accuracy: 0.7041
F1 Score on Validation Set: 0.7052



In [5]:
def clean_dataset(dataset):
    """
    Run `clean` over every entry of the 'Phrase' column of a pandas DataFrame.

    The column is overwritten on the DataFrame that was passed in, and that same
    object is returned.
    """
    cleaned_phrases = dataset['Phrase'].apply(clean)
    dataset['Phrase'] = cleaned_phrases
    return dataset

def tokenize_lexicon(texts):
    """
    Word-tokenize each text with NLTK and POS-tag the resulting tokens.

    Returns one entry per input text, each being the output of `nltk.pos_tag`.
    """
    tagged_texts = []
    for text in texts:
        tokens = nltk.word_tokenize(text)
        tagged_texts.append(nltk.pos_tag(tokens))
    return tagged_texts

def get_wordnet_pos(pos_tag):
    """
    Translate a POS tag into a WordNet POS constant.

    Only the first character of the tag is looked up in the module-level
    TAG_DICT; anything not found there falls back to wn.NOUN.
    """
    fallback = wn.NOUN
    initial = pos_tag[0]
    return TAG_DICT[initial] if initial in TAG_DICT else fallback

def lemmatize_texts(texts):
    """
    Lemmatize every (word, POS tag) pair of every text.

    Uses the module-level `lemmer`, passing the WordNet POS derived from each tag.
    Returns a list of lemma lists, one per input text.
    """
    lemmatized = []
    for text in texts:
        lemmas = []
        for word, pos_tag in text:
            lemmas.append(lemmer.lemmatize(word, pos=get_wordnet_pos(pos_tag)))
        lemmatized.append(lemmas)
    return lemmatized

def stem_texts(texts):
    """
    Stem each word in the tokenized texts using the module-level `stemmer`.

    Each token may be a plain string or a (word, tag) pair; for pairs only the
    word is stemmed. Plain strings are handled explicitly because unpacking a
    string as a pair would stem only the first letter of two-character words
    and fail on every other length.

    Returns a list of stem lists, one per input text.
    """
    return [
        [stemmer.stem(token if isinstance(token, str) else token[0]) for token in text]
        for text in texts
    ]

def backtostring(texts):
    """
    Join each token list into one space-separated string.

    Returns a list with one string per input token list.
    """
    joined = []
    for tokens in texts:
        joined.append(" ".join(tokens))
    return joined

In [13]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer
lemmer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Tag dictionary for POS mapping
TAG_DICT = {
    'J': wn.ADJ,
    'V': wn.VERB,
    'N': wn.NOUN,
    'R': wn.ADV
}

# Define cleaning and preprocessing functions
def clean(text):
    """Normalize a raw phrase into lowercase word tokens separated by single spaces.

    Processing order: drop URLs, decode a few HTML entities, expand the chat
    abbreviation "u" to "you", replace every non-word character with a space,
    collapse whitespace, then lowercase and strip.

    Args:
        text: The raw phrase. Non-string values are coerced with ``str()``,
            except missing values (``None`` / float NaN), which yield ``""``.

    Returns:
        The cleaned phrase as a ``str``.
    """
    # Missing values must not become the literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r'https?://\S+|www\.\S+', '', str(text))  # Remove URLs
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"&quot;", "\"", text)
    text = re.sub('&#39;', "'", text)  # &#39; is the apostrophe entity, not a double quote
    text = re.sub('\n', " ", text)
    # Expand a stand-alone "u"/"U". Lookarounds (instead of literal spaces) also
    # catch it at the start/end of the phrase and in runs such as "u u".
    text = re.sub(r'(?<!\S)u(?!\S)', 'you', text, flags=re.IGNORECASE)
    text = re.sub('`', "", text)
    text = re.sub(r"(!)\1+", r"!", text)
    text = re.sub(r"(\?)\1+", r"?", text)
    text = re.sub('&amp;', 'and', text)
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Substitute multiple spaces with a single space
    return text.lower().strip()  # Convert to lowercase and strip whitespace

def clean_dataset(dataset):
    """Apply `clean` to the 'Phrase' column in place and return the same DataFrame."""
    column = 'Phrase'
    dataset[column] = dataset[column].apply(clean)
    return dataset

def tokenize_lexicon(texts):
    """Word-tokenize each text with NLTK, then POS-tag its tokens; one result per text."""
    tokenized = (nltk.word_tokenize(text) for text in texts)
    return [nltk.pos_tag(tokens) for tokens in tokenized]

def get_wordnet_pos(pos_tag):
    """Look up the tag's first character in TAG_DICT; fall back to wn.NOUN when absent."""
    fallback = wn.NOUN
    key = pos_tag[0]
    try:
        return TAG_DICT[key]
    except KeyError:
        return fallback

def lemmatize_texts(texts):
    """Lemmatize every (word, POS tag) pair of every text with the module-level `lemmer`."""
    def lemma_of(word, tag):
        # The lemmatizer needs the WordNet POS constant, not the raw tag.
        return lemmer.lemmatize(word, pos=get_wordnet_pos(tag))

    return [[lemma_of(word, tag) for word, tag in text] for text in texts]

def stem_texts(texts):
    """
    Stem each word in the tokenized texts using the module-level `stemmer`.

    Each token may be a plain string or a (word, tag) pair such as those
    produced by `tokenize_lexicon`; for pairs only the word is stemmed, so the
    function works both before and after lemmatization.

    Returns a list of stem lists, one per input text.
    """
    return [
        [stemmer.stem(token if isinstance(token, str) else token[0]) for token in text]
        for text in texts
    ]

def backtostring(texts):
    """Join each token list into a single space-separated string; one string per text."""
    return list(map(" ".join, texts))

# Load and preprocess data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')

# Apply cleaning only; the tokenization/lemmatization/back-to-string steps below are disabled (wrapped in a string literal)
train_data = clean_dataset(train_data)
val_data = clean_dataset(val_data)
"""
# Tokenization and POS tagging
train_data['tokens'] = tokenize_lexicon(train_data['Phrase'])
val_data['tokens'] = tokenize_lexicon(val_data['Phrase'])

# Lemmatization
train_data['tokens'] = lemmatize_texts(train_data['tokens'])
val_data['tokens'] = lemmatize_texts(val_data['tokens'])


# Convert tokens back to strings
train_data['Phrase'] = backtostring(train_data['tokens'])
val_data['Phrase'] = backtostring(val_data['tokens'])
"""
# Define stop words for TF-IDF
stop_words = list(stopwords.words('english'))

# Vectorization function with TF-IDF unigram
def phrase_vectorize(train_data):
    """Fit a TF-IDF vectorizer on the 'Phrase' column and return it.

    The vectorizer is built with ``binary=True``, ``min_df=3`` and the
    module-level ``stop_words`` list.
    """
    tfidf_options = {'binary': True, 'min_df': 3, 'stop_words': stop_words}
    tfidf = TfidfVectorizer(**tfidf_options)
    tfidf.fit(train_data['Phrase'])
    return tfidf

# Unsupervised learning with Gaussian Mixture Model (GMM)
def unsup_learn(K, X_train_unlabeled, X_train_labeled, y_train_labeled):
    """Pseudo-label the unlabeled rows with a Gaussian mixture and append them to the labeled rows.

    The unlabeled features are reduced with PCA (200 components), clustered with
    a ``K``-component diagonal-covariance GMM, and each component is matched to
    a class label by solving an assignment problem whose cost is the number of
    labeled rows in the component that do NOT carry that label.

    Args:
        K: Number of mixture components.
        X_train_unlabeled: Feature matrix of the unlabeled rows.
        X_train_labeled: Feature matrix of the labeled rows.
        y_train_labeled: Labels of the labeled rows (any label values).

    Returns:
        ``(X_combined, y_combined)``: labeled rows stacked above unlabeled rows,
        and true labels followed by the remapped pseudo-labels.
    """
    pca = PCA(n_components=200, random_state=42)
    X_train_unlabeled_reduced = pca.fit_transform(X_train_unlabeled)
    X_train_labeled_reduced = pca.transform(X_train_labeled)

    learn_model = GaussianMixture(n_components=K, covariance_type='diag', random_state=42, max_iter=50, tol=1e-3)
    pseudo_labels = learn_model.fit_predict(X_train_unlabeled_reduced)

    # Work with the label values that actually occur instead of assuming 0..K-1.
    y_labeled = np.asarray(y_train_labeled)
    classes = np.unique(y_labeled)
    # Component membership of the labeled rows does not change inside the loop.
    labeled_clusters = learn_model.predict(X_train_labeled_reduced)

    # Match clusters to labels
    cost_matrix = np.zeros((K, len(classes)))
    for cluster_id in range(K):
        labeled_in_cluster = y_labeled[labeled_clusters == cluster_id]
        for col, label in enumerate(classes):
            cost_matrix[cluster_id, col] = np.sum(labeled_in_cluster != label)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    cluster_to_label_map = {row: classes[col] for row, col in zip(row_ind, col_ind)}
    # With more components than classes the assignment leaves some unmatched;
    # give those the cheapest label rather than failing with a KeyError below.
    for cluster_id in range(K):
        if cluster_id not in cluster_to_label_map:
            cluster_to_label_map[cluster_id] = classes[np.argmin(cost_matrix[cluster_id])]
    pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])

    X_combined = vstack([X_train_labeled, X_train_unlabeled])
    y_combined = np.concatenate((y_labeled, pseudo_labels_remapped))
    return X_combined, y_combined

# Supervised learning with MultinomialNB
def sup_learn(X_combined, y_combined, X_val, y_val):
    """Fit MultinomialNB (alpha=1) on the combined data and score it on validation.

    Returns:
        ``(val_accuracy, f1)`` where ``f1`` is the macro-averaged F1 score.
    """
    classifier = MultinomialNB(alpha=1)  # Multinomial Naive Bayes with smoothing
    classifier.fit(X_combined, y_combined)
    predictions = classifier.predict(X_val)
    scores = (
        accuracy_score(y_val, predictions),
        f1_score(y_val, predictions, average="macro"),
    )
    return scores

# Main training function
def final_training():
    """Run the TF-IDF -> GMM pseudo-labelling -> MultinomialNB pipeline and print scores.

    Reads the module-level ``train_data``, ``labeled_train``, ``labeled_val`` and
    ``unlabeled_train`` frames. Returns nothing.
    """
    my_vectorizer = phrase_vectorize(train_data)
    # Dense arrays for each split, produced in the order: labeled, validation, unlabeled.
    X_train_labeled, X_val_labeled, X_unlabeled = (
        my_vectorizer.transform(frame['Phrase']).toarray()
        for frame in (labeled_train, labeled_val, unlabeled_train)
    )

    y_train_labeled, y_val = labeled_train['Sentiment'], labeled_val['Sentiment']
    # One mixture component per distinct label in the labeled training split.
    K = len(np.unique(y_train_labeled))
    X_combined, y_combined = unsup_learn(K, X_unlabeled, X_train_labeled, y_train_labeled)

    # Evaluate with MultinomialNB
    print("Using MultinomialNB:")
    val_accuracy, f1 = sup_learn(X_combined, y_combined, X_val_labeled, y_val)
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"F1 Score on Validation Set: {f1:.4f}")

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
unlabeled_train = train_data[train_data['Sentiment'] == -100]
labeled_val = val_data[val_data['Sentiment'] != -100]

# Run final training
final_training()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Using MultinomialNB:
Validation Accuracy: 0.7891
F1 Score on Validation Set: 0.7877


In [21]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from nltk.corpus import stopwords
import string

# Download necessary NLTK data
nltk.download('stopwords')

# Define stop words for preprocessing
stop_words = set(stopwords.words('english'))
trans = str.maketrans('', '', string.punctuation)

# Define the cleaning and preprocessing function
def clean(text):
    """Lowercase a phrase and strip mentions, URLs, digits, punctuation and stop words.

    Non-string input yields ``""``. Punctuation is removed with the module-level
    ``trans`` table and stop words are filtered against the module-level
    ``stop_words`` set; the remaining words are joined with single spaces.
    """
    # Ensure text is a string, if not return an empty string
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Removal order matters: @ mentions, then URLs, then digit runs.
    for pattern in (r'@\w+', r'http\S+|www\.\S+', r'\d+'):
        lowered = re.sub(pattern, '', lowered)

    # Remove punctuation
    without_punctuation = lowered.translate(trans)

    # Remove stop words
    kept_words = [word for word in without_punctuation.split() if word not in stop_words]

    # Strip extra whitespace
    return ' '.join(kept_words).strip()

def clean_dataset(dataset):
    """Overwrite the 'Phrase' column with its cleaned version and return the same DataFrame."""
    phrases = dataset['Phrase']
    dataset['Phrase'] = phrases.apply(clean)
    return dataset

# Load and preprocess data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')

# Apply cleaning function to train and validation datasets
train_data = clean_dataset(train_data)
val_data = clean_dataset(val_data)

# Vectorization function with TF-IDF unigram
def phrase_vectorize(train_data):
    """Fit a TF-IDF vectorizer on the 'Phrase' column and return it.

    The vectorizer is built with ``binary=True``, ``min_df=5`` and the
    module-level ``stop_words`` set converted to a list.
    """
    stop_word_list = list(stop_words)  # Convert stop_words to a list
    tfidf = TfidfVectorizer(binary=True, min_df=5, stop_words=stop_word_list)
    tfidf.fit(train_data['Phrase'])
    return tfidf


# Unsupervised learning with Gaussian Mixture Model (GMM)
def unsup_learn(K, X_train_unlabeled, X_train_labeled, y_train_labeled):
    """Pseudo-label the unlabeled rows with a Gaussian mixture and append them to the labeled rows.

    The unlabeled features are reduced with PCA (50 components), clustered with
    a ``K``-component full-covariance GMM, and each component is matched to a
    class label by solving an assignment problem whose cost is the number of
    labeled rows in the component that do NOT carry that label.

    Args:
        K: Number of mixture components.
        X_train_unlabeled: Feature matrix of the unlabeled rows.
        X_train_labeled: Feature matrix of the labeled rows.
        y_train_labeled: Labels of the labeled rows (any label values).

    Returns:
        ``(X_combined, y_combined)``: labeled rows stacked above unlabeled rows,
        and true labels followed by the remapped pseudo-labels.
    """
    pca = PCA(n_components=50, random_state=42)
    X_train_unlabeled_reduced = pca.fit_transform(X_train_unlabeled)
    X_train_labeled_reduced = pca.transform(X_train_labeled)

    learn_model = GaussianMixture(n_components=K, covariance_type='full', random_state=42, max_iter=50, tol=1e-3)
    pseudo_labels = learn_model.fit_predict(X_train_unlabeled_reduced)

    # Work with the label values that actually occur instead of assuming 0..K-1.
    y_labeled = np.asarray(y_train_labeled)
    classes = np.unique(y_labeled)
    # Component membership of the labeled rows does not change inside the loop.
    labeled_clusters = learn_model.predict(X_train_labeled_reduced)

    # Match clusters to labels
    cost_matrix = np.zeros((K, len(classes)))
    for cluster_id in range(K):
        labeled_in_cluster = y_labeled[labeled_clusters == cluster_id]
        for col, label in enumerate(classes):
            cost_matrix[cluster_id, col] = np.sum(labeled_in_cluster != label)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    cluster_to_label_map = {row: classes[col] for row, col in zip(row_ind, col_ind)}
    # With more components than classes the assignment leaves some unmatched;
    # give those the cheapest label rather than failing with a KeyError below.
    for cluster_id in range(K):
        if cluster_id not in cluster_to_label_map:
            cluster_to_label_map[cluster_id] = classes[np.argmin(cost_matrix[cluster_id])]
    pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])

    X_combined = vstack([X_train_labeled, X_train_unlabeled])
    y_combined = np.concatenate((y_labeled, pseudo_labels_remapped))
    return X_combined, y_combined

# Supervised learning with MultinomialNB
def sup_learn(X_combined, y_combined, X_val, y_val):
    """Fit MultinomialNB (alpha=0.5) on the combined data and score it on validation.

    Returns:
        ``(val_accuracy, f1)`` where ``f1`` is the macro-averaged F1 score.
    """
    naive_bayes = MultinomialNB(alpha=0.5)  # Multinomial Naive Bayes with smoothing
    naive_bayes.fit(X_combined, y_combined)
    predicted = naive_bayes.predict(X_val)
    return accuracy_score(y_val, predicted), f1_score(y_val, predicted, average="macro")

# Main training function
def final_training():
    """Run the TF-IDF -> GMM pseudo-labelling -> MultinomialNB pipeline and print scores.

    Reads the module-level ``train_data``, ``labeled_train``, ``labeled_val`` and
    ``unlabeled_train`` frames. Returns nothing.
    """
    my_vectorizer = phrase_vectorize(train_data)

    def as_dense(frame):
        # TF-IDF features of a split's phrases as a dense array.
        return my_vectorizer.transform(frame['Phrase']).toarray()

    X_train_labeled = as_dense(labeled_train)
    X_val_labeled = as_dense(labeled_val)
    X_unlabeled = as_dense(unlabeled_train)

    y_train_labeled = labeled_train['Sentiment']
    y_val = labeled_val['Sentiment']
    # One mixture component per distinct label in the labeled training split.
    K = len(np.unique(y_train_labeled))
    X_combined, y_combined = unsup_learn(K, X_unlabeled, X_train_labeled, y_train_labeled)

    # Evaluate with MultinomialNB
    print("Using MultinomialNB:")
    val_accuracy, f1 = sup_learn(X_combined, y_combined, X_val_labeled, y_val)
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"F1 Score on Validation Set: {f1:.4f}")

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
unlabeled_train = train_data[train_data['Sentiment'] == -100]
labeled_val = val_data[val_data['Sentiment'] != -100]

# Run final training
final_training()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using MultinomialNB:
Validation Accuracy: 0.8467
F1 Score on Validation Set: 0.8484


In [22]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from scipy.sparse import vstack
from scipy.optimize import linear_sum_assignment
from nltk.corpus import stopwords
import string

# Download necessary NLTK data
nltk.download('stopwords')

# Define stop words for preprocessing
stop_words = set(stopwords.words('english'))
trans = str.maketrans('', '', string.punctuation)

# Define the cleaning and preprocessing function
def clean(text):
    """Lowercase a phrase and strip mentions, URLs, digits, punctuation and stop words.

    Non-string input yields ``""``. Uses the module-level ``trans`` table to
    delete punctuation and the module-level ``stop_words`` set to filter words.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase, then remove @ mentions, URLs and numbers, in that order
    stripped = re.sub(r'@\w+', '', text.lower())
    stripped = re.sub(r'http\S+|www\.\S+', '', stripped)
    stripped = re.sub(r'\d+', '', stripped)

    # Delete punctuation and split on whitespace
    words = stripped.translate(trans).split()

    # Remove stop words and extra whitespace
    return ' '.join(word for word in words if word not in stop_words).strip()

def clean_dataset(dataset):
    """Clean every entry of the 'Phrase' column in place; returns the DataFrame it was given."""
    cleaned = dataset['Phrase'].apply(clean)
    dataset['Phrase'] = cleaned
    return dataset

# Load and preprocess data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')

train_data = clean_dataset(train_data)
val_data = clean_dataset(val_data)

# Vectorization function with TF-IDF unigram
def phrase_vectorize(train_data, min_df):
    """Fit a TF-IDF vectorizer on the 'Phrase' column and return it.

    Args:
        train_data: Object indexable by ``'Phrase'`` that yields the phrases to fit on.
        min_df: Passed through to ``TfidfVectorizer`` as its ``min_df``.

    Returns:
        The fitted vectorizer (``binary=True``, module-level ``stop_words`` as a list).
    """
    tfidf = TfidfVectorizer(
        binary=True,
        min_df=min_df,
        stop_words=list(stop_words),
    )
    tfidf.fit(train_data['Phrase'])
    return tfidf

# Unsupervised learning with Gaussian Mixture Model (GMM)
def unsup_learn(K, X_train_unlabeled, X_train_labeled, y_train_labeled, n_components=100, covariance_type='diag'):
    """Pseudo-label the unlabeled rows with a Gaussian mixture and append them to the labeled rows.

    The unlabeled features are reduced with PCA, clustered with a ``K``-component
    GMM, and each component is matched to a class label by solving an assignment
    problem whose cost is the number of labeled rows in the component that do
    NOT carry that label.

    Args:
        K: Number of mixture components.
        X_train_unlabeled: Feature matrix of the unlabeled rows.
        X_train_labeled: Feature matrix of the labeled rows.
        y_train_labeled: Labels of the labeled rows (any label values).
        n_components: Number of PCA components.
        covariance_type: Passed through to ``GaussianMixture``.

    Returns:
        ``(X_combined, y_combined)``: labeled rows stacked above unlabeled rows,
        and true labels followed by the remapped pseudo-labels.
    """
    pca = PCA(n_components=n_components, random_state=42)
    X_train_unlabeled_reduced = pca.fit_transform(X_train_unlabeled)
    X_train_labeled_reduced = pca.transform(X_train_labeled)

    learn_model = GaussianMixture(n_components=K, covariance_type=covariance_type, random_state=42)
    pseudo_labels = learn_model.fit_predict(X_train_unlabeled_reduced)

    # Work with the label values that actually occur instead of assuming 0..K-1.
    y_labeled = np.asarray(y_train_labeled)
    classes = np.unique(y_labeled)
    # Component membership of the labeled rows does not change inside the loop.
    labeled_clusters = learn_model.predict(X_train_labeled_reduced)

    # Match clusters to labels
    cost_matrix = np.zeros((K, len(classes)))
    for cluster_id in range(K):
        labeled_in_cluster = y_labeled[labeled_clusters == cluster_id]
        for col, label in enumerate(classes):
            cost_matrix[cluster_id, col] = np.sum(labeled_in_cluster != label)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    cluster_to_label_map = {row: classes[col] for row, col in zip(row_ind, col_ind)}
    # With more components than classes the assignment leaves some unmatched;
    # give those the cheapest label rather than failing with a KeyError below.
    for cluster_id in range(K):
        if cluster_id not in cluster_to_label_map:
            cluster_to_label_map[cluster_id] = classes[np.argmin(cost_matrix[cluster_id])]
    pseudo_labels_remapped = np.array([cluster_to_label_map[cluster] for cluster in pseudo_labels])

    X_combined = vstack([X_train_labeled, X_train_unlabeled])
    y_combined = np.concatenate((y_labeled, pseudo_labels_remapped))
    return X_combined, y_combined

# Supervised learning with MultinomialNB
def sup_learn(X_combined, y_combined, X_val, y_val, alpha):
    """Fit MultinomialNB with the given ``alpha`` and score it on the validation set.

    Returns:
        ``(val_accuracy, f1)`` where ``f1`` is the macro-averaged F1 score.
    """
    naive_bayes = MultinomialNB(alpha=alpha)
    naive_bayes.fit(X_combined, y_combined)
    predicted = naive_bayes.predict(X_val)
    accuracy = accuracy_score(y_val, predicted)
    macro_f1 = f1_score(y_val, predicted, average="macro")
    return accuracy, macro_f1

# Grid search for best parameters
def grid_search(train_data, val_data):
    """Evaluate every parameter combination and report the one with the best macro F1.

    For each combination the pipeline is: TF-IDF vectorizer fit on
    ``train_data`` -> ``unsup_learn`` pseudo-labelling -> ``sup_learn``.
    The labeled/unlabeled splits are read from the module-level
    ``labeled_train``, ``labeled_val`` and ``unlabeled_train`` frames.

    Args:
        train_data: Frame whose 'Phrase' column the vectorizer is fit on.
        val_data: Not used by the search itself (the validation split comes
            from the module-level ``labeled_val``); kept for the call signature.

    Returns:
        ``(best_params, best_f1)``; ``best_params`` is ``None`` only when the
        grid is empty.
    """
    # Set up the parameter grid
    param_grid = {
        'alpha': [0.5],
        'n_components': [50, 100, 150, 200],
        'covariance_type': ['full'],
        'min_df': [1, 2, 3, 4, 5, 6, 7]
    }
    grid = ParameterGrid(param_grid)

    # Start below any attainable score so the first combination is always recorded.
    best_f1 = -np.inf
    best_params = None

    # Labels do not depend on the parameters being searched.
    y_train_labeled = labeled_train['Sentiment']
    y_val = labeled_val['Sentiment']
    K = len(np.unique(y_train_labeled))

    # The feature matrices depend only on min_df, so rebuild them only when it
    # changes instead of once per combination.
    cached_min_df = None
    X_train_labeled = X_val_labeled = X_unlabeled = None

    # Loop over each parameter combination
    for params in grid:
        print(f"Testing parameters: {params}")

        if cached_min_df is None or params['min_df'] != cached_min_df:
            # Vectorize with the current min_df
            my_vectorizer = phrase_vectorize(train_data, min_df=params['min_df'])

            # Transform train, val, and unlabeled data
            X_train_labeled = my_vectorizer.transform(labeled_train['Phrase']).toarray()
            X_val_labeled = my_vectorizer.transform(labeled_val['Phrase']).toarray()
            X_unlabeled = my_vectorizer.transform(unlabeled_train['Phrase']).toarray()
            cached_min_df = params['min_df']

        # Unsupervised learning with current params
        X_combined, y_combined = unsup_learn(K, X_unlabeled, X_train_labeled, y_train_labeled,
                                             n_components=params['n_components'],
                                             covariance_type=params['covariance_type'])

        # Supervised learning with current params
        val_accuracy, f1 = sup_learn(X_combined, y_combined, X_val_labeled, y_val, alpha=params['alpha'])

        # Check if this is the best so far
        if f1 > best_f1:
            best_f1 = f1
            best_params = params
            print(f"New best F1 score: {f1:.4f} with params {best_params}")

    print(f"Best F1 Score: {best_f1:.4f} with parameters: {best_params}")
    return best_params, best_f1

# Separate labeled and unlabeled data
labeled_train = train_data[train_data['Sentiment'] != -100]
unlabeled_train = train_data[train_data['Sentiment'] == -100]
labeled_val = val_data[val_data['Sentiment'] != -100]

# Run grid search to find the best parameters
grid_search(train_data, val_data)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 1, 'n_components': 50}
New best F1 score: 0.7941 with params {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 1, 'n_components': 50}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 1, 'n_components': 100}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 1, 'n_components': 150}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 1, 'n_components': 200}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 2, 'n_components': 50}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 2, 'n_components': 100}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 2, 'n_components': 150}
New best F1 score: 0.8684 with params {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 2, 'n_components': 150}
Testing parameters: {'alpha': 0.5, 'covariance_type': 'full', 'min_df': 2, 'n_components': 200}
Testing