In [2]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from nltk.corpus import stopwords
import string

# Download necessary NLTK data
nltk.download('stopwords')

# Define stop words for preprocessing
stop_words = set(stopwords.words('english'))
trans = str.maketrans('', '', string.punctuation)

# Define the cleaning and preprocessing function
def clean(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove @ mentions, URLs, numbers, and punctuation
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(trans)

    # Remove stop words and extra whitespace
    text = ' '.join([word for word in text.split() if word not in stop_words]).strip()

    return text

def clean_dataset(dataset):
    dataset['Phrase'] = dataset['Phrase'].apply(clean)
    return dataset

# Load and preprocess data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')

# Apply cleaning function
train_data = clean_dataset(train_data)
val_data = clean_dataset(val_data)

# Separate labeled data (remove rows with -100 labels)
labeled_train = train_data[train_data['Sentiment'] != -100]
X_train = labeled_train['Phrase']
y_train = labeled_train['Sentiment']

# Validation data
X_val = val_data['Phrase']
y_val = val_data['Sentiment']

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(binary=True, min_df=3, stop_words=list(stop_words))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Parameter grids
logreg_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # 'liblinear' supports L1, 'saga' supports both
}

nb_param_grid = {
    'alpha': [0.1, 0.5, 1.0]
}

# Grid search function
def grid_search(X_train, y_train, X_val, y_val, model_type):
    best_f1 = 0
    best_params = None
    best_model = None

    if model_type == 'logistic_regression':
        param_grid = ParameterGrid(logreg_param_grid)
        for params in param_grid:
            try:
                model = LogisticRegression(max_iter=1000, random_state=42, **params)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                f1 = f1_score(y_val, y_pred, average="macro")

                if f1 > best_f1:
                    best_f1 = f1
                    best_params = params
                    best_model = model
                    print(f"New best F1 for Logistic Regression: {f1:.4f} with params {best_params}")
            except Exception as e:
                print(f"Skipped parameters {params} due to error: {e}")

    elif model_type == 'naive_bayes':
        param_grid = ParameterGrid(nb_param_grid)
        for params in param_grid:
            model = MultinomialNB(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            f1 = f1_score(y_val, y_pred, average="macro")

            if f1 > best_f1:
                best_f1 = f1
                best_params = params
                best_model = model
                print(f"New best F1 for Naive Bayes: {f1:.4f} with params {best_params}")

    print(f"Best F1 Score for {model_type}: {best_f1:.4f} with parameters: {best_params}")
    return best_model, best_f1, best_params

# Run grid search for both models
print("Starting grid search for Logistic Regression:")
best_logreg_model, best_logreg_f1, best_logreg_params = grid_search(X_train_tfidf, y_train, X_val_tfidf, y_val, 'logistic_regression')

print("\nStarting grid search for Naive Bayes:")
best_nb_model, best_nb_f1, best_nb_params = grid_search(X_train_tfidf, y_train, X_val_tfidf, y_val, 'naive_bayes')

# Compare results and select the best model
if best_logreg_f1 > best_nb_f1:
    print("\nLogistic Regression performed best.")
    print(f"Best parameters: {best_logreg_params}")
    print(f"Best F1 Score: {best_logreg_f1:.4f}")
else:
    print("\nNaive Bayes performed best.")
    print(f"Best parameters: {best_nb_params}")
    print(f"Best F1 Score: {best_nb_f1:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting grid search for Logistic Regression:
New best F1 for Logistic Regression: 0.7644 with params {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
New best F1 for Logistic Regression: 0.8670 with params {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
New best F1 for Logistic Regression: 0.8731 with params {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
New best F1 for Logistic Regression: 0.8788 with params {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
New best F1 for Logistic Regression: 0.8930 with params {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best F1 Score for logistic_regression: 0.8930 with parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

Starting grid search for Naive Bayes:
New best F1 for Naive Bayes: 0.8913 with params {'alpha': 0.1}
New best F1 for Naive Bayes: 0.8937 with params {'alpha': 0.5}
Best F1 Score for naive_bayes: 0.8937 with parameters: {'alpha': 0.5}

Naive Bayes performed best.
Best parameters: {'alpha': 0.5}
Best F1 Score: 0.893

In [2]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
import string

# Download necessary NLTK data
nltk.download('stopwords')

# Define stop words for preprocessing
stop_words = set(stopwords.words('english'))
trans = str.maketrans('', '', string.punctuation)

# Define the cleaning and preprocessing function
def clean(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove @ mentions, URLs, numbers, and punctuation
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(trans)

    # Remove stop words and extra whitespace
    text = ' '.join([word for word in text.split() if word not in stop_words]).strip()

    return text

def clean_dataset(dataset):
    dataset['Phrase'] = dataset['Phrase'].apply(clean)
    return dataset

# Load and preprocess data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')

# Apply cleaning function
train_data = clean_dataset(train_data)
val_data = clean_dataset(val_data)

# Separate labeled data (remove rows with -100 labels)
labeled_train = train_data[train_data['Sentiment'] != -100]
X_train = labeled_train['Phrase']
y_train = labeled_train['Sentiment']

# Validation data
X_val = val_data['Phrase']
y_val = val_data['Sentiment']

# Define parameter grids for each model
vectorizer_param_grid = {
    'min_df': [1, 3, 5],
    'ngram_range': [(1, 1), (1, 2)]
}

logreg_param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],  # Only 'l2' to avoid compatibility issues
    'solver': ['liblinear']  # 'liblinear' works well with l2 penalty
}

nb_param_grid = {
    'alpha': [0.1, 0.5, 1.0]
}

# Function to evaluate parameter combinations for a given model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    val_f1 = f1_score(y_val, y_pred, average="macro")
    return val_f1

# Grid search for Logistic Regression
def grid_search_logistic_regression(X_train, y_train, X_val, y_val):
    best_f1 = 0
    best_params = None

    for vec_params in ParameterGrid(vectorizer_param_grid):
        vectorizer = TfidfVectorizer(binary=True, **vec_params, stop_words=list(stop_words))
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_val_tfidf = vectorizer.transform(X_val)

        # Standardize features
        scaler = StandardScaler(with_mean=False)
        X_train_scaled = scaler.fit_transform(X_train_tfidf)
        X_val_scaled = scaler.transform(X_val_tfidf)

        for logreg_params in ParameterGrid(logreg_param_grid):
            model = LogisticRegression(max_iter=500, random_state=42, **logreg_params)
            val_f1 = evaluate_model(model, X_train_scaled, y_train, X_val_scaled, y_val)

            if val_f1 > best_f1:
                best_f1 = val_f1
                best_params = {'vectorizer': vec_params, 'logreg': logreg_params}
                print(f"New best F1 for Logistic Regression: {val_f1:.4f} with params {best_params}")

    print(f"Best F1 Score for Logistic Regression: {best_f1:.4f} with parameters: {best_params}")
    return best_f1, best_params

# Grid search for Naive Bayes
def grid_search_naive_bayes(X_train, y_train, X_val, y_val):
    best_f1 = 0
    best_params = None

    for vec_params in ParameterGrid(vectorizer_param_grid):
        vectorizer = TfidfVectorizer(binary=True, **vec_params, stop_words=list(stop_words))
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_val_tfidf = vectorizer.transform(X_val)

        for nb_params in ParameterGrid(nb_param_grid):
            model = MultinomialNB(**nb_params)
            val_f1 = evaluate_model(model, X_train_tfidf, y_train, X_val_tfidf, y_val)

            if val_f1 > best_f1:
                best_f1 = val_f1
                best_params = {'vectorizer': vec_params, 'nb': nb_params}
                print(f"New best F1 for Naive Bayes: {val_f1:.4f} with params {best_params}")

    print(f"Best F1 Score for Naive Bayes: {best_f1:.4f} with parameters: {best_params}")
    return best_f1, best_params

# Run grid search for both models
print("Starting grid search for Logistic Regression:")
best_logreg_f1, best_logreg_params = grid_search_logistic_regression(X_train, y_train, X_val, y_val)

print("\nStarting grid search for Naive Bayes:")
best_nb_f1, best_nb_params = grid_search_naive_bayes(X_train, y_train, X_val, y_val)

# Compare results and select the best model
if best_logreg_f1 > best_nb_f1:
    print("\nLogistic Regression performed best.")
    print(f"Best parameters: {best_logreg_params}")
    print(f"Best F1 Score: {best_logreg_f1:.4f}")
else:
    print("\nNaive Bayes performed best.")
    print(f"Best parameters: {best_nb_params}")
    print(f"Best F1 Score: {best_nb_f1:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting grid search for Logistic Regression:
New best F1 for Logistic Regression: 0.8436 with params {'vectorizer': {'min_df': 1, 'ngram_range': (1, 1)}, 'logreg': {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}}
New best F1 for Logistic Regression: 0.8891 with params {'vectorizer': {'min_df': 1, 'ngram_range': (1, 2)}, 'logreg': {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}}
Best F1 Score for Logistic Regression: 0.8891 with parameters: {'vectorizer': {'min_df': 1, 'ngram_range': (1, 2)}, 'logreg': {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}}

Starting grid search for Naive Bayes:
New best F1 for Naive Bayes: 0.8932 with params {'vectorizer': {'min_df': 1, 'ngram_range': (1, 1)}, 'nb': {'alpha': 0.1}}
New best F1 for Naive Bayes: 0.8965 with params {'vectorizer': {'min_df': 1, 'ngram_range': (1, 1)}, 'nb': {'alpha': 0.5}}
New best F1 for Naive Bayes: 0.9229 with params {'vectorizer': {'min_df': 1, 'ngram_range': (1, 2)}, 'nb': {'alpha': 0.1}}
Best F1 Score for Naive Ba

In [3]:
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from nltk.corpus import stopwords
import string

# Download necessary NLTK data
nltk.download('stopwords')

# Define stop words for preprocessing
stop_words = set(stopwords.words('english'))
trans = str.maketrans('', '', string.punctuation)

# Define the cleaning and preprocessing function
def clean(text):
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove @ mentions, URLs, numbers, and punctuation
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(trans)

    # Remove stop words and extra whitespace
    text = ' '.join([word for word in text.split() if word not in stop_words]).strip()

    return text

def clean_dataset(dataset):
    dataset['Phrase'] = dataset['Phrase'].apply(clean)
    return dataset

# Load and preprocess data
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('val.csv')

# Apply cleaning function
train_data = clean_dataset(train_data)
val_data = clean_dataset(val_data)

# Separate labeled data (remove rows with -100 labels)
labeled_train = train_data[train_data['Sentiment'] != -100]
X_train = labeled_train['Phrase']
y_train = labeled_train['Sentiment']

# Validation data
X_val = val_data['Phrase']
y_val = val_data['Sentiment']

# Use the best parameters for TF-IDF vectorizer and Naive Bayes
vectorizer = TfidfVectorizer(binary=True, min_df=1, ngram_range=(1, 2), stop_words=list(stop_words))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Train Naive Bayes model with the best alpha
model = MultinomialNB(alpha=0.1)
model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val_tfidf)

# Evaluate the model
val_accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average="macro")

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"F1 Score on Validation Set: {f1:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Validation Accuracy: 0.9229
F1 Score on Validation Set: 0.9229
